## Import libraries

In [1]:
import pandas as pd
import json
from tqdm import tqdm
from multiprocessing import Pool
import os
from pprint import pprint
import numpy as np
import math
from operator import itemgetter
import pickle as pk
from tqdm import tqdm

## Read raw data

In [164]:
data_path = "./data/raw-data/omdb-clean-full.csv"

# Load the raw table, lower-case every column header and turn the row labels
# into strings (later cells look rows up by string label).
raw_df = pd.read_csv(data_path)
raw_df.columns = [col.lower() for col in raw_df.columns]
raw_df.index = raw_df.index.map(str)
raw_df.head()


Unnamed: 0,title,year,rating,genre,director,language,country,type
0,The Secret Life of Walter Mitty,2013,PG,"Adventure, Comedy, Drama",Ben Stiller,English,USA,movie
1,In Secret,2013,R,"Crime, Drama, Thriller",Charlie Stratton,English,USA,movie
2,The Giver,2014,PG-13,"Drama, Sci-Fi",Phillip Noyce,English,USA,movie
3,42,2013,PG-13,"Biography, Drama, Sport",Brian Helgeland,English,USA,movie
4,House at the End of the Drive,2014,R,"Horror, Thriller",David Worth,English,USA,movie


In [17]:
# Load scenario "3" and collect the functional-dependency string ('cfd')
# of every entry in its hypothesis space.

with open("./scenarios.json", 'r') as scenarios_file:
    scenarios = json.load(scenarios_file)
required_scenario_info = scenarios["3"]
hypothesis_space = list(map(itemgetter('cfd'), required_scenario_info['hypothesis_space']))
print(hypothesis_space)

['(title) => director', '(rating) => title, year, director', '(title) => year, director, rating', '(year, rating) => director', '(year, director, title) => rating', '(title) => year, rating', '(year) => title, director', '(year) => rating, director', '(year) => title, rating', '(rating, title) => year', '(director) => year, rating', '(year, rating, title) => director', '(title, rating) => year, director', '(year) => title', '(director) => title, rating', '(title) => year, director', '(year, director) => title, rating', '(director, rating) => year', '(year, rating) => title', '(title) => rating', '(title, year) => director, rating', '(year, rating) => title, director', '(rating) => title, year', '(director) => title, year', '(year) => title, director, rating', '(director) => title, year, rating', '(director, rating, title) => year', '(year, director) => title', '(rating) => year', '(director, title) => year', '(rating) => year, director', '(title, director) => year, rating', '(year, tit

## Add info to new scenarios dict and dump the file

In [18]:
def parse_hypothesis(fd):
    """Split a functional-dependency string into left and right attribute lists.

    Args:
        fd: Dependency written as ``"(a, b) => c, d"``. Parentheses around
            either side are optional.

    Returns:
        ``(left_attributes, right_attributes)``: two lists of attribute
        names with surrounding whitespace removed.

    Raises:
        ValueError: If ``fd`` does not contain exactly one ``"=>"``.
    """
    lfd, rfd = fd.split("=>")

    def _attributes(side):
        # Strip whitespace first so that the parentheses really are the
        # outermost characters; otherwise " (a, b)" keeps its "(".
        return [attribute.strip() for attribute in side.strip().strip("(").strip(")").split(",")]

    return _attributes(lfd), _attributes(rfd)

In [19]:
def is_support_violation(fd_components, tuple_1, tuple_2):
    """Classify a pair of tuples against one functional dependency.

    Args:
        fd_components: ``(lfd, rfd)`` pair of attribute-name iterables.
        tuple_1: First record, indexable by attribute name.
        tuple_2: Second record, indexable by attribute name.

    Returns:
        ``(is_support, is_violation)``:
        ``(True, False)`` if the tuples agree on every left and right attribute,
        ``(False, True)`` if they agree on the left side but differ on the right,
        ``(False, False)`` if they already differ on the left side.
    """
    lfd, rfd = fd_components

    def agree_on(attributes):
        return all(tuple_1[attribute] == tuple_2[attribute] for attribute in attributes)

    # A pair with different left-hand values says nothing about the dependency.
    if not agree_on(lfd):
        return False, False

    if agree_on(rfd):
        return True, False
    return False, True


In [20]:
def get_support_violation_tuples(data, idx, fd_components):
    """Find the rows that support or violate a dependency when paired with row ``idx``.

    Args:
        data: DataFrame containing the attributes named in ``fd_components``.
        idx: Index label of the reference row in ``data``.
        fd_components: ``(lfd, rfd)`` attribute lists.

    Returns:
        ``(supports, violations)``: two lists of index labels of the other
        rows, in the order they appear in ``data``.
    """
    # idx is an index *label* (callers iterate over the frame's index), so it
    # must be resolved with .loc; positional .iloc raises on string labels.
    reference_tuple = data.loc[idx]

    supports = []
    violations = []
    for idx_, candidate_tuple in data.iterrows():
        if idx == idx_:
            continue
        is_support, is_violation = is_support_violation(
            fd_components=fd_components, tuple_1=reference_tuple, tuple_2=candidate_tuple)
        if is_support:
            supports.append(idx_)
        elif is_violation:
            violations.append(idx_)
    return supports, violations
        

In [21]:
def get_hypothesis_info_dict(hypothesis):
    """Build the support/violation lookup of one hypothesis over ``raw_df``.

    Args:
        hypothesis: Functional-dependency string understood by ``parse_hypothesis``.

    Returns:
        Dict with keys ``'lfd'`` and ``'rfd'`` (attribute lists) and
        ``'supports'`` / ``'violations'``, each mapping every row label of
        ``raw_df`` to the list of row labels that support / violate the
        hypothesis together with that row.
    """
    # Extract left and right attributes from the hypothesis as lists of attributes
    lfd, rfd = parse_hypothesis(hypothesis)
    info_dict = {'lfd': lfd, 'rfd': rfd, 'supports': dict(), 'violations': dict()}

    # The column projection does not depend on the row, so build it once
    # instead of once per row.
    fd_data = raw_df[lfd+rfd]

    # Find pairwise supports and violations of each tuple with respect to all
    # other tuples in the dataset (every row is compared with every other row).
    for idx in tqdm(raw_df.index):
        supports, violations = get_support_violation_tuples(data=fd_data, idx=idx, fd_components=(lfd, rfd))
        info_dict['supports'][idx] = supports
        info_dict['violations'][idx] = violations

    return info_dict

# get_hypothesis_info_dict(hypothesis_space[0])
# hypothesis_space[0]

In [22]:
# Number of worker processes for the pool below. The run is pinned to a single
# worker; assign os.cpu_count() here instead to use every core.
cpu_num = 1

In [None]:
# Compute the info dict of every hypothesis in a worker pool; Pool.map returns
# the results in the same order as hypothesis_space.
with Pool(cpu_num) as p:
    hypothesis_info = p.map(get_hypothesis_info_dict, hypothesis_space)

new_scenarios_dict = {
    'omdb': {
        'hypothesis_space': dict(zip(hypothesis_space, hypothesis_info)),
    },
}

# for hypothesis in tqdm(hypothesis_space):
    
    # '''Extract left and right attributes from the hypothesis as list of attributes'''
    # lfd, rfd = parse_hypothesis(hypothesis)
    # new_scenarios_dict['omdb']['hypothesis_space'][hypothesis] = {'lfd': lfd, 'rfd':rfd}

    # '''Find pairwise violations of each tuple with respect to other tuples in the dataset'''
    # new_scenarios_dict['omdb']['hypothesis_space'][hypothesis]['violations'] = dict()
    # for idx in raw_df.index:
    #     new_scenarios_dict['omdb']['hypothesis_space'][hypothesis]['violations'][idx]=get_violation_tuples(data=raw_df, idx=idx, fd=hypothesis)

In [None]:
# Record the row labels of the dataset and turn every support/violation entry
# into a list of string labels, so they compare equal to 'data_indices'.
for dataset in new_scenarios_dict:
    new_scenarios_dict[dataset]['data_indices'] = [str(x) for x in raw_df.index]
    for hypothesis_info in new_scenarios_dict[dataset]['hypothesis_space'].values():
        for val_type in ['supports', 'violations']:
            related_by_idx = hypothesis_info[val_type]
            for idx in related_by_idx:
                # Drop any self reference while converting. Comparing as
                # strings handles int and str labels alike and does not
                # require the labels to be numeric.
                related_by_idx[idx] = [str(x) for x in related_by_idx[idx] if str(x) != str(idx)]

In [24]:
# Store the location of the raw dataset with the scenario info.
# NOTE(review): this path is relative to a different directory than `data_path`
# ("./data/raw-data/...") used when reading the file above - confirm the base
# directory the consumer of this entry resolves it against.
new_scenarios_dict['omdb']['dataset_path']="../raw-data/omdb-clean-full.csv"

In [25]:
# with open("./new_scenarios.json", 'w') as fp:
#     json.dump(new_scenarios_dict, fp)

In [173]:
# Reload the scenario info from disk; this replaces any in-memory
# new_scenarios_dict with the JSON-decoded version.
with open("./new_scenarios.json", 'r') as scenarios_file:
    new_scenarios_dict = json.load(scenarios_file)

In [24]:
# Inspect the stored info (lfd, rfd, supports, violations) of the first hypothesis.
new_scenarios_dict['omdb']['hypothesis_space'][hypothesis_space[0]]

{'lfd': ['title'],
 'rfd': ['director'],
 'supports': {'18': ['251'],
  '24': ['1098'],
  '33': ['1497'],
  '39': ['531'],
  '42': ['261'],
  '46': ['73'],
  '54': ['55'],
  '55': ['54'],
  '71': ['1505'],
  '73': ['46'],
  '81': ['128'],
  '96': ['404'],
  '98': ['1207', '2464'],
  '99': ['2524'],
  '108': ['2547'],
  '109': ['237'],
  '128': ['81'],
  '131': ['1691'],
  '187': ['314'],
  '189': ['1560'],
  '207': ['208'],
  '208': ['207'],
  '237': ['109'],
  '249': ['343'],
  '251': ['18'],
  '261': ['42'],
  '288': ['289'],
  '289': ['288'],
  '292': ['293'],
  '293': ['292'],
  '314': ['187'],
  '315': ['318'],
  '318': ['315'],
  '343': ['249'],
  '404': ['96'],
  '424': ['1202'],
  '429': ['430'],
  '430': ['429'],
  '449': ['1680'],
  '451': ['675'],
  '466': ['1697'],
  '469': ['1248'],
  '474': ['1792'],
  '499': ['1599'],
  '506': ['509'],
  '509': ['506'],
  '512': ['2553'],
  '527': ['1678'],
  '531': ['39'],
  '554': ['558'],
  '558': ['554'],
  '591': ['2623'],
  '616': 

## Model

In [116]:
# For every hypothesis, compute the share of its related tuple pairs that
# support it: supports / (supports + violations).
hypothesis_support_violation_ratio_info = dict()
for hypothesis, hypothesis_info_dict in new_scenarios_dict['omdb']['hypothesis_space'].items():
    # Only hypotheses involving 3 or 4 attributes in total are kept.
    if len(hypothesis_info_dict['lfd']+hypothesis_info_dict['rfd']) not in [3,4]:
        continue

    # Every pair is listed under both of its tuples, so each count is doubled;
    # the factor cancels in the ratio.
    support_pairs_num = sum(len(related) for related in hypothesis_info_dict['supports'].values())
    violation_pairs_num = sum(len(related) for related in hypothesis_info_dict['violations'].values())

    total_pairs_num = support_pairs_num+violation_pairs_num
    if total_pairs_num == 0:
        # No pair of tuples shares the left-hand side, so the ratio is
        # undefined; skip the hypothesis instead of dividing by zero.
        continue

    hypothesis_support_violation_ratio_info[hypothesis] = support_pairs_num/total_pairs_num
pprint(hypothesis_support_violation_ratio_info)


{'(director) => title, rating': 0.031937850668968495,
 '(director) => title, year': 0.031650122284563376,
 '(director) => title, year, rating': 0.03150625809236081,
 '(director) => year, rating': 0.4865486980290606,
 '(director, rating) => title': 0.04189469711266277,
 '(director, rating) => title, year': 0.04132855255708624,
 '(director, rating) => year': 0.6382336289866013,
 '(director, rating, title) => year': 0.9864864864864865,
 '(director, title) => rating': 0.9866666666666667,
 '(director, title) => year': 0.9777777777777777,
 '(rating) => title, director': 0.00019602458607466064,
 '(rating) => title, year': 0.0004653376435195773,
 '(rating) => title, year, director': 0.00019337560518175982,
 '(rating) => year, director': 0.0029862844599301904,
 '(rating, title) => director': 0.28756476683937826,
 '(rating, title) => year': 0.6826424870466321,
 '(title) => director, rating': 0.17156105100463678,
 '(title) => year, director': 0.17001545595054096,
 '(title) => year, director, rati

In [117]:
# Draw a confidence for each hypothesis uniformly from a window of +/-0.25
# around its support ratio, clipped to [0, 1]. The seed fixes the draw.
np.random.seed(1000)
model = {
    hypothesis: np.random.uniform(max(0,ratio-0.25), min(1,ratio+0.25))
    for hypothesis, ratio in hypothesis_support_violation_ratio_info.items()
}
model_dict={'omdb':{'model':model}}
# model = dict((hypothesis, ratio ) for hypothesis, ratio in hypothesis_support_violation_ratio_info.items())

pprint(model)

{'(director) => title, rating': 0.11056309192538867,
 '(director) => title, year': 0.008162687910321851,
 '(director) => title, year, rating': 0.09561013342732713,
 '(director) => year, rating': 0.6574190601555914,
 '(director, rating) => title': 0.263930489076405,
 '(director, rating) => title, year': 0.11600845233877378,
 '(director, rating) => year': 0.4230246694715786,
 '(director, rating, title) => year': 0.9227081119031307,
 '(director, title) => rating': 0.8010856919034102,
 '(director, title) => year': 0.7373343833312039,
 '(rating) => title, director': 0.0026074680492768117,
 '(rating) => title, year': 0.10405105392196197,
 '(rating) => title, year, director': 0.16352378464778208,
 '(rating) => year, director': 0.21631801048408889,
 '(rating, title) => director': 0.07484313491059225,
 '(rating, title) => year': 0.5492085857208221,
 '(title) => director, rating': 0.37312070859434476,
 '(title) => year, director': 0.07655055532775958,
 '(title) => year, director, rating': 0.0482

In [118]:
# Persist the sampled model. The target directory is created first so the
# write does not fail with FileNotFoundError when "./trainer" is missing.
os.makedirs("./trainer", exist_ok=True)
with open("./trainer/trainer_model.json", 'w') as fp:
    json.dump(model_dict, fp)

### Predict a tuple using a model

In [119]:
def predict_clean_tuple(idx, model):
    """Score one tuple by a confidence-weighted vote over the model's hypotheses.

    Args:
        idx: Index label of the tuple, as used in the supports/violations
            mappings of ``new_scenarios_dict['omdb']``.
        model: Mapping from hypothesis string to its confidence.

    Returns:
        ``(is_clean, total_score)`` where ``total_score`` is the sum over all
        hypotheses of ``confidence * (#supports - #violations)`` and
        ``is_clean`` is True only when that score is strictly positive.
    """
    score = 0
    for fd, weight in model.items():
        fd_info = new_scenarios_dict['omdb']['hypothesis_space'][fd]
        # Tuples without an entry contribute zero supports / violations.
        n_supports = len(fd_info['supports'].get(idx, []))
        n_violations = len(fd_info['violations'].get(idx, []))
        score += (weight*(n_supports-n_violations))

    return (True, score) if score > 0 else (False, score)

In [120]:
# Score every tuple with the sampled model, keeping all scores and the set of
# tuples the vote marks as clean.
model_score_dict = {}
clean_tuple_indices = set()
for idx in new_scenarios_dict['omdb']['data_indices']:
    is_clean, total_score = predict_clean_tuple(idx, model)
    model_score_dict[idx] = total_score
    if is_clean:
        clean_tuple_indices.add(idx)


In [121]:
# How many tuples the weighted vote marked as clean.
len(clean_tuple_indices)

0

In [122]:
# Attach a clean (True) / dirty (False) flag for every tuple to the model dict.
model_dict['omdb']['predictions'] = {
    idx: idx in clean_tuple_indices
    for idx in new_scenarios_dict['omdb']['data_indices']
}

In [123]:
# Overwrite the saved model, now including the per-tuple predictions. The
# target directory is created first so the write cannot fail on a missing
# "./trainer" folder.
os.makedirs("./trainer", exist_ok=True)
with open("./trainer/trainer_model.json", 'w') as fp:
    json.dump(model_dict, fp)

In [124]:
# Scratch calculation: 0.9 raised to the 10th power.
0.9**10

0.3486784401000001

## Compute the conditional probability of a tuple being clean, given that all the other tuples are clean
- Let's suppose t1 has compliance and violations with t2, t3 and t4 only. Then the conditional probability becomes independent of other variables
- P(t1=C|t2=C,t3=C,......) = P(t1=C|t2=C,t3=C,t4=C)
    - = P(t1=C, t2=C, t3=C, t4=C)/P(t2=C, t3=C, t4=C)
    - = P(t1=C, t2=C, t3=C, t4=C)/(P(t1=C, t2=C, t3=C, t4=C) + P(t1=D, t2=C, t3=C, t4=C))
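    - $= \dfrac{\frac{1}{Z}\exp\big(p\,[\#\text{compliance}(t_1{=}C, t_2{=}C, t_3{=}C, t_4{=}C) - \#\text{violations}(t_1{=}C, t_2{=}C, t_3{=}C, t_4{=}C)]\big)}{\frac{1}{Z}\Big(\exp\big(p\,[\#\text{compliance}(t_1{=}C, t_2{=}C, t_3{=}C, t_4{=}C) - \#\text{violations}(t_1{=}C, t_2{=}C, t_3{=}C, t_4{=}C)]\big) + \exp\big(p\,[\#\text{compliance}(t_1{=}D, t_2{=}C, t_3{=}C, t_4{=}C) - \#\text{violations}(t_1{=}D, t_2{=}C, t_3{=}C, t_4{=}C)]\big)\Big)}$, where $p$ is the confidence of the hypothesis (`model_probab` in the code below) and the normaliser $1/Z$ cancels.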

In [141]:
def get_conditional_clean_prob(idx, fd, valid_indices = None, confidence = None):
    """Probability that tuple ``idx`` is clean under ``fd``, given its related tuples are clean.

    With ``c`` supports, ``v`` violations and weight ``w`` the result is
    ``exp(w*(c-v)) / (exp(w*(c-v)) + exp(-w*(c-v)))``.

    Args:
        idx: Index label of the tuple; converted with ``str`` for the lookup.
        fd: Hypothesis key in ``new_scenarios_dict['omdb']['hypothesis_space']``.
        valid_indices: Optional container of index labels. When given, only
            related tuples contained in it are counted.
        confidence: Weight of ``fd``. When omitted, the global ``model_probab``
            is used, which the calling cells set as their loop variable.

    Returns:
        The conditional clean probability as a float in ``[0, 1]``;
        ``0.5`` when supports and violations balance out.
    """
    fd_info = new_scenarios_dict['omdb']['hypothesis_space'][fd]
    supports = fd_info['supports'].get(str(idx), [])
    violations = fd_info['violations'].get(str(idx), [])
    if valid_indices is not None:
        supports = [idx_ for idx_ in supports if idx_ in valid_indices]
        violations = [idx_ for idx_ in violations if idx_ in valid_indices]

    compliance_num = len(supports)
    violation_num = len(violations)

    weight = model_probab if confidence is None else confidence

    # exp(a) / (exp(a) + exp(-a)) equals 1 / (1 + exp(-2a)). Evaluate it in the
    # form whose exponent is never positive, so math.exp cannot overflow for
    # tuples with a large support/violation imbalance.
    margin = 2*weight*(compliance_num-violation_num)
    if margin >= 0:
        return 1/(1+math.exp(-margin))
    scaled = math.exp(margin)
    return scaled/(1+scaled)


In [142]:
model = model_dict['omdb']['model']
conditional_clean_probability_dict = dict()
clean_indices = set()
dirty_indices = set()

# Number of dirty tuples to keep, as a fraction of the number of clean tuples.
dirty_sample_percentage = 0.1

data_indices = new_scenarios_dict['omdb']['data_indices']

# Only the ten hypotheses with the highest confidence take part in the vote.
top_10_fds = dict(sorted(model.items(), key=itemgetter(1), reverse=True)[:10])

for idx in data_indices:
    conditional_clean_probability_dict[idx] = {'hypothesis':dict()}
    # The loop variable must be called model_probab: get_conditional_clean_prob
    # reads the weight of the current hypothesis from that global name.
    for fd, model_probab in top_10_fds.items():
        conditional_clean_probability_dict[idx]['hypothesis'][fd] = get_conditional_clean_prob(idx, fd)
    conditional_clean_probability_dict[idx]['average'] = np.mean(list(conditional_clean_probability_dict[idx]['hypothesis'].values()))
    # Strictly above 0.5: a tuple with no evidence at all (average exactly 0.5)
    # is labelled dirty.
    is_idx_clean = conditional_clean_probability_dict[idx]['average']>0.5
    conditional_clean_probability_dict[idx]['is_clean'] = is_idx_clean

    if is_idx_clean:
        clean_indices.add(idx)
    else:
        dirty_indices.add(idx)

pprint(conditional_clean_probability_dict)
print(len(clean_indices), len(dirty_indices))

# Sample from a sorted list: the iteration order of a set of strings differs
# between interpreter runs, which would make the seeded draw irreproducible.
# The sample size is capped so that drawing without replacement cannot fail.
dirty_sample_size = min(int(dirty_sample_percentage*len(clean_indices)), len(dirty_indices))
sampled_dirty_indices = np.random.choice(sorted(dirty_indices), dirty_sample_size, replace=False)
# tolist() turns the numpy strings back into plain str labels.
sampled_data_indices = clean_indices.union(sampled_dirty_indices.tolist())
print(len(sampled_data_indices))


{'0': {'average': 0.5,
       'hypothesis': {'(director) => year, rating': 0.5,
                      '(director, rating, title) => year': 0.5,
                      '(director, title) => rating': 0.5,
                      '(director, title) => year': 0.5,
                      '(rating, title) => year': 0.5,
                      '(title) => year, rating': 0.5,
                      '(title, director) => year, rating': 0.5,
                      '(title, year) => director, rating': 0.5,
                      '(year, director) => rating': 0.5,
                      '(year, director, title) => rating': 0.5},
       'is_clean': False},
 '1': {'average': 0.43634946749089626,
       'hypothesis': {'(director) => year, rating': 0.2116783776987315,
                      '(director, rating, title) => year': 0.5,
                      '(director, title) => rating': 0.5,
                      '(director, title) => year': 0.5,
                      '(rating, title) => year': 0.5,
              

## Rerun the model computation and is_clean prediction using this computed model

In [143]:
# Re-estimate the support ratio of every hypothesis using only pairs in which
# both tuples belong to the sampled data.
new_model = dict()
for hypothesis, hypothesis_info_dict in new_scenarios_dict['omdb']['hypothesis_space'].items():
    # Only hypotheses involving 3 or 4 attributes in total are kept.
    if len(hypothesis_info_dict['lfd']+hypothesis_info_dict['rfd']) not in [3,4]:
        continue

    support_pairs_num = sum(
        1
        for idx, related in hypothesis_info_dict['supports'].items() if idx in sampled_data_indices
        for idx1 in related if idx1 in sampled_data_indices
    )
    violation_pairs_num = sum(
        1
        for idx, related in hypothesis_info_dict['violations'].items() if idx in sampled_data_indices
        for idx1 in related if idx1 in sampled_data_indices
    )

    total_pairs_num = support_pairs_num+violation_pairs_num
    if total_pairs_num == 0:
        # The sample contains no pair sharing the left-hand side, so the
        # ratio is undefined; skip the hypothesis instead of dividing by zero.
        continue

    new_model[hypothesis] = support_pairs_num/total_pairs_num
pprint(new_model)

{'(director) => title, rating': 0.05758611622403366,
 '(director) => title, year': 0.05758611622403366,
 '(director) => title, year, rating': 0.05758611622403366,
 '(director) => year, rating': 0.7788587956876151,
 '(director, rating) => title': 0.060884070058381985,
 '(director, rating) => title, year': 0.060884070058381985,
 '(director, rating) => year': 0.8234639977759244,
 '(director, rating, title) => year': 1.0,
 '(director, title) => rating': 1.0,
 '(director, title) => year': 1.0,
 '(rating) => title, director': 0.0006384355704815378,
 '(rating) => title, year': 0.0008279255799851906,
 '(rating) => title, year, director': 0.0006384355704815378,
 '(rating) => year, director': 0.008634913971535685,
 '(rating, title) => director': 0.7373737373737373,
 '(rating, title) => year': 0.9562289562289562,
 '(title) => director, rating': 0.6460176991150443,
 '(title) => year, director': 0.6460176991150443,
 '(title) => year, director, rating': 0.6460176991150443,
 '(title) => year, rating'

In [146]:
new_conditional_clean_probability_dict = dict()
new_clean_indices, new_dirty_indices = set(), set()

# Ten highest-valued hypotheses of the re-estimated model.
ranked_fds = sorted(new_model.items(), key=itemgetter(1), reverse=True)
top_10_fds = dict(ranked_fds[:10])

for idx in sampled_data_indices:
    fd_probs = dict()
    # The loop variable must be called model_probab: get_conditional_clean_prob
    # reads the weight of the current hypothesis from that global name.
    for fd, model_probab in top_10_fds.items():
        fd_probs[fd] = get_conditional_clean_prob(idx, fd, valid_indices=sampled_data_indices)
    average = np.mean(list(fd_probs.values()))
    is_idx_clean = average>0.5
    new_conditional_clean_probability_dict[idx] = {
        'hypothesis': fd_probs,
        'average': average,
        'is_clean': is_idx_clean,
    }

    (new_clean_indices if is_idx_clean else new_dirty_indices).add(idx)

pprint(new_conditional_clean_probability_dict)
print(len(new_clean_indices), len(new_dirty_indices))

{'1000': {'average': 0.598948670233451,
          'hypothesis': {'(director, rating) => year': 0.9929017538403582,
                         '(director, rating, title) => year': 0.5,
                         '(director, title) => rating': 0.5,
                         '(director, title) => year': 0.5,
                         '(rating, title) => year': 0.5,
                         '(title) => year, rating': 0.5,
                         '(title, director) => year, rating': 0.5,
                         '(year, director) => rating': 0.996584948494152,
                         '(year, director, title) => rating': 0.5,
                         '(year, title) => rating': 0.5},
          'is_clean': True},
 '1001': {'average': 0.8001810488158542,
          'hypothesis': {'(director, rating) => year': 0.16152457211852314,
                         '(director, rating, title) => year': 0.8807970779778824,
                         '(director, title) => rating': 0.8807970779778824,
              

In [148]:
# Number of tuples whose clean/dirty label is the same in both passes,
# printed next to the size of the sample.
unchanged_clean = clean_indices.intersection(new_clean_indices)
unchanged_dirty = dirty_indices.intersection(new_dirty_indices)
print(len(unchanged_clean)+len(unchanged_dirty), len(sampled_data_indices))

1461 1470


## Sample df 

In [167]:
# Replace the saved model with the re-estimated one and the labels of the
# sampled tuples.
model_dict={'omdb':{'model':new_model}}
# str(idx) keeps the JSON keys plain strings even if a label is a numpy string.
model_dict['omdb']['predictions'] = {str(idx): idx in new_clean_indices for idx in sampled_data_indices}
# Create the target directory first so the write cannot fail on a missing folder.
os.makedirs("./trainer", exist_ok=True)
with open("./trainer/trainer_model.json", 'w') as fp:
    json.dump(model_dict, fp)

In [181]:
# Select the sampled rows from the raw table; the row order follows the
# iteration order of the index set.
sampled_row_labels = list(sampled_data_indices)
sampled_df = raw_df.loc[sampled_row_labels]
# sampled_df['is_clean'] = sampled_df.index.map(lambda x: model_dict['omdb']['predictions'][x])
# del sampled_df['is_clean']
sampled_df

Unnamed: 0,title,year,rating,genre,director,language,country,type
2218,The Scavenger Vortex,2013,TV-PG,Comedy,Mark Cendrowski,English,USA,episode
1447,New Birthday,2013,TV-MA,"Crime, Drama",Lesli Linka Glatter,English,USA,episode
1338,Big Episode: Someone Stole a Spoon,2013,TV-14,Comedy,James Widdoes,English,USA,episode
522,Everybody's a Fucking Critic,2013,TV-MA,"Comedy, Drama",Seith Mann,English,USA,episode
2486,Pay Pal,2014,TV-PG,"Animation, Comedy",Michael Polcino,English,USA,episode
...,...,...,...,...,...,...,...,...
1248,The Appearing,2014,R,"Horror, Mystery, Thriller",Daric Gates,English,USA,movie
117,House Hunting,2013,R,"Mystery, Thriller",Eric Hurt,English,USA,movie
2636,Rosie's Diner,2013,Not Rated,"Short, Action, Crime",Wes Williams II,English,USA,movie
1755,The Big Bust Theory,2013,TV-MA,Comedy,Dean McKendrick,English,USA,movie


In [170]:
# Write the sampled table (index included) into the processed-data folder,
# creating the folder when it is missing.
processed_dir = "./data/processed-data"
os.makedirs(processed_dir, exist_ok=True)
sampled_df.to_csv(f"{processed_dir}/omdb-sampled.csv")

In [179]:
# Check which fields are stored for a single hypothesis.
new_scenarios_dict['omdb']['hypothesis_space']['(title) => director'].keys()

dict_keys(['lfd', 'rfd', 'supports', 'violations'])

In [176]:
# new_scenarios_dict['omdb']['processed_dataset_path'] = "data/processed-data/omdb-sampled.csv"
# new_scenarios_dict['omdb']['raw_dataset_path'] = "data/raw-data/omdb-clean-full.csv"
# del new_scenarios_dict['omdb']['dataset_path']

# with open("./new_scenarios.json", 'w') as fp:
#     json.dump(new_scenarios_dict, fp)

## Final Process and Dump pickled data

In [5]:
# NOTE(review): the earlier cells write the model to
# "./trainer/trainer_model.json", while this cell reads "./trainer_model.json".
# Confirm which location is intended.
with open('./trainer_model.json', 'r') as f:
    models_dict = json.load(f)

# Hypotheses used by the model of each scenario.
required_fds = {
    scenario: set(scenario_info['model'].keys())
    for scenario, scenario_info in models_dict.items()
}

# Nothing else creates the output directory, so make sure it exists before writing.
os.makedirs("./data/processed-exp-data", exist_ok=True)

with open("./data/processed-exp-data/trainer_model.json", 'w') as fp:
    json.dump(models_dict, fp)

with open("./data/processed-exp-data/required_fds.pk", 'wb') as fp:
    pk.dump(required_fds, fp)


with open('./new_scenarios.json', 'r') as f:
    scenarios = json.load(f)

# Pre-process new_scenarios so that later processing is faster: keep only the
# sampled rows and the hypotheses the model uses, store the related indices as
# sets, and pre-compute the set of support / violation pairs.
processed_df = dict()
filtered_processed_scenarios = dict()
for dataset in scenarios:

    # Read the processed table of *this* dataset (not always 'omdb').
    processed_df[dataset] = pd.read_csv(
        scenarios[dataset]['processed_dataset_path'], index_col=0)
    processed_df[dataset].index = processed_df[dataset].index.map(str)
    required_indices = set(processed_df[dataset].index)

    # Add an entry for this dataset; reassigning the whole dict here would
    # throw away the datasets processed in earlier iterations.
    filtered_processed_scenarios[dataset] = {
        'data_indices': set(scenarios[dataset]['data_indices']).intersection(required_indices),
        'hypothesis_space': dict()}

    # Filter required fds and data_indices
    for hypothesis in tqdm(scenarios[dataset]['hypothesis_space']):
        if hypothesis not in required_fds[dataset]:
            continue

        hypothesis_info = scenarios[dataset]['hypothesis_space'][hypothesis]
        filtered_info = {'lfd': set(hypothesis_info['lfd']),
                         'rfd': set(hypothesis_info['rfd'])}

        for info_type in ['supports', 'violations']:
            filtered_info[info_type] = dict()

            # 'supports' -> 'support_pairs', 'violations' -> 'violation_pairs'
            pairs_key = f'{info_type[:-1]}_pairs'
            filtered_info[pairs_key] = set()
            for idx in hypothesis_info[info_type]:
                if idx not in required_indices:
                    continue

                related = set(hypothesis_info[info_type][idx]).intersection(required_indices)
                filtered_info[info_type][idx] = related

                # Store every unordered pair once, with the smaller label
                # (string comparison) first.
                filtered_info[pairs_key] |= set(
                    (idx, idx_) if idx<idx_ else (idx_, idx) for idx_ in related)

        filtered_processed_scenarios[dataset]['hypothesis_space'][hypothesis] = filtered_info


# Make sure the output directory exists before writing the pickles.
os.makedirs("./data/processed-exp-data", exist_ok=True)

with open("./data/processed-exp-data/filtered_processed_scenarios.pk", 'wb') as fp:
    pk.dump(filtered_processed_scenarios, fp)

with open("./data/processed-exp-data/processed_dfs.pk", 'wb') as fp:
    pk.dump(processed_df, fp)

100%|██████████| 50/50 [00:05<00:00,  8.36it/s]


In [222]:
# Check the top-level fields stored for the 'omdb' dataset.
filtered_processed_scenarios['omdb'].keys()

dict_keys(['data_indices', 'hypothesis_space'])

In [None]:
# Print the complete filtered structure (every dataset, hypothesis and index set).
print(filtered_processed_scenarios)

In [None]:
# Inspect the filtered supports mapping (index -> set of supporting indices) of one hypothesis.
filtered_processed_scenarios['omdb']['hypothesis_space']['(year, director) => rating']['supports']

In [219]:
# Spot check: is row label '3111' among the indices of the processed table read last?
'3111' in required_indices

True