## Import libraries

In [2]:
import pandas as pd
import json
from tqdm import tqdm
from multiprocessing import Pool
import os
from pprint import pprint
import numpy as np
import math
from operator import itemgetter
import pickle as pk
from tqdm import tqdm

## Read raw data

In [3]:
data_path = "./data/preprocess-data/omdb-clean-full.csv"

raw_df = pd.read_csv(data_path)
raw_df.rename(columns=dict((col, col.lower()) for col in raw_df.columns), inplace=True)
raw_df.index = raw_df.index.map(str)
raw_df.head()


Unnamed: 0,title,year,rating,genre,director,language,country,type
0,The Secret Life of Walter Mitty,2013,PG,"Adventure, Comedy, Drama",Ben Stiller,English,USA,movie
1,In Secret,2013,R,"Crime, Drama, Thriller",Charlie Stratton,English,USA,movie
2,The Giver,2014,PG-13,"Drama, Sci-Fi",Phillip Noyce,English,USA,movie
3,42,2013,PG-13,"Biography, Drama, Sport",Brian Helgeland,English,USA,movie
4,House at the End of the Drive,2014,R,"Horror, Thriller",David Worth,English,USA,movie


## Add info to new scenarios dict and dump the file

In [4]:
def parse_hypothesis(fd):
    lfd, rfd = fd.split("=>")

    '''Parse left fd and separate out the attributes'''
    left_attributes = lfd.strip().strip("(").strip(")").split(",")
    right_attributes = rfd.strip("(").strip(")").split(",")

    left_attributes = [attribute.strip() for attribute in left_attributes]
    right_attributes = [attribute.strip() for attribute in right_attributes]

    return left_attributes, right_attributes

In [5]:
def is_support_violation(fd_components, tuple_1, tuple_2):
    '''Parse the hypothesis'''
    lfd, rfd = fd_components

    '''Violation check is only needed if the lfd values are same in both tuples otherwise it's not a violation'''
    is_left_same = all(tuple_1[left_attribute] == tuple_2[left_attribute] for left_attribute in lfd)

    if is_left_same:
        is_right_same = all(tuple_1[left_attribute] == tuple_2[left_attribute] for left_attribute in rfd)
        if is_right_same:
            return True, False
        else:
            return False, True
    else:
        return False, False


In [6]:
def get_support_violation_tuples(data, idx, fd_components):
    supports = []
    violations = []
    for idx_ in data.index:
        if idx == idx_:
            continue
        is_support, is_violation = is_support_violation(fd_components=fd_components, tuple_1=data.iloc[idx], tuple_2=data.iloc[idx_])
        if is_support:
            supports.append(idx_)
        elif is_violation:
            violations.append(idx_)
    return supports, violations
        

In [7]:
def get_hypothesis_info_dict(hypothesis):
    '''Extract left and right attributes from the hypothesis as list of attributes'''
    lfd, rfd = parse_hypothesis(hypothesis)
    info_dict = {'lfd': lfd, 'rfd':rfd}

    '''Find pairwise violations of each tuple with respect to other tuples in the dataset'''
    info_dict['supports'] = dict()
    info_dict['violations'] = dict()
    for idx in tqdm(raw_df.index):
        supports, violations = get_support_violation_tuples(data=raw_df[lfd+rfd], idx=idx, fd_components=(lfd, rfd))
        info_dict['supports'][idx] = supports
        info_dict['violations'][idx] = violations
    
    return info_dict

# get_hypothesis_info_dict(hypothesis_space[0])
# hypothesis_space[0]

In [8]:
cpu_num = os.cpu_count()
cpu_num = 1

In [9]:
new_scenarios_dict = dict()
new_scenarios_dict['omdb'] = dict()
new_scenarios_dict['omdb']['hypothesis_space'] = dict()


with Pool(cpu_num) as p:
    hypothesis_info = p.map(get_hypothesis_info_dict, hypothesis_space)

for hypothesis, info_dict in zip(hypothesis_space, hypothesis_info):
    new_scenarios_dict['omdb']['hypothesis_space'][hypothesis] = info_dict

# for hypothesis in tqdm(hypothesis_space):
    
    # '''Extract left and right attributes from the hypothesis as list of attributes'''
    # lfd, rfd = parse_hypothesis(hypothesis)
    # new_scenarios_dict['omdb']['hypothesis_space'][hypothesis] = {'lfd': lfd, 'rfd':rfd}

    # '''Find pairwise violations of each tuple with respect to other tuples in the dataset'''
    # new_scenarios_dict['omdb']['hypothesis_space'][hypothesis]['violations'] = dict()
    # for idx in raw_df.index:
    #     new_scenarios_dict['omdb']['hypothesis_space'][hypothesis]['violations'][idx]=get_violation_tuples(data=raw_df, idx=idx, fd=hypothesis)

NameError: name 'hypothesis_space' is not defined

In [None]:
for dataset in new_scenarios_dict:
    new_scenarios_dict[dataset]['data_indices'] = [str(x) for x in raw_df.index]
    for hypothesis in new_scenarios_dict[dataset]['hypothesis_space']:
        for val_type in ['supports', 'violations']:
            for idx in new_scenarios_dict[dataset]['hypothesis_space'][hypothesis][val_type]:

                if int(idx) in new_scenarios_dict[dataset]['hypothesis_space'][hypothesis][val_type][idx]:
                    new_scenarios_dict[dataset]['hypothesis_space'][hypothesis][val_type][idx].remove(int(idx))

                '''Don't assign if the list contains self index'''
                if new_scenarios_dict[dataset]['hypothesis_space'][hypothesis][val_type][idx] != []:
                    new_scenarios_dict[dataset]['hypothesis_space'][hypothesis][val_type][idx] = [str(x) for x in new_scenarios_dict[dataset]['hypothesis_space'][hypothesis][val_type][idx] if str(x)!=str(idx)]

In [24]:
new_scenarios_dict['omdb']['processed_dataset_path'] = "data/processed-data/omdb-sampled.csv"
new_scenarios_dict['omdb']['raw_dataset_path'] = "data/preprocess-data/omdb-clean-full.csv"

with open("./new_scenarios.json", 'w') as fp:
    json.dump(new_scenarios_dict, fp)

In [25]:
# with open("./new_scenarios.json", 'w') as fp:
#     json.dump(new_scenarios_dict, fp)

## Read new scenarios file

In [118]:
with open("./new_scenarios.json", 'r') as fp:
    new_scenarios_dict = json.load(fp) 

In [119]:
# new_scenarios_dict['omdb']['hypothesis_space'][hypothesis_space[0]]

## Model

In [120]:
hypothesis_support_violation_ratio_info = dict()
for hypothesis in new_scenarios_dict['omdb']['hypothesis_space']:
    hypothesis_info_dict = new_scenarios_dict['omdb']['hypothesis_space'][hypothesis]
    if len(hypothesis_info_dict['lfd']+hypothesis_info_dict['rfd']) not in [3,4]:
        continue
    
    support_pairs_num, violation_pairs_num = 0,0
    for idx in hypothesis_info_dict['supports']:
        support_pairs_num += len(hypothesis_info_dict['supports'][idx])

    for idx in hypothesis_info_dict['violations']:
        violation_pairs_num += len(hypothesis_info_dict['violations'][idx])
    
    hypothesis_support_violation_ratio_info[hypothesis] = support_pairs_num/(support_pairs_num+violation_pairs_num)
# pprint(hypothesis_support_violation_ratio_info)


In [121]:
'''Sample confidence from 0 to support_violation_ratio'''
np.random.seed(1000)
model = dict((hypothesis, np.random.uniform(max(0,ratio-0.25), min(1,ratio+0.25) )) for hypothesis, ratio in hypothesis_support_violation_ratio_info.items())
model_dict={'omdb':{'model':model}}
# model = dict((hypothesis, ratio ) for hypothesis, ratio in hypothesis_support_violation_ratio_info.items())

# pprint(model)

In [122]:
with open("./trainer_model.json", 'w') as fp:
    json.dump(model_dict, fp)

## Compute the coditional probability of a tuple being cleaned conditional to all the tuples being clean
- Let's suppose t1 has compliance and violations with t2, t3 and t4 only. Then the conditional probability becomes independent of other variables
- P(t1=C|t2=C,t3=C,......) = P(t1=C|t2=C,t3=C,t4=C)
    - = P(t1=C, t2=C, t3=C, t4=C)/P(t2=C, t3=C, t4=C)
    - = P(t1=C, t2=C, t3=C, t4=C)/(P(t1=C, t2=C, t3=C, t4=C) + P(t1=D, t2=C, t3=C, t4=C))
    - = 1/Z\*exp(p*(#ofcompliance(t1=C, t2=C, t3=C, t4=C)-#ofviolations_(t1=C, t2=C, t3=C, t4=C)))/[1/Z*(exp(p*(#ofcompliance(t1=C, t2=C, t3=C, t4=C)-#ofviolations_(t1=C, t2=C, t3=C, t4=C)))+(exp(p*(#ofcompliance(t1=D, t2=C, t3=C, t4=C)-#ofviolations_(t1=D, t2=C, t3=C, t4=C))))]

In [123]:
def get_conditional_clean_prob(idx, fd, model_probab, valid_indices = None):
    if valid_indices is None:
        compliance_num = len(new_scenarios_dict['omdb']['hypothesis_space'][fd]['supports'].get(str(idx), []))
        violation_num = len(new_scenarios_dict['omdb']['hypothesis_space'][fd]['violations'].get(str(idx),[]))
    else:
        compliance_num = len([idx_ for idx_ in new_scenarios_dict['omdb']['hypothesis_space'][fd]['supports'].get(str(idx), []) if idx_ in valid_indices])
        violation_num = len([idx_ for idx_ in new_scenarios_dict['omdb']['hypothesis_space'][fd]['violations'].get(str(idx),[]) if idx_ in valid_indices])

    tuple_clean_score = math.exp(model_probab*(compliance_num-violation_num))
    tuple_dirty_score = math.exp(model_probab*(-compliance_num+violation_num))
    cond_p_clean = tuple_clean_score/(tuple_clean_score+tuple_dirty_score)
    return cond_p_clean


In [124]:
model = model_dict['omdb']['model']
conditional_clean_probability_dict = dict()
clean_indices = set()
dirty_indices = set()

clean_max_num = 1500
dirty_sample_percentage = 0.1

data_indices = new_scenarios_dict['omdb']['data_indices']

top_10_fds = dict(sorted(model.items(), key=itemgetter(1), reverse=True)[:10])

for idx in data_indices:
    conditional_clean_probability_dict[idx] = {'hypothesis':dict()}
    for fd, model_probab in top_10_fds.items():
        conditional_clean_probability_dict[idx]['hypothesis'][fd] = get_conditional_clean_prob(idx, fd, model_probab=model_probab)
    conditional_clean_probability_dict[idx]['average'] = np.mean(list(conditional_clean_probability_dict[idx]['hypothesis'].values()))
    is_idx_clean = conditional_clean_probability_dict[idx]['average']>=0.5
    conditional_clean_probability_dict[idx]['is_clean'] = is_idx_clean

    if is_idx_clean:
        clean_indices.add(idx)
    else:
        dirty_indices.add(idx)
else:
    # pprint(conditional_clean_probability_dict)
    print(len(clean_indices), len(dirty_indices))

clean_sample_idxs = np.random.choice(list(clean_indices), min(len(clean_indices), clean_max_num), replace=False)
dirty_sample_idxs = np.random.choice(list(dirty_indices), int(dirty_sample_percentage*len(clean_sample_idxs)), replace=False)
sampled_data_indices = set(clean_sample_idxs).union(set(dirty_sample_idxs))
print(len(sampled_data_indices))


2403 890
1650


## Rerun the model computation and is_clean prediction using this computed model

In [125]:
from copy import deepcopy

In [126]:
'''Assume every data to be clean at the beginning and compute the model based on that'''
new_model = deepcopy(model_dict['omdb']['model'])
model_mae = float("inf")

new_data_indices = sampled_data_indices

while model_mae > 1e-05:

    '''Use current model to predict clean and dirty indices'''
    top_10_fds = dict(
        sorted(new_model.items(), key=itemgetter(1), reverse=True)[:10])

    new_conditional_clean_probability_dict = dict()
    new_clean_indices = set()
    new_dirty_indices = set()

    for idx in new_data_indices:
        new_conditional_clean_probability_dict[idx] = {'hypothesis': dict()}
        for fd, model_probab in top_10_fds.items():
            new_conditional_clean_probability_dict[idx]['hypothesis'][fd] = get_conditional_clean_prob(
                idx, fd, model_probab=model_probab, valid_indices=new_data_indices)
        new_conditional_clean_probability_dict[idx]['average'] = np.mean(
            list(new_conditional_clean_probability_dict[idx]['hypothesis'].values()))
        is_idx_clean = new_conditional_clean_probability_dict[idx]['average'] >= 0.5
        new_conditional_clean_probability_dict[idx]['is_clean'] = is_idx_clean

        if is_idx_clean:
            new_clean_indices.add(idx)
        else:
            new_dirty_indices.add(idx)

    else:
        # pprint(new_conditional_clean_probability_dict)
        print(f"Clean Data Number: {len(new_clean_indices)},"
              f"Dirty Data Number: {len(new_dirty_indices)},"
              f"Dirty Data Proportion: {len(new_dirty_indices)/len(new_clean_indices.union(new_dirty_indices))}")

    '''Use clean data to estimate model'''
    model_mae = 0
    for hypothesis in new_scenarios_dict['omdb']['hypothesis_space']:
        hypothesis_info_dict = new_scenarios_dict['omdb']['hypothesis_space'][hypothesis]
        if len(hypothesis_info_dict['lfd']+hypothesis_info_dict['rfd']) not in [3, 4]:
            continue

        '''Only consider the clean estimated indices'''
        support_pairs_num, violation_pairs_num = 0, 0
        for idx in hypothesis_info_dict['supports']:
            if idx not in new_clean_indices:
                continue
            support_pairs_num += len([idx1 for idx1 in hypothesis_info_dict['supports']
                                     [idx] if idx1 in new_clean_indices])

        for idx in hypothesis_info_dict['violations']:
            if idx not in new_clean_indices:
                continue
            violation_pairs_num += len(
                [idx1 for idx1 in hypothesis_info_dict['violations'][idx] if idx1 in new_clean_indices])

        fd_prob = support_pairs_num/(support_pairs_num+violation_pairs_num)

        '''Compute mae with previous model value'''
        model_mae += abs(new_model[hypothesis]-fd_prob)
        new_model[hypothesis] = fd_prob

    print(f"MAE: {model_mae}")


Clean Data Number: 1529,Dirty Data Number: 121,Dirty Data Proportion: 0.07333333333333333
MAE: 8.596094074371745
Clean Data Number: 1548,Dirty Data Number: 102,Dirty Data Proportion: 0.06181818181818182
MAE: 0.3976471601897835
Clean Data Number: 1548,Dirty Data Number: 102,Dirty Data Proportion: 0.06181818181818182
MAE: 0.0


## Sample df 

In [127]:
# clean_max_num = 1500
# dirty_sample_percentage = 0.1

# clean_sample_idxs = np.random.choice(list(new_clean_indices), min(len(new_clean_indices), clean_max_num), replace=False)
# dirty_sample_idxs = np.random.choice(list(new_dirty_indices), int(dirty_sample_percentage*len(clean_sample_idxs)), replace=False)
# # sampled_data_indices = set(clean_sample_idxs).union(set(dirty_sample_idxs))
# sampled_data_indices = new_clean_indices.union(new_dirty_indices)


In [128]:
model_dict={'omdb':{'model':new_model}}
model_dict['omdb']['predictions'] =  dict((idx, True) if idx in new_clean_indices else (idx,False) for idx in new_data_indices)
with open("./trainer_model.json", 'w') as fp:
    json.dump(model_dict, fp)

In [129]:
sampled_df = raw_df.loc[list(new_data_indices)]
# sampled_df['is_clean'] = sampled_df.index.map(lambda x: model_dict['omdb']['predictions'][x])
# del sampled_df['is_clean']
sampled_df

Unnamed: 0,title,year,rating,genre,director,language,country,type
3212,We're Done,2014,TV-14,"Comedy, Drama",Cherie Nowlan,English,USA,episode
3166,Battles Bouts and Brawls: The Story of Pro Wre...,2014,Not Rated,Sport,Mark Nowotarski,English,USA,movie
1727,World's Busiest Border Crossing,2013,TV-G,Documentary,Eoin O'Shea,English,USA,movie
126,The Armstrong Lie,2013,R,Documentary,Alex Gibney,English,USA,movie
1145,Jaz,2013,Unrated,"Short, Music",Greg W. Locke,English,USA,movie
...,...,...,...,...,...,...,...,...
798,By the Gun,2014,R,"Crime, Drama, Thriller",James Mottern,English,USA,movie
1338,Big Episode: Someone Stole a Spoon,2013,TV-14,Comedy,James Widdoes,English,USA,episode
2189,The Gift,2013,TV-14,Comedy,Gail Mancuso,English,USA,episode
391,Oconomowoc,2013,Not Rated,"Comedy, Drama",Andy Gillies,English,USA,movie


In [130]:
os.makedirs("./data/processed-data", exist_ok=True)
sampled_df.to_csv("./data/processed-data/omdb-sampled.csv")

In [131]:
new_scenarios_dict['omdb']['hypothesis_space']['(title) => director'].keys()

dict_keys(['lfd', 'rfd', 'supports', 'violations'])

## Final Process and Dump pickled data

In [133]:
with open('./trainer_model.json', 'r') as f:
    models_dict = json.load(f)

required_fds = dict(
    (scenario, set(models_dict[scenario]['model'].keys())) for scenario in models_dict)

with open("./data/processed-exp-data/trainer_model.json", 'w') as fp:
    json.dump(models_dict, fp)

with open("./data/processed-exp-data/required_fds.pk", 'wb') as fp:
    pk.dump(required_fds, fp)


with open('./new_scenarios.json', 'r') as f:
    scenarios = json.load(f)

'''Process new_scenarios to make the processing faster later'''
processed_df = dict()
filtered_processed_scenarios = dict()
for dataset in scenarios:

    processed_df[dataset] = pd.read_csv(
        scenarios['omdb']['processed_dataset_path'], index_col=0)
    processed_df[dataset].index = processed_df[dataset].index.map(str)
    required_indices = set(processed_df[dataset].index)

    filtered_processed_scenarios= {dataset:{'data_indices': set(
        scenarios[dataset]['data_indices']).intersection(required_indices), 'hypothesis_space': dict()}}

    '''Filter required fds and data_indices'''
    for hypothesis in tqdm(scenarios[dataset]['hypothesis_space']):
        if hypothesis not in required_fds[dataset]:
            continue
        

        filtered_processed_scenarios[dataset]['hypothesis_space'][hypothesis]={'lfd':set(
            scenarios[dataset]['hypothesis_space'][hypothesis]['lfd']),
        'rfd': set(
            scenarios[dataset]['hypothesis_space'][hypothesis]['rfd'])}

        for info_type in ['supports', 'violations']:
            filtered_processed_scenarios[dataset]['hypothesis_space'][hypothesis][info_type] = dict()

            filtered_processed_scenarios[dataset]['hypothesis_space'][hypothesis][f'{info_type[:-1]}_pairs'] = set()
            for idx in scenarios[dataset]['hypothesis_space'][hypothesis][info_type]:
                if idx not in required_indices:
                    continue

                filtered_processed_scenarios[dataset]['hypothesis_space'][hypothesis][info_type][idx] = set(
                    scenarios[dataset]['hypothesis_space'][hypothesis][info_type][idx]).intersection(required_indices)
                
                pairs = set((idx, idx_) if idx<idx_ else (idx_, idx) for idx_ in filtered_processed_scenarios[dataset]['hypothesis_space'][hypothesis][info_type][idx])
                filtered_processed_scenarios[dataset]['hypothesis_space'][hypothesis][f'{info_type[:-1]}_pairs'] |= pairs


with open("./data/processed-exp-data/filtered_processed_scenarios.pk", 'wb') as fp:
    pk.dump(filtered_processed_scenarios, fp)

with open("./data/processed-exp-data/processed_dfs.pk", 'wb') as fp:
    pk.dump(processed_df, fp)

100%|██████████| 50/50 [00:06<00:00,  8.10it/s]


In [134]:
filtered_processed_scenarios['omdb'].keys()

dict_keys(['data_indices', 'hypothesis_space'])

## Final Validation

In [135]:
import json
import pickle as pk
from statistics import mean
from operator import itemgetter
import math

In [136]:
with open('./trainer_model.json', 'r') as f:
    _models_dict = json.load(f)

with open("./data/processed-exp-data/filtered_processed_scenarios.pk", 'rb') as fp:
    _filtered_processed_scenarios = pk.load(fp)

with open("./data/processed-exp-data/processed_dfs.pk", 'rb') as fp:
    _processed_df = pk.load(fp)



In [137]:
def compute_conditional_clean_prob(idx, fd, fd_prob, scenario_id, data_indices=None):
    if data_indices is None:
        compliance_num = len(
            _filtered_processed_scenarios[scenario_id]['hypothesis_space'][fd]['supports'].get(idx, []))
        violation_num = len(
            _filtered_processed_scenarios[scenario_id]['hypothesis_space'][fd]['violations'].get(idx, []))
    else:
        compliance_num = len([idx_ for idx_ in _filtered_processed_scenarios[scenario_id]['hypothesis_space']
                                [fd]['supports'].get(idx, [])
                                if idx_ in data_indices])
        violation_num = len([idx_ for idx_ in _filtered_processed_scenarios[scenario_id]['hypothesis_space']
                            [fd]['violations'].get(idx, []) if idx_ in data_indices])

    tuple_clean_score = math.exp(fd_prob*(compliance_num-violation_num))
    tuple_dirty_score = math.exp(fd_prob*(-compliance_num+violation_num))
    cond_p_clean = tuple_clean_score/(tuple_clean_score+tuple_dirty_score)

    return cond_p_clean

def get_average_cond_clean_prediction(indices, model, scenario_id):
    conditional_clean_probability_dict = dict()
    indices = set(indices)
    for idx in indices:
        cond_clean_prob = mean([compute_conditional_clean_prob(
            idx=idx, fd=fd, fd_prob=fd_prob, scenario_id=scenario_id,
            data_indices=indices) for fd, fd_prob in model.items()])  # whether to include the validation_indices or all the data_indices while computing the conditional clean probability
        conditional_clean_probability_dict[idx] = cond_clean_prob
    return conditional_clean_probability_dict

In [138]:

_model = dict(sorted(_models_dict['omdb']['model'].items(), key=itemgetter(1),
                    reverse=True)[:10])
_clean_indices = set([idx for idx in _models_dict['omdb']['predictions'] if _models_dict['omdb']['predictions'][idx]])
_clean_probab_dict = get_average_cond_clean_prediction(_processed_df['omdb'].index, model=_model, scenario_id='omdb')

In [139]:
# Check the clean label in the model file
for idx in _processed_df['omdb'].index:
    _clean = _clean_probab_dict[idx] >= 0.5
    if _clean == _models_dict['omdb']['predictions'][idx]:
        continue
    print(idx, _clean, _models_dict['omdb']['predictions'][idx])

In [140]:
# Check the aggreage model on the overall data
for hypothesis in _models_dict['omdb']['model']:
    hypothesis_info_dict = _filtered_processed_scenarios['omdb']['hypothesis_space'][hypothesis]

    support_pairs_num, violation_pairs_num = 0,0
    for idx in hypothesis_info_dict['supports']:
        if idx not in _clean_indices:
            continue
        support_pairs_num += len(set(hypothesis_info_dict['supports'][idx]).intersection(_clean_indices))

    for idx in hypothesis_info_dict['violations']:
        if idx not in _clean_indices:
            continue
        violation_pairs_num += len(set(hypothesis_info_dict['violations'][idx]).intersection(_clean_indices))
    
    is_correct = (support_pairs_num/(support_pairs_num+violation_pairs_num)) == _models_dict['omdb']['model'][hypothesis]
    
    if not is_correct:
        print((support_pairs_num/(support_pairs_num+violation_pairs_num)),  _models_dict['omdb']['model'][hypothesis])

In [142]:
from random import sample
validation_indices = {'omdb': sample(list(_models_dict['omdb']['predictions'].keys()), 1000)}

with open('./data/processed-data/validation_indices.json','w') as fp:
    json.dump(validation_indices, fp)

validation_indices['omdb'] = set(validation_indices['omdb'])

with open('./data/processed-exp-data/validation_indices.pk','wb') as fp:
    pk.dump(validation_indices, fp)