## Import libraries

In [62]:
import pandas as pd
import json
from tqdm import tqdm
from multiprocessing import Pool
import os
from pprint import pprint
import numpy as np

## Read raw data

In [3]:
# Location of the cleaned OMDb export, relative to the notebook.
data_path = "./data/raw-data/omdb-clean-full.csv"

# Load the CSV and lower-case every column label in one chained expression,
# so later cells can refer to attributes by their lower-case names.
raw_df = pd.read_csv(data_path).rename(columns=str.lower)
raw_df.head()


Unnamed: 0,title,year,rating,genre,director,language,country,type
0,The Secret Life of Walter Mitty,2013,PG,"Adventure, Comedy, Drama",Ben Stiller,English,USA,movie
1,In Secret,2013,R,"Crime, Drama, Thriller",Charlie Stratton,English,USA,movie
2,The Giver,2014,PG-13,"Drama, Sci-Fi",Phillip Noyce,English,USA,movie
3,42,2013,PG-13,"Biography, Drama, Sport",Brian Helgeland,English,USA,movie
4,House at the End of the Drive,2014,R,"Horror, Thriller",David Worth,English,USA,movie


In [6]:
# Load the scenario definitions and collect the functional-dependency strings
# (the 'cfd' field of each entry) listed under scenario "3".
with open("./scenarios.json", 'r') as fp:
    scenarios = json.load(fp)

required_scenario_info = scenarios["3"]
hypothesis_space = []
for entry in required_scenario_info['hypothesis_space']:
    hypothesis_space.append(entry['cfd'])
print(hypothesis_space)

['(title) => director', '(rating) => title, year, director', '(title) => year, director, rating', '(year, rating) => director', '(year, director, title) => rating', '(title) => year, rating', '(year) => title, director', '(year) => rating, director', '(year) => title, rating', '(rating, title) => year', '(director) => year, rating', '(year, rating, title) => director', '(title, rating) => year, director', '(year) => title', '(director) => title, rating', '(title) => year, director', '(year, director) => title, rating', '(director, rating) => year', '(year, rating) => title', '(title) => rating', '(title, year) => director, rating', '(year, rating) => title, director', '(rating) => title, year', '(director) => title, year', '(year) => title, director, rating', '(director) => title, year, rating', '(director, rating, title) => year', '(year, director) => title', '(rating) => year', '(director, title) => year', '(rating) => year, director', '(title, director) => year, rating', '(year, tit

## Compute pairwise supports and violations for each hypothesis, add them to a new scenarios dict, and dump it to a file

In [7]:
def parse_hypothesis(fd):
    """Split a functional-dependency string into its attribute name lists.

    The expected shape is ``"<left side> => <right side>"`` where each side is
    a comma-separated list of attribute names, optionally wrapped in
    parentheses, e.g. ``"(year, rating) => title, director"``.

    Args:
        fd: The functional dependency as a string containing exactly one "=>".

    Returns:
        A ``(left_attributes, right_attributes)`` tuple of lists of attribute
        names with surrounding whitespace removed.

    Raises:
        ValueError: If ``fd`` does not split into exactly two parts on "=>".
    """
    lfd, rfd = fd.split("=>")

    def _attributes(side):
        # Whitespace must be trimmed *before* the parentheses are removed;
        # otherwise a side such as " (year, rating)" keeps its "(" because the
        # leading space shields it from str.strip("(").
        unwrapped = side.strip().strip("(").strip(")")
        return [attribute.strip() for attribute in unwrapped.split(",")]

    # Both sides go through the same normalisation so that a parenthesised
    # right-hand side is parsed the same way as a parenthesised left-hand side.
    return _attributes(lfd), _attributes(rfd)

In [8]:
def is_support_violation(fd_components, tuple_1, tuple_2):
    """Classify a pair of tuples with respect to one functional dependency.

    Args:
        fd_components: ``(lfd, rfd)`` pair of attribute-name lists.
        tuple_1: First record; indexed by attribute name.
        tuple_2: Second record; indexed by attribute name.

    Returns:
        ``(is_support, is_violation)``:
        * ``(False, False)`` when the tuples differ on some left-hand attribute,
        * ``(True, False)`` when they agree on every left- and right-hand attribute,
        * ``(False, True)`` when they agree on the left but differ on the right.
    """
    determinant, dependent = fd_components

    def agree_on(attributes):
        # True when both tuples hold equal values for every listed attribute.
        return all(tuple_1[name] == tuple_2[name] for name in attributes)

    # The pair only says something about the dependency if the left-hand
    # values coincide; otherwise it is neither a support nor a violation.
    if not agree_on(determinant):
        return False, False

    if agree_on(dependent):
        return True, False

    return False, True


In [9]:
def get_support_violation_tuples(data, idx, fd_components):
    """Find the rows that support or violate a dependency together with row ``idx``.

    Args:
        data: DataFrame holding (at least) the attributes named in
            ``fd_components``. Index labels are assumed to be unique.
        idx: Index *label* of the reference row in ``data``.
        fd_components: ``(lfd, rfd)`` pair of attribute-name lists.

    Returns:
        ``(supports, violations)``: two lists of index labels of the other
        rows that, paired with row ``idx``, support / violate the dependency.
        Row ``idx`` itself is never included.
    """
    # The loop below walks index *labels*, so rows are fetched by label too.
    # (Positional ``.iloc`` only coincides with labels for a default
    # RangeIndex.) The reference row does not change inside the loop, so it
    # is looked up once.
    reference_row = data.loc[idx]

    supports = []
    violations = []
    for idx_ in data.index:
        if idx == idx_:
            continue
        is_support, is_violation = is_support_violation(
            fd_components=fd_components,
            tuple_1=reference_row,
            tuple_2=data.loc[idx_],
        )
        if is_support:
            supports.append(idx_)
        elif is_violation:
            violations.append(idx_)
    return supports, violations
        

In [21]:
def get_hypothesis_info_dict(hypothesis):
    """Build the support/violation lookup for one hypothesis over ``raw_df``.

    Args:
        hypothesis: Functional dependency string understood by
            ``parse_hypothesis``.

    Returns:
        A dict with keys ``'lfd'`` and ``'rfd'`` (attribute-name lists) and
        ``'supports'`` / ``'violations'``, each mapping every index label of
        ``raw_df`` to the list of other index labels that support / violate
        the dependency when paired with that row.
    """
    # Extract left and right attributes from the hypothesis as lists.
    lfd, rfd = parse_hypothesis(hypothesis)
    info_dict = {'lfd': lfd, 'rfd': rfd}

    # The projection onto the relevant attributes is the same for every row,
    # so it is built once instead of once per iteration. Attribute names are
    # de-duplicated (order preserved) because selecting a column twice makes
    # ``row[attribute]`` return several values and the equality test ambiguous.
    relevant_columns = list(dict.fromkeys(lfd + rfd))
    projected = raw_df[relevant_columns]

    # Find the pairwise supports/violations of each tuple against all others.
    info_dict['supports'] = dict()
    info_dict['violations'] = dict()
    for idx in tqdm(raw_df.index):
        supports, violations = get_support_violation_tuples(
            data=projected, idx=idx, fd_components=(lfd, rfd)
        )
        info_dict['supports'][idx] = supports
        info_dict['violations'][idx] = violations

    return info_dict

# get_hypothesis_info_dict(hypothesis_space[0])
# hypothesis_space[0]

In [19]:
# Number of worker processes handed to the multiprocessing pool below:
# whatever CPU count the OS reports.
cpu_num = os.cpu_count()
# cpu_num = 1  # uncomment to force a single worker process

In [20]:
# Compute the support/violation info of every hypothesis in parallel, one
# hypothesis per task. `Pool.map` returns its results in input order, so they
# can be paired back with `hypothesis_space` positionally.
with Pool(cpu_num) as p:
    hypothesis_info = p.map(get_hypothesis_info_dict, hypothesis_space)

# Nest the results as {dataset: {'hypothesis_space': {hypothesis: info_dict}}}.
new_scenarios_dict = {
    'omdb': {
        'hypothesis_space': dict(zip(hypothesis_space, hypothesis_info)),
    },
}

  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          | 0/3293 [00:00<?, ?it/s]
  0%|          |

{'omdb': {'hypothesis_space': {'(title) => director': {'lfd': ['title'], 'rfd': ['director'], 'supports': {0: []}, 'violations': {0: []}}, '(rating) => title, year, director': {'lfd': ['rating'], 'rfd': ['title', 'year', 'director'], 'supports': {0: []}, 'violations': {0: [13, 14, 15, 20, 29, 36, 74, 78, 82, 119, 120, 121, 125, 132, 133, 142, 150, 153, 176, 181, 195, 200, 203, 207, 208, 209, 222, 225, 238, 267, 288, 289, 299, 321, 337, 381, 387, 403, 424, 434, 436, 447, 512, 520, 527, 528, 529, 530, 550, 559, 596, 678, 679, 718, 722, 740, 746, 782, 803, 806, 917, 1061, 1179, 1202, 1246, 1266, 1310, 1432, 1512, 1519, 1523, 1569, 1678, 1694, 1704, 1724, 1730, 1735, 1747, 1754, 1786, 1808, 1817, 1833, 1897, 1903, 1933, 1934, 1935, 1937, 1943, 1957, 1959, 1966, 1970, 1996, 2005, 2024, 2050, 2154, 2159, 2299, 2379, 2457, 2553, 2658, 2813, 2830, 2939, 3068, 3109, 3157, 3186, 3250, 3251]}}, '(title) => year, director, rating': {'lfd': ['title'], 'rfd': ['year', 'director', 'rating'], 'support

In [115]:
# Normalise every stored row index to a string and make sure no row lists
# itself among its own supports/violations.
for dataset, dataset_info in new_scenarios_dict.items():
    dataset_info['data_indices'] = [str(x) for x in raw_df.index]
    for hypothesis, hypothesis_info in dataset_info['hypothesis_space'].items():
        for val_type in ('supports', 'violations'):
            partners_by_idx = hypothesis_info[val_type]
            # Only values of existing keys are replaced, so iterating the
            # dict while assigning into it is safe.
            for idx, partners in partners_by_idx.items():
                # Drop the row's own (integer) index if it is present.
                own_idx = int(idx)
                if own_idx in partners:
                    partners.remove(own_idx)

                # Non-empty lists are rebuilt as strings, again skipping the
                # row's own index; empty lists are left as they are.
                if partners != []:
                    partners_by_idx[idx] = [str(x) for x in partners if str(x) != str(idx)]

In [116]:
# Persist the support/violation lookup as JSON.
# Note: JSON object keys are always strings, so any integer row keys in the
# nested dicts are written out as strings.
with open("./new_scenarios.json", 'w') as fp:
    json.dump(new_scenarios_dict, fp)

In [117]:
# Reload the file just written. After this round trip the row keys of the
# 'supports'/'violations' dicts are strings, matching the string entries of
# 'data_indices' that the prediction cells use for lookups.
with open("./new_scenarios.json", 'r') as fp:
    new_scenarios_dict = json.load(fp) 

In [118]:
# Inspect the reloaded entry for the first hypothesis of the hypothesis space.
new_scenarios_dict['omdb']['hypothesis_space'][hypothesis_space[0]]

{'lfd': ['title'],
 'rfd': ['director'],
 'supports': {'18': ['251'],
  '24': ['1098'],
  '33': ['1497'],
  '39': ['531'],
  '42': ['261'],
  '46': ['73'],
  '54': ['55'],
  '55': ['54'],
  '71': ['1505'],
  '73': ['46'],
  '81': ['128'],
  '96': ['404'],
  '98': ['1207', '2464'],
  '99': ['2524'],
  '108': ['2547'],
  '109': ['237'],
  '128': ['81'],
  '131': ['1691'],
  '187': ['314'],
  '189': ['1560'],
  '207': ['208'],
  '208': ['207'],
  '237': ['109'],
  '249': ['343'],
  '251': ['18'],
  '261': ['42'],
  '288': ['289'],
  '289': ['288'],
  '292': ['293'],
  '293': ['292'],
  '314': ['187'],
  '315': ['318'],
  '318': ['315'],
  '343': ['249'],
  '404': ['96'],
  '424': ['1202'],
  '429': ['430'],
  '430': ['429'],
  '449': ['1680'],
  '451': ['675'],
  '466': ['1697'],
  '469': ['1248'],
  '474': ['1792'],
  '499': ['1599'],
  '506': ['509'],
  '509': ['506'],
  '512': ['2553'],
  '527': ['1678'],
  '531': ['39'],
  '554': ['558'],
  '558': ['554'],
  '591': ['2623'],
  '616': 

## Model

In [119]:
# For every hypothesis, compute the share of supporting pairs among all pairs
# that agree on the left-hand side: supports / (supports + violations).
hypothesis_support_violation_ratio_info = dict()
for hypothesis, hypothesis_info_dict in new_scenarios_dict['omdb']['hypothesis_space'].items():
    support_pairs_num = sum(
        len(partners) for partners in hypothesis_info_dict['supports'].values()
    )
    violation_pairs_num = sum(
        len(partners) for partners in hypothesis_info_dict['violations'].values()
    )
    total_pairs_num = support_pairs_num + violation_pairs_num

    # A hypothesis whose left-hand side never repeats has neither supports nor
    # violations; report a ratio of 0.0 for it instead of dividing by zero.
    if total_pairs_num:
        ratio = support_pairs_num / total_pairs_num
    else:
        ratio = 0.0
    hypothesis_support_violation_ratio_info[hypothesis] = ratio
pprint(hypothesis_support_violation_ratio_info)


{'(director) => rating': 0.7623363544813696,
 '(director) => title': 0.032369443245576175,
 '(director) => title, rating': 0.031937850668968495,
 '(director) => title, year': 0.031650122284563376,
 '(director) => title, year, rating': 0.03150625809236081,
 '(director) => year': 0.6180405697022011,
 '(director) => year, rating': 0.4865486980290606,
 '(director, rating) => title': 0.04189469711266277,
 '(director, rating) => title, year': 0.04132855255708624,
 '(director, rating) => year': 0.6382336289866013,
 '(director, rating, title) => year': 0.9864864864864865,
 '(director, title) => rating': 0.9866666666666667,
 '(director, title) => year': 0.9777777777777777,
 '(rating) => director': 0.004678983250493814,
 '(rating) => title': 0.0006816710831064776,
 '(rating) => title, director': 0.00019602458607466064,
 '(rating) => title, year': 0.0004653376435195773,
 '(rating) => title, year, director': 0.00019337560518175982,
 '(rating) => year': 0.5452061834277989,
 '(rating) => year, direc

In [151]:
# Sample each hypothesis' confidence uniformly between 0 and its
# support/violation ratio. The global NumPy seed is fixed first, and the
# hypotheses are visited in the ratio dict's order, one draw per hypothesis.
np.random.seed(1000)
model = {
    hypothesis: np.random.uniform(0, ratio)
    for hypothesis, ratio in hypothesis_support_violation_ratio_info.items()
}
model_dict = {'model': model}
# Alternative without sampling: use the ratio itself as the confidence.
# model = dict((hypothesis, ratio ) for hypothesis, ratio in hypothesis_support_violation_ratio_info.items())

pprint(model)

{'(director) => rating': 0.3035660592438397,
 '(director) => title': 0.001471124619013116,
 '(director) => title, rating': 0.023747050789736022,
 '(director) => title, year': 0.02236673610351559,
 '(director) => title, year, rating': 0.0011060554814196738,
 '(director) => year': 0.11186998476993641,
 '(director) => year, rating': 0.10075564504617725,
 '(director, rating) => title': 0.018119421803703906,
 '(director, rating) => title, year': 0.018125582971772414,
 '(director, rating) => year': 0.6080096920489252,
 '(director, rating, title) => year': 0.8435034119804025,
 '(director, title) => rating': 0.12651256045826575,
 '(director, title) => year': 0.5417741882709632,
 '(rating) => director': 0.00034213190468242166,
 '(rating) => title': 0.00016675665837643453,
 '(rating) => title, director': 3.5565138100533445e-06,
 '(rating) => title, year': 0.00015804619935329286,
 '(rating) => title, year, director': 2.223953722678619e-05,
 '(rating) => year': 0.41745510295349525,
 '(rating) => y

In [152]:
# Write the sampled model (hypothesis -> confidence, under the 'model' key)
# to the trainer's model file.
with open("./trainer/trainer_model.json", 'w') as fp:
    json.dump(model_dict, fp)

### Predict a tuple using a model

In [153]:
def predict_clean_tuple(idx, model, hypothesis_space_info=None):
    """Score one tuple with a confidence-weighted vote over all hypotheses.

    Each hypothesis contributes ``conf * (#supports - #violations)`` for the
    tuple; the tuple is predicted clean when the summed score is positive.

    Args:
        idx: Key of the tuple in the ``'supports'`` / ``'violations'`` dicts.
            It must have the same type as those keys; a key that is not found
            counts as zero supports and zero violations.
        model: Mapping of hypothesis string to its confidence.
        hypothesis_space_info: Optional mapping of hypothesis string to a dict
            with ``'supports'`` and ``'violations'`` lookups. Defaults to the
            notebook-level ``new_scenarios_dict['omdb']['hypothesis_space']``,
            which was the only source before this parameter existed.

    Returns:
        ``(is_clean, total_score)``.

    Raises:
        KeyError: If ``model`` names a hypothesis missing from the lookup.
    """
    if hypothesis_space_info is None:
        hypothesis_space_info = new_scenarios_dict['omdb']['hypothesis_space']

    # Accumulate with an explicit loop so the floating-point result keeps the
    # plain left-to-right addition order.
    total_score = 0
    for hypothesis, conf in model.items():
        hypothesis_info = hypothesis_space_info[hypothesis]

        # Number of supporting / violating partners recorded for this tuple.
        support_pairs_num = len(hypothesis_info['supports'].get(idx, []))
        violation_pairs_num = len(hypothesis_info['violations'].get(idx, []))

        # Vote according to the hypothesis confidence.
        total_score += conf * (support_pairs_num - violation_pairs_num)

    return bool(total_score > 0), total_score

In [154]:
# Score every tuple once, then split the (is_clean, score) results into the
# set of tuples predicted clean and the per-tuple score lookup.
tuple_predictions = {
    idx: predict_clean_tuple(idx, model)
    for idx in new_scenarios_dict['omdb']['data_indices']
}
clean_tuple_indices = {
    idx for idx, (is_clean, _) in tuple_predictions.items() if is_clean
}
model_score_dict = {
    idx: total_score for idx, (_, total_score) in tuple_predictions.items()
}


In [155]:
# Number of tuples the model predicts as clean.
len(clean_tuple_indices)

1154

In [160]:
# Record, for every tuple index, whether the model predicted it clean.
model_dict['predictions'] = {
    idx: idx in clean_tuple_indices
    for idx in new_scenarios_dict['omdb']['data_indices']
}

In [161]:
# Overwrite the trainer's model file; model_dict now also carries the
# per-tuple 'predictions' next to the 'model' confidences.
with open("./trainer/trainer_model.json", 'w') as fp:
    json.dump(model_dict, fp)