# Scoring function Scratchpad

Features
1. Number of common columns.
2. Average, minimum and maximum value-overlap ratios of the common columns.
3. Maximum coherent overlap of the value lattice of the common columns, as a ratio.
4. Number of columns that exhibit coherent overlap when feature 3 is above a certain threshold.
5. Number of columns that have unique values in one version but non-unique values in the counterpart version.
6. Contraction ratio of the number of rows in the original to the grouped column attributes.


In [66]:
# Imports and Environmental Setup

import sys
sys.path.insert(0, '..')

import warnings
from tqdm import TqdmExperimentalWarning
warnings.simplefilter('ignore', TqdmExperimentalWarning)

from tqdm.autonotebook import tqdm


import matplotlib
%matplotlib inline

import ipywidgets 
import agglomerative

import numpy as np
import pandas as pd

from lineage import similarity, graphs

import itertools



In [29]:
# Location of the notebook's data, taken from the agglomerative module's settings.
base_dir, nb_name = agglomerative.BASE_DIR, agglomerative.NB_NAME

# Load the '*.csv' files from the notebook's artifacts directory.
# NOTE(review): df_dict is later indexed by file name (e.g. 'crimes.csv'),
# so this presumably returns a {file name: DataFrame} mapping -- confirm.
df_dict = similarity.load_dataset_dir(base_dir + nb_name + '/artifacts/', '*.csv')

In [30]:
def common_columns(df1,df2):
    """Return the set of column labels that appear in both ``df1`` and ``df2``."""
    left_labels = set(df1)
    right_labels = set(df2)
    return left_labels & right_labels

def get_col_valset(df):
    """Yield ``(column label, set of distinct values)`` for each column of ``df``.

    Column labels are de-duplicated through a set first, so the order in
    which the pairs are produced is not defined.
    """
    distinct_labels = set(df)
    for label in distinct_labels:
        column_values = df[label].values
        yield label, set(column_values)

In [56]:
def compute_df_pair_features(df1,df2, df1_value_set_dict=None, df2_value_set_dict=None):
    """Compute similarity features for a pair of dataframes.

    Parameters
    ----------
    df1, df2
        The two dataframes to compare.
    df1_value_set_dict, df2_value_set_dict
        Optional precomputed ``{column: set of values}`` mappings for the
        respective dataframe.  When omitted (or empty) they are built with
        ``get_col_valset``.

    Returns
    -------
    dict
        Always contains ``'common_cols'``, ``'cell_jaccard'``,
        ``'col_jaccard'`` and ``'unique_cols'``.  The
        ``'max/avg/min_containment'`` keys are present only when the frames
        share at least one column; the ``'max/avg/min_contraction'`` keys are
        present only when at least one contraction ratio could be computed.
    """
    common_cols = set(df1).intersection(set(df2))

    # Falsy check kept on purpose: callers may pass None or an empty mapping.
    if not df1_value_set_dict:
        df1_value_set_dict = dict(get_col_valset(df1))
    if not df2_value_set_dict:
        df2_value_set_dict = dict(get_col_valset(df2))

    cell_jaccard = similarity.compute_jaccard_DF(df1,df2)
    col_jaccard = similarity.compute_col_jaccard_DF(df1,df2) #TODO: Use value set

    containment_scores = [
        similarity.set_max_containment(df1_value_set_dict[col], df2_value_set_dict[col])
        for col in common_cols
    ]

    # Uniqueness computation: count common columns where the row counts
    # differ, at least one side holds only distinct values, and one side's
    # value set is contained in the other's.
    unique_cols = 0
    contraction_ratios = []
    for col in common_cols:
        df1_values = df1_value_set_dict[col]
        df2_values = df2_value_set_dict[col]
        df1_rows = len(df1[col])
        df2_rows = len(df2[col])

        if df1_rows == df2_rows:
            continue

        df1_unique = len(df1_values) == df1_rows
        df2_unique = len(df2_values) == df2_rows
        if not (df1_unique or df2_unique):
            continue

        if not (df1_values.issubset(df2_values) or df2_values.issubset(df1_values)):
            continue

        unique_cols += 1
        larger, smaller = max(df1_rows, df2_rows), min(df1_rows, df2_rows)
        # An empty column is trivially unique and a subset of anything, so it
        # reaches this point; the ratio is undefined there, so skip it rather
        # than raise ZeroDivisionError.
        if smaller:
            contraction_ratios.append(larger / smaller)

    result = {
        'common_cols' : len(common_cols),
        'cell_jaccard' : cell_jaccard,
        'col_jaccard' : col_jaccard,
        'unique_cols': unique_cols,
    }

    if containment_scores:
        result.update({
            'max_containment' : max(containment_scores),
            'avg_containment' : np.average(containment_scores),
            'min_containment' : min(containment_scores),
        })

    if contraction_ratios:
        result.update({
            'max_contraction' : max(contraction_ratios),
            'avg_contraction' : np.average(contraction_ratios),
            'min_contraction' : min(contraction_ratios),
        })

    return result

In [57]:
# Spot-check the feature dictionary on one pair of datasets.
compute_df_pair_features(df1=df_dict['crimes.csv'], df2=df_dict['crimeTypes.csv'])

{'common_cols': 6,
 'cell_jaccard': 3.7355053055381854e-07,
 'col_jaccard': 1.0,
 'unique_cols': 1,
 'max_containment': 1.0,
 'avg_containment': 0.28205128205128205,
 'min_containment': 0.0,
 'max_contraction': 34320.692307692305,
 'avg_contraction': 34320.692307692305,
 'min_contraction': 34320.692307692305}

In [71]:
def get_all_node_pair_scores(dataset, gt_graph):
    """Compute pair features for every unordered pair of datasets.

    Parameters
    ----------
    dataset
        Mapping from dataset name to dataframe.
    gt_graph
        Ground-truth graph; queried with ``has_edge(u, v)`` and
        ``gt_graph[u][v]['operation']``.

    Returns
    -------
    list of dict
        One feature dictionary per pair (see ``compute_df_pair_features``),
        extended with:

        * ``'source'`` / ``'dest'`` -- the two dataset names.  They follow the
          edge direction when the ground truth has an edge between them and
          the pair's iteration order otherwise.
        * ``'operation'`` -- the edge's ``'operation'`` attribute; present
          only when a ground-truth edge exists.
        * ``'ground_truth'`` -- whether the ground truth has an edge between
          the two datasets in either direction.
    """
    pairwise_scores = []
    # Materialised so that tqdm can report a total.
    pairs = list(itertools.combinations(dataset.keys(), 2))
    df_value_dicts = {}

    def value_sets(name):
        # Build each dataset's per-column value sets once and reuse them.
        if name not in df_value_dicts:
            df_value_dicts[name] = dict(get_col_valset(dataset[name]))
        return df_value_dicts[name]

    for d1, d2 in tqdm(pairs, desc='graph pairs', leave=False):
        result = compute_df_pair_features(dataset[d1], dataset[d2],
                                          df1_value_set_dict=value_sets(d1),
                                          df2_value_set_dict=value_sets(d2))

        # Always record which pair the row describes; previously rows without
        # a ground-truth edge carried no dataset names at all.
        result['source'] = d1
        result['dest'] = d2
        gt = False

        if gt_graph.has_edge(d1, d2):
            result['operation'] = gt_graph[d1][d2]['operation']
            gt = True
        elif gt_graph.has_edge(d2, d1):
            result['operation'] = gt_graph[d2][d1]['operation']
            # Follow the direction of the ground-truth edge.
            result['source'] = d2
            result['dest'] = d1
            gt = True

        result['ground_truth'] = gt
        pairwise_scores.append(result)

    return pairwise_scores

In [72]:
# Ground-truth graph for this notebook; used below to label dataset pairs.
gt_graph = graphs.get_graph(base_dir, nb_name)

In [73]:
# Score every pair of loaded datasets against the ground-truth graph.
data = get_all_node_pair_scores(dataset=df_dict, gt_graph=gt_graph)

HBox(children=(IntProgress(value=0, description='graph pairs', max=153), HTML(value='')))

In [76]:
# Tabulate the per-pair feature dictionaries, one row per pair.
pd.DataFrame(data=data)

Unnamed: 0,avg_containment,avg_contraction,cell_jaccard,col_jaccard,common_cols,dest,ground_truth,max_containment,max_contraction,min_containment,min_contraction,operation,source,unique_cols
0,0.282051,34320.692308,3.201862e-07,1.000000,6,crimeTypes.csv,False,1.000000,34320.692308,0.000000,34320.692308,,crimes__2.csv,1
1,0.230415,,6.403723e-07,0.184971,6,areaCrime__1.csv,False,0.969697,,0.000000,,,crimes__2.csv,0
2,0.309557,,9.245376e-06,1.000000,5,typeLoc__2.csv,False,1.000000,,0.000000,,,crimes__2.csv,0
3,0.277778,37180.750000,6.403723e-07,1.000000,6,crimesByMonth.csv,False,1.000000,37180.750000,0.000000,37180.750000,,crimes__2.csv,1
4,0.425849,,2.073205e-05,1.000000,6,typeLoc__1.csv,False,1.000000,,0.000000,,,crimes__2.csv,0
5,0.252525,,6.403723e-07,0.184971,6,areaCrime__3.csv,False,0.969697,,0.000000,,,crimes__2.csv,0
6,0.235465,2594.005814,5.603258e-07,1.000000,6,locationRecode__1.csv,False,1.000000,2594.005814,0.000000,2594.005814,,crimes__2.csv,1
7,1.000000,,8.571429e-01,1.000000,6,crimes.csv,True,1.000000,,1.000000,,apply,crimes__2.csv,0
8,0.252525,,6.403723e-07,0.184971,6,areaCrime__2.csv,False,0.969697,,0.000000,,,crimes__2.csv,0
9,0.425849,,2.369378e-05,1.000000,6,typeLoc.csv,False,1.000000,,0.000000,,,crimes__2.csv,0


In [1]:
def col_containment(df1, df2, colname, col2name=None):
    """Return the fraction of ``df2``'s distinct column values also found in ``df1``.

    Parameters
    ----------
    df1, df2
        Column-indexable containers (e.g. dataframes).
    colname
        Column to read from ``df1``.
    col2name
        Column to read from ``df2``; defaults to ``colname``.

    Returns
    -------
    float
        ``|values(df1) & values(df2)| / |values(df2)|``, or ``0.0`` when the
        ``df2`` column has no values (the ratio is undefined there).
    """
    if col2name is None:
        col2name = colname

    df1valset = set(df1[colname])
    df2valset = set(df2[col2name])

    # Guard the empty denominator instead of raising ZeroDivisionError.
    if not df2valset:
        return 0.0

    # The ratio needs the *size* of the intersection; a set cannot be divided.
    return len(df1valset.intersection(df2valset)) / len(df2valset)

In [2]:
# NOTE(review): col_containment also requires `df2` and `colname`; this call
# passes only the first dataframe and is incomplete as written.
col_containment(df_dict['crimes.csv'])

NameError: name 'df_dict' is not defined