# Functions

In [2]:
import networkx as nx
import math
import matplotlib.pyplot as plt
from networkx.algorithms import tree
import pandas as pd
import glob
import os

from lineage import similarity


In [3]:
# Load a directory and return filename:df pairs
def build_df_dict_dir(nb_dir):
    dataset = {}
    for file in glob.glob(nb_dir+'*.csv'):
        csvfile = os.path.basename(file)
        try:
            dataset[csvfile] = pd.read_csv(file, index_col=0)
        except (pd.parser.CParserError, UnicodeDecodeError) as e:
            # Star Wars: encoding="ISO-8859-1"
            #df = pd.read_csv(
            #"http://math-info.hse.ru/f/2015-16/all-py/data/tariff2012.csv",
            #sep=';')
            if(csvfile == 'StarWars.csv'):
                dataset[csvfile] = pd.read_csv(file, encoding="ISO-8859-1", index_col=0)
            elif(csvfile == 'tariff2012.csv'):
                dataset[csvfile] = pd.read_csv(file, sep=";", index_col=0)
            else:
                print("Error reading file:", file)
            
    return dataset

In [4]:
def check_csv_graph(artifact_dir, g_truth):
    missing_files = []
    for node in g_truth.nodes():
        if not os.path.exists(artifact_dir+node):
            print("Missing File: "+artifact_dir+node)
            missing_files.append(node)
    return missing_files

In [26]:
import os
from lineage import graphs, similarity
import pandas as pd

def lineage_inference(wf_dir, pre_cluster='no_pre_clustering', index=True, threshold=0.0001):
    
    nb_name = os.path.basename(wf_dir)
    if index:
        artifact_dir = wf_dir+'/artifacts/'
    else:
        artifact_dir = wf_dir+'/artifacts_1/'
    
    result_dir = wf_dir+'/inferred/'
    os.makedirs(result_dir, exist_ok=True)
    schema_file = result_dir+'schema_matching.csv'
    row_file = result_dir+'row_matching.csv'
    cluster_file = result_dir+'clusters.csv'
    
    dataset = build_df_dict_dir(artifact_dir)
    
    # Run the inference
    pairwise_jaccard = similarity.get_pairwise_similarity(dataset, similarity.compute_jaccard_DF, threshold=threshold)
    pw_jaccard_graph = graphs.generate_pairwise_graph(pairwise_jaccard)
    nx.to_pandas_adjacency(pw_jaccard_graph,weight='weight').to_csv(
                                                result_dir+'cell_sim.csv')
    
    g_inferred = graphs.generate_spanning_tree(pw_jaccard_graph)
    nx.write_edgelist(g_inferred,result_dir+'infered_mst_cell.csv',data=True)

      
    # Load Ground Truth:
    g_truth = nx.read_gpickle(wf_dir+'/'+nb_name+'_gt.pkl')

    missing_files = check_csv_graph(artifact_dir, g_truth)
    
    pr_df = pd.DataFrame(columns = ['nb_name', 'index', 'preclustering', 'distance_metric',
                                        'edges_correct', 'edges_missing', 'edges_to_remove', 
                                        'precision', 'recall', 'F1', 'missing_files' ])
    
    result = graphs.get_precision_recall(g_truth,g_inferred)

    pr_df = pr_df.append({
            'nb_name': nb_name,
            'index': index,
            'preclustering': pre_cluster,
            'distance_metric': 'pandas_cell',
            'edges_correct': len(result['correct_edges']),
            'edges_missing': len(result['to_add']),
            'edges_to_remove': len(result['to_remove']),
            'precision': result['Precision'],
            'recall': result['Recall'],
            'F1': result['F1'],
            'missing_files': len(missing_files)
        }, ignore_index=True)
    
    return pr_df

In [161]:
base_dir = '/media/suhail/Data/experiments/reexec/res/'
sample_wf = base_dir+'nb_484354.ipynb'
artifact_dir = sample_wf+'/artifacts/'
dataset = build_df_dict_dir(artifact_dir)

In [6]:
lineage_inference(sample_wf)

HBox(children=(IntProgress(value=0, description='graph pairs', max=253), HTML(value='')))



FileNotFoundError: [Errno 2] No such file or directory: '/home/suhail/Projects/relic/primitives/python/generator/dataset/dataset_gt.pkl'

In [10]:
sample_wf = '/home/suhail/Projects/relic/primitives/python/generator/dataset'
lineage_inference(sample_wf)

HBox(children=(IntProgress(value=0, description='graph pairs', max=253), HTML(value='')))



Unnamed: 0,nb_name,index,preclustering,distance_metric,edges_correct,edges_missing,edges_to_remove,precision,recall,F1,missing_files
0,dataset,True,no_pre_clustering,pandas_cell,15,7,6,0.681818,0.714286,0.697674,0


# Single Notebook Test

In [18]:
pre_cluster_types = ['no_pre_cluster']
index_types = [True]

# Run for Multiple Notebooks

In [22]:
nb_list = [
    'nb_331056',
    'nb_23457',
    # nb_336256', #
    'nb_33614',
    # 'nb_650868', #
    'nb_316514',
    'nb_386796',
    'nb_266913',
    'nb_417011',
    'nb_269991',
    'nb_495072',
    'nb_315236',
    'nb_484354',
    'nb_772851',
    #'nb_924102',
    #'nb_921915',
    'nb_986282',
    # 'nb_582525', #
    'nb_639263',
]

fakerdir = '/home/suhail/Projects/relic/primitives/python/generator/dataset/'

nb_list = [d for d in os.listdir(fakerdir) if os.path.isdir(os.path.join(fakerdir, d))]
nb_list

['20190802-112317',
 '20190802-112309',
 '20190802-112248',
 '20190802-112314',
 '20190802-112250',
 '20190802-112311',
 '20190802-112245']

In [165]:
from tqdm import tqdm_notebook, tqdm
import glob

all_pr_df = pd.DataFrame(columns = ['nb_name', 'index', 'preclustering', 'distance_metric',
                                        'edges_correct', 'edges_missing', 'edges_to_remove', 
                                        'precision', 'recall', 'F1','missing_files' ])

errors = []

for nb in tqdm_notebook(nb_list, desc='notebook', leave=True):
    nb_dir = ('/media/suhail/Data/experiments/reexec/res/'+nb+'.ipynb')
    # print('Processing:', nb_dir)
    os.makedirs(nb_dir+'/inferred', exist_ok=True)

    for cluster in tqdm_notebook(pre_cluster_types, desc='cluster', leave=False):
        for index in tqdm_notebook(index_types,  desc='index', leave=False):
            files = glob.glob(nb_dir+'/inferred/*')
            for f in files:
                os.remove(f)
            try:
                result_df = lineage_inference(nb_dir, index=index, pre_cluster=cluster)
                all_pr_df = pd.concat([all_pr_df, result_df],ignore_index=True)
            except FileNotFoundError as e:
                errors.append((nb_dir, cluster,index))
                pass
                
                

HBox(children=(IntProgress(value=0, description='notebook', max=14), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=55), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=210), HTML(value='')))

  return this.join(other, how=how, return_indexers=return_indexers)
  return this.join(other, how=how, return_indexers=return_indexers)


HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=28), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=28), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=45), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=36), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=190), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', description='graph pairs', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=28), HTML(value='')))

HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=28), HTML(value='')))

In [36]:
from tqdm import tqdm_notebook, tqdm
import glob

all_pr_df = pd.DataFrame(columns = ['nb_name', 'index', 'preclustering', 'distance_metric',
                                        'edges_correct', 'edges_missing', 'edges_to_remove', 
                                        'precision', 'recall', 'F1','missing_files' ])

errors = []
threshold = 0.01

for nb in tqdm_notebook(nb_list, desc='notebook', leave=True):
    nb_dir = (fakerdir+nb)
    print('Processing:', nb_dir)
    os.makedirs(nb_dir+'/inferred', exist_ok=True)

    for cluster in tqdm_notebook(pre_cluster_types, desc='cluster', leave=False):
        for index in tqdm_notebook(index_types,  desc='index', leave=False):
            files = glob.glob(nb_dir+'/inferred/*')
            for f in files:
                os.remove(f)
            try:
                result_df = lineage_inference(nb_dir, index=index, pre_cluster=cluster, threshold=threshold)
                all_pr_df = pd.concat([all_pr_df, result_df],ignore_index=True)
            except FileNotFoundError as e:
                errors.append((nb_dir, cluster,index))
                pass
                
                

HBox(children=(IntProgress(value=0, description='notebook', max=7), HTML(value='')))

Processing: /home/suhail/Projects/relic/primitives/python/generator/dataset/20190802-112317


HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=1830), HTML(value='')))

  return this.join(other, how=how, return_indexers=return_indexers)
  return this.join(other, how=how, return_indexers=return_indexers)
  sort=self.sort)
  sort=self.sort)


Processing: /home/suhail/Projects/relic/primitives/python/generator/dataset/20190802-112309


HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=1485), HTML(value='')))

Processing: /home/suhail/Projects/relic/primitives/python/generator/dataset/20190802-112248


HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=210), HTML(value='')))

Processing: /home/suhail/Projects/relic/primitives/python/generator/dataset/20190802-112314


HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=1485), HTML(value='')))

Processing: /home/suhail/Projects/relic/primitives/python/generator/dataset/20190802-112250


HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=276), HTML(value='')))

Processing: /home/suhail/Projects/relic/primitives/python/generator/dataset/20190802-112311


HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=1653), HTML(value='')))

Processing: /home/suhail/Projects/relic/primitives/python/generator/dataset/20190802-112245


HBox(children=(IntProgress(value=0, description='cluster', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='index', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='graph pairs', max=276), HTML(value='')))




In [34]:
errors

[]

In [35]:
all_pr_df

Unnamed: 0,nb_name,index,preclustering,distance_metric,edges_correct,edges_missing,edges_to_remove,precision,recall,F1,missing_files
0,20190802-112317,True,no_pre_cluster,pandas_cell,39,21,15,0.65,0.722222,0.684211,0
1,20190802-112309,True,no_pre_cluster,pandas_cell,33,21,14,0.611111,0.702128,0.653465,0
2,20190802-112248,True,no_pre_cluster,pandas_cell,11,9,6,0.55,0.647059,0.594595,0
3,20190802-112314,True,no_pre_cluster,pandas_cell,33,21,18,0.611111,0.647059,0.628571,0
4,20190802-112250,True,no_pre_cluster,pandas_cell,14,9,5,0.608696,0.736842,0.666667,0
5,20190802-112311,True,no_pre_cluster,pandas_cell,30,27,18,0.526316,0.625,0.571429,0
6,20190802-112245,True,no_pre_cluster,pandas_cell,14,9,5,0.608696,0.736842,0.666667,0


In [149]:
all_pr_df.sort_values('F1', ascending=False)

Unnamed: 0,nb_name,index,preclustering,distance_metric,edges_correct,edges_missing,edges_to_remove,precision,recall,F1,missing_files
4,nb_266913.ipynb,True,no_pre_cluster,pandas_cell,5,1,1,0.833333,0.833333,0.833333,0
1,nb_23457.ipynb,True,no_pre_cluster,pandas_cell,4,1,1,0.8,0.8,0.8,0
7,nb_495072.ipynb,True,no_pre_cluster,pandas_cell,5,4,1,0.555556,0.833333,0.666667,0
5,nb_417011.ipynb,True,no_pre_cluster,pandas_cell,4,3,1,0.571429,0.8,0.666667,0
8,nb_315236.ipynb,True,no_pre_cluster,pandas_cell,5,3,2,0.625,0.714286,0.666667,0
2,nb_33614.ipynb,True,no_pre_cluster,pandas_cell,9,12,3,0.428571,0.75,0.545455,0
6,nb_269991.ipynb,True,no_pre_cluster,pandas_cell,2,4,2,0.333333,0.5,0.4,0
0,nb_331056.ipynb,True,no_pre_cluster,pandas_cell,4,7,6,0.363636,0.4,0.380952,0
10,nb_986282.ipynb,True,no_pre_cluster,pandas_cell,2,4,4,0.333333,0.333333,0.333333,0
3,nb_386796.ipynb,True,no_pre_cluster,pandas_cell,1,5,2,0.166667,0.333333,0.222222,0


In [78]:
nonindexed_cell = all_pr_df.loc[(all_pr_df.distance_metric == 'cell')
                        & (all_pr_df['index'] == False)]
nonindexed_col = all_pr_df.loc[(all_pr_df.distance_metric == 'col')
                        & (all_pr_df['index'] == False)]
#nonindexed_cell.to_excel('results_noindex_cell.xlsx')
#nonindexed_col.to_excel('results_noindex_col.xlsx')

In [77]:
#all_pr_df.to_excel('results.xlsx')

In [None]:
# CommandLine Debugging
'''
/home/suhail/Projects/relic/primitives/cpp/src/pre_clustering/pre_clustering -partial_schema -result /media/suhail/Data/experiments/results/ok/nb_639263.ipynb/inferred/ -schema_file /media/suhail/Data/experiments/results/ok/nb_639263.ipynb/inferred/schema_matching.csv
'''

In [39]:
all_pr_df

Unnamed: 0,nb_name,index,preclustering,distance_metric,edges_correct,edges_missing,edges_to_remove,precision,recall,F1
0,nb_331056.ipynb,True,no_pre_cluster,pandas_cell,5,7,5,0.416667,0.5,0.454545
1,nb_331056.ipynb,False,no_pre_cluster,pandas_cell,3,9,3,0.25,0.5,0.333333
2,nb_23457.ipynb,True,no_pre_cluster,pandas_cell,4,1,1,0.8,0.8,0.8
3,nb_23457.ipynb,False,no_pre_cluster,pandas_cell,4,1,1,0.8,0.8,0.8
4,nb_33614.ipynb,True,no_pre_cluster,pandas_cell,10,11,3,0.47619,0.769231,0.588235
5,nb_33614.ipynb,False,no_pre_cluster,pandas_cell,7,14,1,0.333333,0.875,0.482759
6,nb_316514.ipynb,True,no_pre_cluster,pandas_cell,3,5,1,0.375,0.75,0.5
7,nb_316514.ipynb,False,no_pre_cluster,pandas_cell,1,7,2,0.125,0.333333,0.181818
8,nb_386796.ipynb,True,no_pre_cluster,pandas_cell,1,5,1,0.166667,0.5,0.25
9,nb_386796.ipynb,False,no_pre_cluster,pandas_cell,1,5,0,0.166667,1.0,0.285714


In [40]:
groups = all_pr_df.groupby(pd.cut(all_pr_df.index, range(0,len(all_pr_df), 8)))
groups.max()

Unnamed: 0,nb_name,index,preclustering,distance_metric,edges_correct,edges_missing,edges_to_remove,precision,recall,F1
"(0, 8]",nb_386796.ipynb,True,no_pre_cluster,pandas_cell,10,14,3,0.8,0.875,0.8
"(8, 16]",nb_495072.ipynb,True,no_pre_cluster,pandas_cell,5,5,2,0.833333,1.0,0.833333
"(16, 24]",nb_986282.ipynb,True,no_pre_cluster,pandas_cell,4,22,10,0.5,0.8,0.615385


In [41]:
best_f1 = all_pr_df.groupby(['nb_name'], sort=False)['F1'].max().to_frame()
best_f1

Unnamed: 0_level_0,F1
nb_name,Unnamed: 1_level_1
nb_331056.ipynb,0.454545
nb_23457.ipynb,0.8
nb_33614.ipynb,0.588235
nb_316514.ipynb,0.5
nb_386796.ipynb,0.285714
nb_266913.ipynb,0.833333
nb_417011.ipynb,0.666667
nb_269991.ipynb,0.444444
nb_495072.ipynb,0.666667
nb_315236.ipynb,0.615385


In [43]:
idx = all_pr_df.groupby(['nb_name'])['F1'].transform(max) == all_pr_df['F1']
max_filter = all_pr_df.loc[idx]


In [44]:
max_filter = max_filter.loc[max_filter['F1'] != 0]
len(max_filter)

18

In [45]:
top_nb_scores = max_filter.drop(['index','preclustering','distance_metric'],axis=1)
top_nb_scores = top_nb_scores.set_index('nb_name')
top_nb_scores = top_nb_scores.drop_duplicates()
top_nb_scores

Unnamed: 0_level_0,edges_correct,edges_missing,edges_to_remove,precision,recall,F1
nb_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
nb_331056.ipynb,5,7,5,0.416667,0.5,0.454545
nb_23457.ipynb,4,1,1,0.8,0.8,0.8
nb_33614.ipynb,10,11,3,0.47619,0.769231,0.588235
nb_316514.ipynb,3,5,1,0.375,0.75,0.5
nb_386796.ipynb,1,5,0,0.166667,1.0,0.285714
nb_266913.ipynb,5,1,1,0.833333,0.833333,0.833333
nb_417011.ipynb,4,3,1,0.571429,0.8,0.666667
nb_269991.ipynb,2,4,1,0.333333,0.666667,0.444444
nb_495072.ipynb,5,4,1,0.555556,0.833333,0.666667
nb_315236.ipynb,4,4,1,0.5,0.8,0.615385


In [50]:
max_filter['index'].value_counts()

True     11
False     7
Name: index, dtype: int64