In [25]:
import os
import sys
import inspect
import networkx as nx
import pandas as pd

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir) 



In [14]:
def get_precision_recall(G_truth, T_inferred):
    g_edge_set = set([frozenset((v1, v2)) for v1, v2 in G_truth.edges])
    t_edge_set = set([frozenset((v1, v2)) for v1, v2 in T_inferred.edges])

    correct = g_edge_set.intersection(t_edge_set)

    to_add = g_edge_set - t_edge_set
    to_remove = t_edge_set - g_edge_set

    try:
        precision = float(len(correct)) / len(t_edge_set)
        recall = float(len(correct)) / len(g_edge_set)
        f1 = 2 * ((precision * recall) / (precision + recall))
    except ZeroDivisionError as e:
        precision = 0.0
        recall = 0.0
        f1 = 0.0

    return {'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'correct_edges': correct,
            'to_add': to_add,
            'to_remove': to_remove}


In [17]:
g_truth_file='/home/suhail/Scratch/mixed_results/combined_all_gt_fixed.pkl'
inferred_graph='/home/suhail/Scratch/mixed_results/inferred_graph.csv'
g_truth = nx.read_gpickle(g_truth_file)
g_inferred = nx.read_edgelist(inferred_graph)

In [19]:
results=get_precision_recall(g_truth, g_inferred)
results['F1']

0.6055913978494624

In [32]:
fp_edges = results['to_remove']

all_fp_edge_data = []
for u,v in fp_edges:
    wf1, wf2 = u.split('_')[0], v.split('_')[0]
    u1, v1 = u.split('_')[1], v.split('_')[1]
    #print(wf1, wf2, g_inferred[u][v])
    e_dict = {
        'wf1': wf1,
        'wf2': wf2,
        'src': u1,
        'dst': v1,
        'edge': frozenset([u,v])
    }
    e_dict.update(g_inferred[u][v])
    all_fp_edge_data.append(e_dict)
    
fp_df = pd.DataFrame(all_fp_edge_data)

In [33]:
fp_df['cross_wf'] = fp_df['wf1'] != fp_df['wf2']
fp_df

Unnamed: 0,wf1,wf2,src,dst,edge,weight,type,num,cross_wf
0,34,92,000.csv,004.csv,"(34_000.csv, 92_004.csv)",1.000000,groupby,1071,True
1,17,17,009.csv,011.csv,"(17_009.csv, 17_011.csv)",1.000000,join,444,False
2,42,10,004.csv,008.csv,"(42_004.csv, 10_008.csv)",337.000000,containment,999,True
3,14,66,002.csv,002.csv,"(14_002.csv, 66_002.csv)",18.000000,jaccard,987,True
4,36,36,000.csv,006.csv,"(36_000.csv, 36_006.csv)",1.000000,join,443,False
...,...,...,...,...,...,...,...,...,...
599,40,40,004.csv,001.csv,"(40_004.csv, 40_001.csv)",1.000000,join,553,False
600,95,95,002.csv,000.csv,"(95_002.csv, 95_000.csv)",1.000000,join,595,False
601,45,45,005.csv,002.csv,"(45_005.csv, 45_002.csv)",15180.000000,jaccard,719,False
602,46,32,004.csv,000.csv,"(46_004.csv, 32_000.csv)",0.117111,jaccard,994,True


In [36]:
fp_df['type'].value_counts()

join           314
jaccard        177
containment     68
pivot           25
groupby         20
Name: type, dtype: int64

In [38]:
fp_df['cross_wf'].value_counts()

False    553
True      51
Name: cross_wf, dtype: int64

In [42]:
fp_df.groupby('type').mean()

Unnamed: 0_level_0,weight,num,cross_wf
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
containment,1134.810871,829.808824,0.617647
groupby,0.320227,1059.15,0.35
jaccard,4716.563342,376.943503,0.011299
join,1.0,467.541401,0.0
pivot,0.0,1088.84,0.0
