In [1]:
import numpy as np
from statsmodels.stats import power as pwr
from scipy.stats import chi2_contingency, chi2 
from sklearn import metrics

import pandas as pd
import json
import pickle
import joblib

In [2]:
from collections import deque

In [3]:
def map_label(t, other_label=4):
    """Attach a ``new_labels`` column aligning predicted clusters with synthetic ones.

    Count-based matching: for every ``synthetic_label`` the predicted ``labels``
    value that occurs most often among non-noise rows becomes its match. Counts
    are used (rather than MSE, which can give a different mapping) because the
    concern here is maximum recovery.

    Every predicted label that is not matched to a synthetic label is collapsed
    into one catch-all cluster (``other_label``), since it is not considered a
    recovery. This is used for plotting graphs.

    Parameters
    ----------
    t : pandas.DataFrame
        Needs the columns ``synthetic_label``, ``labels`` and ``noise``.
    other_label : int, default 4
        Value written to ``new_labels`` for every unmatched predicted label.

    Returns
    -------
    pandas.DataFrame
        ``t`` inner-merged with the mapping on the columns both frames share
        (``labels`` when ``t`` has no ``new_labels`` column yet).

    Notes
    -----
    The unmatched labels are taken from the label values actually present in
    ``t``. Deriving them from ``range(0, nunique)`` silently dropped every row
    whose label fell outside ``0..nunique-1`` (non-contiguous ids, negative
    ids, ...) in the inner merge.

    NOTE(review): if two synthetic labels share the same most frequent
    predicted label, the mapping holds that label twice and the merge
    duplicates the affected rows.
    """
    clean = t[t.noise == False]
    pair_counts = (
        clean.groupby(['synthetic_label', 'labels'])
        .size()
        .reset_index(name='l')
        .sort_values(['synthetic_label', 'l'], ascending=[True, False])
    )
    # First row per synthetic label == its most frequent predicted label.
    best = pair_counts.groupby(['synthetic_label']).head(1)
    label_mapping = best[['synthetic_label', 'labels']].rename(
        columns={'synthetic_label': 'new_labels'})

    # Every observed predicted label without a match goes to the catch-all cluster.
    matched = set(label_mapping.labels.tolist())
    unmatched = [lab for lab in t.labels.dropna().unique() if lab not in matched]
    if unmatched:
        rest = pd.DataFrame({'new_labels': other_label, 'labels': unmatched})
        label_mapping = pd.concat([label_mapping, rest], axis=0)
    label_mapping = label_mapping.reset_index(drop=True)

    return pd.merge(t, label_mapping)



def map_label_by_mse(eval_df, t, other_label=4):
    """Attach ``new_labels`` to ``t`` using the minimum-MSE cluster matching.

    For every ``orig_clust`` in ``eval_df`` the row(s) with the smallest ``mse``
    define which ``pred_clust`` it is matched to. Every predicted label in ``t``
    that is not matched is collapsed into one catch-all cluster
    (``other_label``), since it is not considered a recovery. This is used for
    plotting graphs.

    Parameters
    ----------
    eval_df : pandas.DataFrame
        Needs the columns ``orig_clust``, ``pred_clust`` and ``mse``.
    t : pandas.DataFrame
        Needs the column ``labels`` (the predicted cluster id).
    other_label : int, default 4
        Value written to ``new_labels`` for every unmatched predicted label.

    Returns
    -------
    pandas.DataFrame
        ``t`` inner-merged with the mapping on the columns both frames share
        (``labels`` when ``t`` has no ``new_labels`` column yet).

    Notes
    -----
    The unmatched labels are taken from the label values actually present in
    ``t``. Deriving them from ``range(0, nunique)`` silently dropped every row
    whose label fell outside ``0..nunique-1`` in the inner merge.

    NOTE(review): ties on the minimum ``mse``, or two original clusters
    matched to the same predicted cluster, put a label into the mapping more
    than once and the merge then duplicates the affected rows.
    """
    is_best = eval_df.mse == eval_df.groupby(['orig_clust'])['mse'].transform('min')
    label_mapping = eval_df.loc[is_best, ['orig_clust', 'pred_clust']].rename(
        columns={'orig_clust': 'new_labels',
                 'pred_clust': 'labels'})

    # Every observed predicted label without a match goes to the catch-all cluster.
    matched = set(label_mapping.labels.tolist())
    unmatched = [lab for lab in t.labels.dropna().unique() if lab not in matched]
    if unmatched:
        rest = pd.DataFrame({'new_labels': other_label, 'labels': unmatched})
        label_mapping = pd.concat([label_mapping, rest], axis=0)
    label_mapping = label_mapping.reset_index(drop=True)

    return pd.merge(t, label_mapping)


def map_clusters(t):
    """Expand cluster sizes into one (true, recovered) label pair per row.

    The frame is de-duplicated on its identifying columns and counted per
    (``synthetic_label``, ``new_labels``, ``noise``). All noise rows are
    re-assigned to synthetic cluster 4; non-noise rows keep their synthetic
    label. Both label columns are rendered as strings prefixed with ``'C'`` and
    every count row is repeated as many times as its size.

    Returns a DataFrame with the columns ``synthetic_label`` and ``new_labels``
    whose index repeats the position of the originating count row.
    """
    id_cols = ['time', 'id', 'synthetic_label',
               'new_labels', 'noise', 'text', 'sentence_id']
    sizes = (
        t[id_cols]
        .drop_duplicates()
        .groupby(['synthetic_label', 'new_labels', 'noise'])
        .size()
        .reset_index(name='l')
    )

    # Noise: total size per recovered cluster, filed under synthetic cluster 4.
    noise_sizes = sizes[sizes['noise'] == True].groupby('new_labels')['l'].sum().reset_index()
    noise_sizes['synthetic_label'] = 4

    # Non-noise (clusters 1-3): keep the counts, the flag is no longer needed.
    clean_sizes = sizes[sizes['noise'] == False].drop(columns='noise')

    combined = pd.concat([clean_sizes, noise_sizes]).reset_index(drop=True)

    # Human-readable cluster names: 1 -> 'C1', ...
    for col in ('synthetic_label', 'new_labels'):
        combined[col] = 'C' + combined[col].astype(str)

    # One output row per counted item.
    expanded = combined.loc[combined.index.repeat(combined['l'])]
    return expanded.drop(columns='l')



def get_proportions(synthetic_data, recovered_data):
    """Mean share of each synthetic cluster that was recovered, averaged over time.

    For every (``synthetic_label``, ``time``) slice of the non-noise synthetic
    data, the proportion is

        (# de-duplicated non-noise recovered rows with
         ``synthetic_label == new_labels``) / (# de-duplicated non-noise rows
         of that slice in ``synthetic_data``)

    and the result is the mean of these proportions over ``time`` for each
    cluster.

    Slices of the synthetic data in which nothing was recovered count as a
    proportion of 0. An inner merge would drop them, which inflates the mean and
    removes clusters that were never recovered from the result entirely.

    Parameters
    ----------
    synthetic_data : pandas.DataFrame
        Needs ``time``, ``id``, ``synthetic_label``, ``noise``, ``text`` and
        ``sentence_id``.
    recovered_data : pandas.DataFrame
        Needs the same columns plus ``new_labels``.

    Returns
    -------
    pandas.Series
        Named ``prop``, indexed by cluster label (index name ``new_labels``).
    """
    # Size of every original (non-noise) cluster per time slice.
    original = synthetic_data[['time', 'id', 'synthetic_label', 'noise',
                               'text', 'sentence_id']].drop_duplicates()
    original_sizes = (
        original[original.noise == False]
        .groupby(['synthetic_label', 'time'])
        .size()
        .reset_index(name='n_original')
    )

    # Number of correctly recovered (non-noise) rows per time slice.
    recovered = recovered_data[['time', 'id', 'synthetic_label',
                                'new_labels', 'noise', 'text', 'sentence_id']].drop_duplicates()
    hits = recovered[(recovered.noise == False) &
                     (recovered.synthetic_label == recovered.new_labels)]
    hit_sizes = hits.groupby(['time', 'new_labels']).size().reset_index(name='n_recovered')

    # Left merge from the original sizes keeps slices without any hit.
    merged = pd.merge(original_sizes, hit_sizes, how='left',
                      left_on=['time', 'synthetic_label'],
                      right_on=['time', 'new_labels'])
    merged['n_recovered'] = merged['n_recovered'].fillna(0)
    merged['prop'] = merged['n_recovered'] / merged['n_original']

    result_prop = merged.groupby(['synthetic_label'])['prop'].mean()
    # Keep the index name callers already see in the output.
    result_prop.index.name = 'new_labels'

    return result_prop

### Precision/recall for CANarEx

In [4]:
# Directory holding the CANarEx run and the two files read from it below.
path = 'canarex_coref_text'
result_path, eval_path = ('{}/results.jsonl'.format(path),
                          '{}/eval_output.csv'.format(path))

# Suffix selecting which synthetic test-data file is loaded for this section.
ctype = 'coref_text'

In [5]:
# Synthetic test data (JSON Lines) for the variant selected by `ctype`.
df = pd.read_json(
    '{}/synthetic_test_data_{}.jsonl'.format('synthetic_data', ctype),
    orient='records',
    lines=True,
)

In [6]:
# Load the clustering results (JSON Lines) from `result_path`.
final = pd.read_json(result_path, lines = True)

In [7]:
eval_df = pd.read_csv(eval_path)
# Show, for every original cluster, the row(s) with the lowest MSE.
eval_df.loc[eval_df['mse'] == eval_df.groupby('orig_clust')['mse'].transform('min')]

Unnamed: 0,orig_clust,pred_clust,intercept,coef,mse,r_sq
3,1,3,-3.81289,1.274363,46.770132,0.998164
6,2,0,-3.792096,1.375623,79.376873,0.998461
9,3,2,-4.492334,1.343579,155.413557,0.985731


In [8]:
# Count-based matching: for each synthetic label keep the predicted label
# that occurs most often among non-noise rows (`l` is that count).
label_mapping = (
    final[final['noise'] == False]
    .groupby(['synthetic_label', 'labels'])
    .size()
    .reset_index(name='l')
    .sort_values(['synthetic_label', 'l'], ascending=[True, False])
    .groupby(['synthetic_label'])
    .head(1)
)

In [9]:
# Display the count-based mapping (one row per synthetic label).
label_mapping

Unnamed: 0,synthetic_label,labels,l
3,1,3,56737
4,2,0,75933
10,3,2,142879


In [10]:
# Add `new_labels` via the count-based mapping; `final` is rebound to the merged frame.
final = map_label(final)

In [11]:
# Expand cluster sizes into one ('C<true>', 'C<recovered>') label pair per counted row.
test_clust_count = map_clusters(final)

In [12]:
# Peek at the first expanded label pairs.
test_clust_count.head(5)

Unnamed: 0,synthetic_label,new_labels
0,C1,C1
0,C1,C1
0,C1,C1
0,C1,C1
0,C1,C1


In [13]:
# Confusion matrix: rows are the synthetic (true) clusters, columns the recovered ones.
print(metrics.confusion_matrix(y_true=test_clust_count['synthetic_label'].tolist(),
                               y_pred=test_clust_count['new_labels'].tolist()))

[[33501   182   430  3735]
 [  978 53681   139 10089]
 [  609   373 93769 15893]
 [  651  1208  2732 72915]]


In [14]:
# Per-cluster precision / recall / F1 of the recovered labels against the synthetic ones.
print(metrics.classification_report(y_true=test_clust_count['synthetic_label'].tolist(),
                                    y_pred=test_clust_count['new_labels'].tolist()))

              precision    recall  f1-score   support

          C1       0.94      0.89      0.91     37848
          C2       0.97      0.83      0.89     64887
          C3       0.97      0.85      0.90    110644
          C4       0.71      0.94      0.81     77506

    accuracy                           0.87    290885
   macro avg       0.90      0.88      0.88    290885
weighted avg       0.89      0.87      0.88    290885



#### Proportion recovered

In [15]:
# Per-cluster share of the synthetic (non-noise) rows recovered under the matching label, averaged over time.
canarex_prop = get_proportions(df, final)

### Precision/recall for Relatio

In [16]:
# Directory holding the Relatio run. `path` itself keeps its trailing slash.
path='relatio_test/clust_100/'

# Strip the trailing '/' before joining so the file paths do not contain '//'.
result_path = '{}/results.jsonl'.format(path.rstrip('/'))
eval_path = '{}/eval_output.csv'.format(path.rstrip('/'))

# Suffix selecting which synthetic test-data file is loaded for this section.
ctype='true_text'

In [17]:
# Synthetic test data (JSON Lines) for the variant selected by `ctype`.
df = pd.read_json(
    '{}/synthetic_test_data_{}.jsonl'.format('synthetic_data', ctype),
    orient='records',
    lines=True,
)

In [18]:
# Load the Relatio clustering results (JSON Lines) from `result_path`.
final = pd.read_json(result_path, lines = True)

In [19]:
# Add `new_labels` via the count-based mapping; `final` is rebound to the merged frame.
final = map_label(final)

In [20]:
# Expand cluster sizes into one ('C<true>', 'C<recovered>') label pair per counted row.
test_clust_count = map_clusters(final)

In [21]:
# Confusion matrix: rows are the synthetic (true) clusters, columns the recovered ones.
print(metrics.confusion_matrix(y_true=test_clust_count['synthetic_label'].tolist(),
                               y_pred=test_clust_count['new_labels'].tolist()))

[[10818  2810   426  6389]
 [ 1659 14157   335 19917]
 [  882  9198 49875  7463]
 [ 2013 24785  2290 13249]]


In [22]:
# Per-cluster precision / recall / F1 of the recovered labels against the synthetic ones.
print(metrics.classification_report(y_true=test_clust_count['synthetic_label'].tolist(),
                                    y_pred=test_clust_count['new_labels'].tolist()))

              precision    recall  f1-score   support

          C1       0.70      0.53      0.60     20443
          C2       0.28      0.39      0.33     36068
          C3       0.94      0.74      0.83     67418
          C4       0.28      0.31      0.30     42337

    accuracy                           0.53    166266
   macro avg       0.55      0.49      0.51    166266
weighted avg       0.60      0.53      0.56    166266



#### Proportion recovered

In [23]:
# Per-cluster share of the synthetic (non-noise) rows recovered under the matching label, averaged over time.
relatio_prop = get_proportions(df, final)

#### MSE-based label mapping

In [24]:
# Note: the minimum-MSE mapping shown here differs from the count-based mapping.
eval_df = pd.read_csv(eval_path)
# Show, for every original cluster, the row(s) with the lowest MSE.
eval_df.loc[eval_df['mse'] == eval_df.groupby('orig_clust')['mse'].transform('min')]

Unnamed: 0,orig_clust,pred_clust,intercept,coef,mse,r_sq
5,1,5,-19.232467,3.20669,471.754359,0.977219
10,2,4,-43.677397,5.831577,1444.872357,0.96783
14,3,2,4.528892,2.235056,580.683972,0.936984


In [25]:
# Re-load the raw results: `final` was rebound to the count-mapped frame earlier in this section.
final = pd.read_json(result_path, lines = True)

In [26]:
# Add `new_labels` using the minimum-MSE mapping from `eval_df` instead of the count-based one.
final = map_label_by_mse(eval_df, final)

In [27]:
# Expand cluster sizes into one ('C<true>', 'C<recovered>') label pair per counted row.
test_clust_count = map_clusters(final)

In [28]:
# Confusion matrix: rows are the synthetic (true) clusters, columns the recovered ones.
print(metrics.confusion_matrix(y_true=test_clust_count['synthetic_label'].tolist(),
                               y_pred=test_clust_count['new_labels'].tolist()))

[[10818   512   426  8124]
 [ 1659 11148   335 22644]
 [  882  1151 49875 15198]
 [ 2013  3014  2290 33602]]


In [29]:
# Per-cluster precision / recall / F1 of the recovered labels against the synthetic ones.
print(metrics.classification_report(y_true=test_clust_count['synthetic_label'].tolist(),
                                    y_pred=test_clust_count['new_labels'].tolist()))

              precision    recall  f1-score   support

          C1       0.70      0.54      0.61     19880
          C2       0.70      0.31      0.43     35786
          C3       0.94      0.74      0.83     67106
          C4       0.42      0.82      0.56     40919

    accuracy                           0.64    163691
   macro avg       0.69      0.61      0.61    163691
weighted avg       0.73      0.64      0.65    163691



### A/B testing: recovered proportions, CANarEx vs. Relatio

In [30]:
# Per-cluster mean recovered proportion for CANarEx (from `get_proportions`).
canarex_prop

new_labels
1    0.766905
2    0.721745
3    0.732509
Name: prop, dtype: float64

In [31]:
# Per-cluster mean recovered proportion for Relatio (from `get_proportions`).
relatio_prop

new_labels
1    0.270610
2    0.200972
3    0.415421
Name: prop, dtype: float64

In [32]:
from scipy.stats import ttest_ind

# Independent two-sample t-test on the per-cluster recovered proportions
# (three values per method). Called with scipy's defaults, i.e. the
# equal-variance (Student) form.
# NOTE(review): both series are indexed by the same clusters 1-3, so a paired
# test may be the better fit — confirm the intended design.
ttest_ind(canarex_prop, 
      relatio_prop)

Ttest_indResult(statistic=6.882829433158639, pvalue=0.0023351285628415482)