In [2]:
import itertools
import sklearn
import pandas as pd
import json
import numpy as np
import ast
from tqdm.auto import tqdm
import statistics
import math
import csv
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import NearestCentroid
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.calibration import calibration_curve
from copy import deepcopy
import pprint
import ast
import statistics
import math
import time
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    mean_squared_error,
    r2_score,
    roc_auc_score,
    average_precision_score,
)
import matplotlib.pyplot as plt
from nltk import agreement
import scipy.stats as stats
import seaborn as sns
np.random.seed(1)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
embedding_path = '../data/raw/df_embeddings.csv'
submission_path = '../data/raw/df_submission_rating.csv'


df_embeddings = pd.read_csv(embedding_path)
df_embeddings = df_embeddings.T
df_embeddings.columns=df_embeddings.iloc[0]
df_embeddings = df_embeddings.iloc[1: , :]

# Bootstrap with NLP
Reference paper https://aclanthology.org/D12-1091.pdf

Getting embedding matches

In [4]:
embedding_path = '../data/raw/df_embeddings.csv'
submission_path = '../data/raw/df_submission_rating.csv'


df_embeddings = pd.read_csv(embedding_path)
df_embeddings = df_embeddings.T
df_embeddings.columns=df_embeddings.iloc[0]
df_embeddings = df_embeddings.iloc[1: , :]


tqdm.pandas()
def get_numpy (row):
  return ast.literal_eval(row.embedding)

df_embeddings['embedding'] = df_embeddings.apply(lambda x: get_numpy(x), axis =1)

df_submissions = pd.read_csv(submission_path)
df_submission_labels = df_submissions[['id','title','conf_year','keywords','AVG_rating']]

df_embeddings_2017 = df_embeddings.merge(df_submission_labels[df_submission_labels['conf_year']==2017], left_on='paper_id',right_on='id')
df_embeddings_2018 = df_embeddings.merge(df_submission_labels[df_submission_labels['conf_year']==2018], left_on='paper_id',right_on='id')
assert df_submission_labels[df_submission_labels['conf_year']==2017].shape[0] == df_embeddings_2017.shape[0]

#Section 3.3 Step 2 agglomerative clustering on cosine distance
x = np.array(df_embeddings_2017.embedding.tolist())
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.1, linkage="average", metric = 'cosine').fit(x)
df_embeddings_2017['agg_cluster'] = clustering.labels_.tolist()
x = np.array(df_embeddings_2017.embedding.tolist())
y = np.array(df_embeddings_2017.agg_cluster.tolist())
clf = NearestCentroid()
clf.fit(x, y)

assert df_embeddings_2018.shape[0] == df_submission_labels[df_submission_labels['conf_year']==2018].shape[0]

#Section 3.3. Step 3

#setting up KNN for 2018
neigh = NearestNeighbors( n_neighbors=10, metric = 'cosine', radius = 0.3)
non_anchor_embedding_2018 = np.array(df_embeddings_2018.embedding.to_list())
neigh.fit(non_anchor_embedding_2018)

#setting up closest centroid for anchor group 2017
anchor_embedding_2017 = np.array(df_embeddings_2017.embedding.tolist())
anchor_agg_clusters_2017 = np.array(df_embeddings_2017.agg_cluster.tolist())
clf = NearestCentroid()
clf.fit(anchor_embedding_2017, anchor_agg_clusters_2017)



#dictionary of all the agg clusters and the 10 KNN from 2018
dict_agg_cluster_matches ={}
tuning_param_cos_dist = 0.3
for cluster_id in np.unique(clustering.labels_):
    
    distances, indices = neigh.kneighbors([clf.centroids_[cluster_id]])
    df_anchor_embedding = pd.concat([pd.DataFrame(data = distances.T,columns =['cos_dist']),pd.DataFrame(indices.T,columns=['indices'])],axis=1)

    #get all the specified cosine distance 2018 papers
    #tuple of (dataframe of 2018 matched papers, cosine distances)
    dict_agg_cluster_matches[cluster_id] = (
        df_embeddings_2018.iloc[df_anchor_embedding[df_anchor_embedding['cos_dist']<= tuning_param_cos_dist].indices.to_list(), :],
        df_anchor_embedding[df_anchor_embedding['cos_dist']<= tuning_param_cos_dist].cos_dist.to_list()
    )


def lambda_get_2018_matches(row):
    #get embedding matches from 2018 papers
    #returning relevant information
    df_clustered_papers = dict_agg_cluster_matches[row.agg_cluster]
    
    lst_paper_titles = df_clustered_papers[0].title.tolist()
    lst_paper_ids = df_clustered_papers[0].paper_id.tolist()
    ls_paper_keywords = df_clustered_papers[0].keywords.values.tolist()
    ls_cos_distances = df_clustered_papers[1]
    
    return lst_paper_titles, ls_paper_keywords, lst_paper_ids, ls_cos_distances

df_embeddings_2017[['titles_2018','keywords_2018','id_2018','cos_dist_2018']]= df_embeddings_2017.apply(lambda x: lambda_get_2018_matches(x),axis=1, result_type ='expand')
df_cos_dist_sample = pd.concat([df_embeddings_2017[['agg_cluster']],pd.DataFrame(df_embeddings_2017["cos_dist_2018"].to_list())], axis=1)

#tuning charts
#get the max number of samples with the lowest possible cosine distance
df_tuning = df_cos_dist_sample.drop(['agg_cluster'],axis = 1)

data = []
for tuning_param_cos_dist in np.linspace(0,0.3 ,1000):
    input_row = {}
    sample_number_from_2017 = df_tuning[df_tuning<=tuning_param_cos_dist].any(axis=1).sum()
    
    input_row['sample_number_2017'] = sample_number_from_2017
    input_row['cosine_distance'] = tuning_param_cos_dist
    if sample_number_from_2017 == df_tuning.shape[0]:
        input_row['full_sample_flag'] = 'Full Sample'
    else:
        input_row['full_sample_flag'] = 'Partial Sample'
    data.append(input_row)


x = pd.DataFrame(data)

#Tune KNN matching based on smallest possible cosine distance

tuning_param_knn = 10
tuning_param_cos_dist = 0.234535

neigh = NearestNeighbors( n_neighbors=tuning_param_knn, metric = 'cosine', radius = 0.3)
non_anchor_embedding_2018 = np.array(df_embeddings_2018.embedding.to_list())
neigh.fit(non_anchor_embedding_2018)

#setting up closest centroid for anchor group 2017
anchor_embedding_2017 = np.array(df_embeddings_2017.embedding.tolist())
anchor_agg_clusters_2017 = np.array(df_embeddings_2017.agg_cluster.tolist())
clf = NearestCentroid()
clf.fit(anchor_embedding_2017, anchor_agg_clusters_2017)


#dictionary of all the agg clusters and the 20 KNN from 2018
dict_agg_cluster_matches ={}
for cluster_id in np.unique(clustering.labels_):
    
    distances, indices = neigh.kneighbors([clf.centroids_[cluster_id]])
    df_anchor_embedding = pd.concat([pd.DataFrame(data = distances.T,columns =['cos_dist']),pd.DataFrame(indices.T,columns=['indices'])],axis=1)

    #get all the specified cosine distance 2018 papers
    #tuple of (dataframe of 2018 matched papers, cosine distances)
    dict_agg_cluster_matches[cluster_id] = (
        df_embeddings_2018.iloc[df_anchor_embedding[df_anchor_embedding['cos_dist']<= tuning_param_cos_dist].indices.to_list(), :],
        df_anchor_embedding[df_anchor_embedding['cos_dist']<= tuning_param_cos_dist].cos_dist.to_list()
    )

def lambda_get_2018_matches(row):
    #get embedding matches from 2018 papers
    #returning relevant information
    df_clustered_papers = dict_agg_cluster_matches[row.agg_cluster]
    
    lst_paper_titles = df_clustered_papers[0].title.tolist()
    lst_paper_ids = df_clustered_papers[0].paper_id.tolist()
    ls_paper_keywords = df_clustered_papers[0].keywords.values.tolist()
    ls_cos_distances = df_clustered_papers[1]
    
    return lst_paper_titles, ls_paper_keywords, lst_paper_ids, ls_cos_distances
    
    
    

df_embeddings_2017[['titles_2018','keywords_2018','id_2018','cos_dist_2018']]= df_embeddings_2017.apply(lambda x: lambda_get_2018_matches(x),axis=1, result_type ='expand')

def get_num_knn_matches(row):
    count = len(row.titles_2018)
    return(len(row.titles_2018))

df_embeddings_2017['num_knn_matches'] = df_embeddings_2017.apply(lambda x: get_num_knn_matches(x),axis =1)

assert df_embeddings_2017.shape[0] == df_submission_labels[df_submission_labels['conf_year']==2017].shape[0]

#Matching potential outcome estimator
#Keith et al. 2020
#https://aclanthology.org/2020.acl-main.474.pdf
#equation 7 and 8

def lambda_get_match_potential_outcomes(row):
    #equation 7 in the paper
    paper_ids = row.id_2018
    big_m = len(row.id_2018)
    if big_m == 0:
        return None
    ratings = [df_embeddings_2018[df_embeddings_2018['paper_id'] == paper_id].AVG_rating.values[0] for paper_id in paper_ids]
    return sum(ratings)/big_m
        
    

df_embeddings_2017['match_ave_rating'] = df_embeddings_2017.apply(lambda row: lambda_get_match_potential_outcomes(row), axis =1)
df_embeddings_2017['diff_2018_2017'] = df_embeddings_2017['match_ave_rating'] - df_embeddings_2017['AVG_rating']

df_embeddings_2017 = df_embeddings_2017.loc[df_embeddings_2017['match_ave_rating'].notnull(),]

#equation 8 in the paper
KNN_ATT = (df_embeddings_2017['match_ave_rating'] - df_embeddings_2017['AVG_rating']).sum()/df_embeddings_2017.shape[0]

assert df_embeddings_2017['match_ave_rating'].shape[0] == df_embeddings_2017['AVG_rating'].shape[0]

#bootstrap KNN confidence interval
bootstrap_mean = []
for n in range(5000):
    sample = df_embeddings_2017.diff_2018_2017.sample(n=df_embeddings_2017.shape[0], replace = True ,random_state=n)
    bootstrap_mean.append(sample.mean())
    
mean = statistics.mean(bootstrap_mean)
CI_lower_bound_knn = np.quantile(bootstrap_mean, 0.025)
CI_upper_bound_knn = np.quantile(bootstrap_mean, 0.975)
CI_knn_range = abs(CI_lower_bound_knn) -abs(CI_upper_bound_knn)

Per procedure by https://aclanthology.org/D12-1091.pdf

Because our method has an ATC of -0.17 and SPSM is -0.25, -0.17 - (-0.25) = 0.08
Delta(x) = 0.08. Follow bootstrap procedure of figure 1.

In [43]:
df_embeddings_2018.AVG_rating.mean()

5.431410256410257

In [45]:
sample_diff = df_embeddings_2018.AVG_rating.mean() - df_embeddings_2017.AVG_rating.mean() 

sample_diff

-0.251650968079538

In [50]:
boot_strap_means = pd.DataFrame(bootstrap_mean)
boot_strap_means.columns =['bootstrap_means']
s = 0
s2 = 0 
delta_x = 0.08
for n in range(5000):
    sample = boot_strap_means.bootstrap_means.sample(n=boot_strap_means.shape[0], replace = True ,random_state=n)
    sample2 = df_embeddings_2017.diff_2018_2017.sample(n=df_embeddings_2017.shape[0], replace = True ,random_state=n)
    
    delta = sample.mean() - -0.25
    #print(sample.mean())
    
    delta2 = sample2.mean() - sample_diff
    print(sample2.mean())
    print(delta2)

    if delta > 2*delta_x:
        s += 1
    
    if delta2 > 2*delta_x:
        s2 += 1

        
print(s/5000)
print(s2/5000)


-0.26335276967930027
-0.21221574344023317
-0.049761904761904785
-0.2790273728539035
-0.24358519598315512
-0.18381519274376415
-0.10991496598639452
-0.13071671525753156
-0.24188775510204089
-0.16636297376093295
-0.14548185941043087
-0.16387431162941365
-0.12025186264982188
-0.200988824101069
-0.13137998056365405
-0.19200113378684808
-0.1026951733074182
-0.17131114350502102
-0.29834710074505993
-0.13578150307742146
-0.15185374149659864
-0.231567055393586
-0.19108519598315518
-0.19370181405895692
-0.12115079365079368
-0.2811491739552964
-0.09804907677356654
-0.21465986394557823
-0.0990079365079365
-0.1655409782960803
-0.12976109491415613
-0.13487528344671207
-0.21379251700680274
-0.2643334953028831
-0.12218820861678005
-0.27965662455458373
-0.20926546809199872
-0.19191609977324264
-0.21520651117589892
-0.04849773242630379
-0.04674846128927762
-0.20508989310009723
-0.16089002267573696
-0.13335843861354066
-0.18778344671201808
-0.14137431162941363
-0.2551700680272109
-0.15940476190476188
-0

-0.05890022675736959
-0.19401603498542278
-0.08371558147068354
-0.1925396825396825
-0.09238095238095234
-0.22675979915775832
-0.3227178490443796
-0.21149092970521538
-0.15653061224489798
-0.2612925170068027
-0.225285876255264
-0.23601716877227083
-0.17054907677356654
-0.3023412698412698
-0.1406827016520894
-0.1923461289277616
-0.20199546485260766
-0.19966229348882408
-0.1366747651441529
-0.14841431810819564
-0.16486637512147717
-0.1627202785876255
-0.3019719792678976
-0.21220440557175252
-0.04270651117589895
-0.12575396825396826
-0.14105037252996433
-0.17377227081308713
-0.1885204081632653
-0.15587868480725622
-0.1284118885649498
-0.16678814382896023
-0.045181405895691605
-0.16934483317136376
-0.1487325882734046
-0.21224732750242956
-0.2520294784580499
-0.06568270165208935
-0.15443877551020407
-0.17788710722384196
-0.09503401360544218
-0.15054421768707482
-0.14635082604470356
-0.23793245869776486
-0.13202057013281507
-0.1435366051182378
-0.08950599287333982
-0.1694298671849692
-0.18300

-0.21281988986070616
-0.2890387107223842
-0.2909977324263039
-0.09666099773242631
0.008639455782312927
-0.056632653061224565
-0.07806122448979592
-0.1416634272756722
-0.20491496598639455
-0.1333981211532232
0.038968253968253964
-0.24727810171687725
-0.3242573696145125
-0.18540492387431168
-0.08906705539358599
-0.16724732750242954
-0.14307256235827664
-0.25108276643990934
-0.12270894071914476
-0.15632329122125035
-0.14209426627793972
-0.27894800777453843
-0.12280855199222544
-0.14161240686750892
-0.20581308713961777
-0.06718253968253964
-0.1744104308390022
-0.1408219954648526
-0.1675566893424036
-0.13389698736637512
-0.27305555555555555
-0.04573048266925821
-0.11711370262390668
-0.09399092970521544
-0.1644671201814059
-0.18173145448655653
-0.1783471007450599
-0.18168610301263366
-0.17214933592484616
-0.16047619047619047
-0.16713718820861673
-0.14275186264982173
-0.22815759637188202
-0.2619468739876903
-0.05060333657272428
-0.2382134758665371
-0.21296161321671525
-0.11479591836734691
-0.

-0.07990605766115971
-0.26037414965986394
-0.09743440233236152
-0.2285908649173955
-0.12420553935860056
-0.2835997732426304
-0.19849611273080667
-0.18952947845804988
-0.14235827664399095
-0.09583333333333333
-0.1901700680272109
-0.21292759961127306
-0.1685584710074506
-0.18910997732426305
-0.0779365079365079
-0.23923712342079687
-0.17043893747975378
-0.22774943310657594
-0.07882896015549076
-0.15447764820213797
-0.1567743764172335
-0.16377227081308718
-0.23861678004535147
-0.17428814382896013
-0.21630628441852928
-0.15630385487528342
-0.22531341107871716
-0.2710455134434727
-0.08496841593780367
-0.15606494978943958
-0.13474165856818918
-0.21116780045351474
-0.14997975380628442
-0.19884029802397146
-0.2278684807256236
-0.12940719144800775
-0.11990605766115972
-0.06956268221574345
-0.04969063816002587
-0.021150793650793677
-0.15052964042759961
-0.10736394557823119
-0.16989229024943311
-0.11739229024943311
-0.14766925817946225
-0.17591998704243608
-0.15369047619047613
-0.1328539034661484


-0.17309523809523808
-0.0770229996760609
-0.15632086167800455
-0.2679130223517979
-0.1499967606090055
-0.22056365403304173
-0.14316002591512797
-0.09258746355685134
-0.18251943634596698
-0.14817460317460318
-0.04643019112406862
-0.11132329122125037
-0.1068221574344023
-0.022621477162293485
-0.3514820213799805
-0.18670068027210884
-0.20674036281179134
-0.19040492387431157
-0.20271541950113373
-0.0845238095238095
-0.09773971493359243
-0.3248914804016845
-0.19190476190476194
-0.24866618075801747
-0.18706916099773244
-0.24499433106575963
-0.21489229024943307
-0.09374392614188527
-0.09615079365079365
-0.16436993845157113
-0.03989229024943314
-0.17012147716229345
-0.16908730158730156
-0.2381486880466472
-0.10995140913508261
-0.1840079365079365
-0.06085276967930024
-0.18721655328798187
-0.21192662779397473
-0.03160673793326854
-0.10624878522837707
-0.25203433754454163
-0.1816269841269841
-0.19857709750566888
-0.06649092970521536
-0.14396501457725946
-0.23172011661807584
-0.2444946550048591
-0

-0.10195578231292515
-0.2535625202461937
-0.08841755749919016
-0.18380385487528345
-0.162375283446712
-0.23069727891156463
-0.2142031098153547
-0.13355118237771296
-0.21080741820537743
-0.23451490119857463
-0.28733722060252664
-0.27611111111111114
-0.23892290249433107
-0.10378927761580821
-0.2516569484936832
-0.2330895691609977
-0.15713961775186264
-0.123531746031746
-0.12692176870748298
-0.12928814382896017
-0.18857385811467442
-0.11428814382896017
-0.12122691933916424
-0.21719387755102043
-0.23590379008746354
-0.09807823129251701
-0.25024619371558143
-0.15022351797862
-0.1361702299967606
-0.1623153547133139
-0.10195821185617106
-0.20194363459669587
-0.17024862325882728
-0.06253644314868806
-0.18647068351149979
-0.12734612892776154
-0.09389131843213473
-0.1320545837382572
-0.16248542274052477
-0.18833819241982505
-0.11023242630385485
-0.12084710074505987
-0.11401360544217684
-0.14882329122125043
-0.24590379008746358
-0.32101473922902496
-0.2367565597667639
-0.09683268545513445
-0.1696

-0.23269517330741818
0.006468253968253982
-0.2359936831875607
-0.33960803368966636
-0.08211694201490118
-0.1352721088435374
-0.029608033689666368
-0.22559766763848396
-0.2116836734693877
-0.17045027534823454
-0.2906292517006803
-0.22235422740524782
-0.09914642047295111
-0.12069727891156459
-0.18320537738905082
-0.20938208616780046
-0.15594590217039198
-0.11370667314544866
-0.10275996112730808
-0.19166666666666665
-0.15467687074829933
-0.23996841593780371
-0.17902170391966307
-0.23072238419177196
-0.08279721412374473
-0.11961694201490117
-0.31095157110463234
-0.142437641723356
-0.16400145772594749
-0.19488581146744413
-0.09028587625526399
-0.22490929705215418
-0.3193561710398445
-0.11532312925170067
-0.1415273728539034
-0.06433349530288306
-0.0603514739229024
-0.25724489795918365
-0.19373015873015875
-0.11325315840621966
-0.12115565273728536
-0.20383948817622285
-0.2314083252348558
-0.17619209588597343
-0.2092517006802721
-0.12454324586977648
-0.17470521541950115
-0.2717031098153547
-0.

-0.17013605442176866
-0.2364957887917072
-0.14907839326206668
-0.18350016196954977
-0.10953109815354713
-0.22631357304826696
-0.06396258503401363
-0.24929948169744084
-0.09946145124716557
-0.26131438289601555
-0.1621420472951085
-0.20161564625850334
-0.2011556527372854
-0.11324182053773889
-0.18201490119857466
-0.1784661483641075
-0.18646501457725947
-0.18006721736313572
-0.10757531584062194
-0.15668043407839327
-0.19009880142533203
-0.1897813411078718
-0.29256478781988987
-0.16192662779397474
-0.1865071266601879
-0.22161564625850338
-0.1751125688370586
-0.2123809523809523
-0.05976352445740199
-0.19157839326206672
-0.1867800453514739
-0.17799886621315192
-0.18024295432458695
-0.21828474246841592
-0.1822586653709102
-0.1205547457078069
-0.18114755425979914
-0.2680879494655005
-0.24912698412698417
-0.0480264010366051
-0.21035957240038872
-0.08493359248461284
-0.191790573372206
-0.22642290249433103
-0.18066326530612242
-0.24416585681891803
-0.14045918367346946
-0.16110787172011665
-0.1423

-0.056272270813087125
-0.21879494655004858
-0.16541059280855197
-0.12263038548752835
-0.12535957240038872
-0.17452380952380955
-0.08529721412374468
-0.1078652413346291
-0.1454016844833171
-0.10975299643666994
-0.26700113378684814
-0.2680069646906381
-0.2699092970521542
-0.15092322643343054
-0.26207725947521865
-0.16850016196954964
-0.20397878198898609
-0.16080498866213153
-0.06909540006478784
-0.1710568513119533
-0.1603595724003887
-0.1419671201814059
-0.11420634920634926
-0.18369290573372207
-0.24249109167476515
-0.08200032393909944
-0.24547862001943646
-0.22794217687074836
-0.15715905409782954
-0.16411807580174928
-0.16105685131195335
-0.07641156462585033
-0.24547619047619046
-0.179812925170068
-0.15628927761580824
-0.15199222546161323
-0.16418610301263362
-0.19965905409782958
-0.22875850340136042
-0.06788467768059603
-0.219211208292841
-0.14921849692257852
-0.21613378684807258
-0.2581859410430839
-0.20295837382572074
-0.2640929705215419
-0.1728539034661483
-0.20355685131195334
-0.31

In [18]:
s/50000

0.0678