In [1]:
import pandas as pd
import ast
import random
import uuid

In [2]:
# Base seed shared by every random draw in this notebook.
rand_state = 60

In [3]:
# Load the three input tables from CSV files in the working directory.
# df_cluster: one row per paper with string-encoded lists of matched 2018 titles/ids.
df_cluster = pd.read_csv("df_cluster_results_10_match_cosine_23.csv")
# df_SPSM: per-paper table with strata / treatment / conf_year columns.
df_SPSM = pd.read_csv("df_SPSM_results.csv")
# df_2018_titles: paper ids and titles with a conf_year column.
df_2018_titles = pd.read_csv("naive_titles_2018.csv")

In [4]:
# Data cleaning: the titles_2018 / id_2018 columns hold Python list literals
# stored as text, so parse them back into real list objects.
df_cluster['lst_2018_titles'] = df_cluster['titles_2018'].apply(ast.literal_eval)
df_cluster['lst_2018_id'] = df_cluster['id_2018'].apply(ast.literal_eval)

In [5]:
# Data check: every row must have exactly one parsed 2018 id per parsed 2018 title.
# All rows are validated; checking only the row labelled `rand_state` would let
# mismatches in any other row go unnoticed.
assert (
    df_cluster['lst_2018_titles'].map(len) == df_cluster['lst_2018_id'].map(len)
).all(), "lst_2018_titles and lst_2018_id differ in length for at least one row"

# Generate a sample of 100 ICLR 2017 papers

In [10]:
# Draw 100 distinct rows from df_cluster (seeded with rand_state) and keep
# their paper ids; these are the 2017 papers used by every matching method below.
sample_2017_id = (
    df_cluster
    .sample(n=100, replace=False, random_state=rand_state)
    ['paper_id']
    .to_list()
)

In [11]:
# The sampled ids must contain exactly 100 distinct values.
assert len({*sample_2017_id}) == 100

# Generate Samples of KNN matching

In [12]:
#get sample of 2018 titles from KNN matching
# For each sampled 2017 paper, draw one of its KNN-matched 2018 papers at random.

data_knn = []
for n, paper_id_2017 in enumerate(sample_2017_id):
    # Row(s) of df_cluster for this 2017 paper; the first one supplies the
    # parsed list of matched 2018 ids.
    knn_df = df_cluster[df_cluster['paper_id'] == paper_id_2017]

    # A distinct seed per iteration (rand_state + position) keeps every draw
    # reproducible on its own; the seed is stored alongside the pair it produced.
    in_loop_random_state = rand_state + n
    random.seed(in_loop_random_state)
    sample = random.choice(knn_df.lst_2018_id.values[0])

    # Sanity check: the chosen id must appear in the raw id_2018 string of the
    # same row. regex=False makes this a literal substring test, so ids that
    # contain regex metacharacters cannot produce a wrong match or a regex error.
    assert knn_df.id_2018.str.contains(sample, regex=False).values[0], (
        "sampled 2018 id not found in id_2018 for this 2017 paper"
    )

    input_row = {
        'id_2017': paper_id_2017,
        'id_2018': sample,
        'method': 'KNN',
        'unique_id': str(uuid.uuid4()),
        'random_state': in_loop_random_state,
    }
    data_knn.append(input_row)

df_sample_knn = pd.DataFrame(data_knn)

In [13]:
# Validate the KNN pairs: 100 rows, every id_2017 comes from the 2017 sample,
# and the id_2017 values are all distinct.
assert len(df_sample_knn) == 100
assert df_sample_knn['id_2017'].isin(sample_2017_id).sum() == 100
assert len(set(df_sample_knn['id_2017'])) == 100

# Generate Samples of Naive Method

In [14]:
# Naive baseline: pair the sampled 2017 papers, in order, with 100 papers drawn
# at random (without replacement, seeded with rand_state) from the rows of
# df_2018_titles whose conf_year is 2018.

lst_2018_sample_naive = (
    df_2018_titles[df_2018_titles['conf_year'] == 2018]
    .sample(n=100, replace=False, random_state=rand_state)
    ['paper_id']
    .to_list()
)

# One fresh identifier per pair.
lst_2018_uniqu_id = [str(uuid.uuid4()) for _ in range(100)]

data_naive = dict(
    id_2017=sample_2017_id,
    id_2018=lst_2018_sample_naive,
    method=['naive'] * 100,
    unique_id=lst_2018_uniqu_id,
    random_state=[rand_state] * 100,
)

df_sample_naive = pd.DataFrame(data_naive)

In [15]:
# Validate the naive pairs: 100 rows, every id_2017 comes from the 2017 sample,
# and the id_2017 values are all distinct.
assert len(df_sample_naive) == 100
assert df_sample_naive['id_2017'].isin(sample_2017_id).sum() == 100
assert len(set(df_sample_naive['id_2017'])) == 100

# Generate Samples of SPSM

In [16]:
# SPSM pairs: for each stratum, pair the sampled 2017 papers that fall in that
# stratum with the same number of treatment == 0 papers drawn at random from
# the same stratum.

# Rows of df_SPSM that belong to the 2017 sample (loop-invariant, computed once).
is_sampled_2017 = df_SPSM['paper_id'].isin(sample_2017_id)

# Number of sampled 2017 papers per stratum: {strata: count}.
dict_strata_count = df_SPSM[is_sampled_2017].groupby(['strata'])['paper_id'].count().to_dict()

# A single seed is used for every stratum draw.
loop_random_state = rand_state + 100

# Collect one frame per stratum and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop is quadratic and, when started from
# an empty frame, degrades the column dtypes to object.
frames_SPSM = []
for strata, count in dict_strata_count.items():
    in_strata = df_SPSM['strata'] == strata
    titles_2017_SPSM = df_SPSM[is_sampled_2017 & in_strata].paper_id.to_list()
    titles_2018_SPSM = (
        df_SPSM[(df_SPSM['treatment'] == 0) & in_strata]
        .sample(n=count, replace=False, random_state=loop_random_state)
        .paper_id
        .to_list()
    )

    assert len(titles_2018_SPSM) == count, "wrong number of sampled control papers"
    assert len(titles_2017_SPSM) == count, "wrong number of 2017 papers in stratum"
    # Every sampled control paper must be a 2018 paper. Comparing each row is
    # stricter than comparing the mean, which a mix of years could also satisfy.
    assert df_SPSM[df_SPSM['paper_id'].isin(titles_2018_SPSM)].conf_year.eq(2018).all(), (
        "sampled control papers are not all from 2018"
    )

    lst_2018_uniqu_id = [str(uuid.uuid4()) for _ in range(count)]

    data_SPSM = {'id_2017': titles_2017_SPSM,
                 'id_2018': titles_2018_SPSM,
                 'method': ['SPSM'] * count,
                 'unique_id': lst_2018_uniqu_id,
                 'random_state': [loop_random_state] * count
                 }
    frames_SPSM.append(pd.DataFrame(data_SPSM))

if frames_SPSM:
    df_sample_SPSM = pd.concat(frames_SPSM, ignore_index=True)
else:
    # No strata at all: keep an empty frame with the expected columns.
    df_sample_SPSM = pd.DataFrame(columns=['id_2017', 'id_2018', 'method', 'unique_id', 'random_state'])

In [17]:
# Validate the SPSM pairs.

# Exactly 100 pairs were produced.
assert len(df_sample_SPSM) == 100

# Every sampled id_2018 maps to a treatment == 0 row of df_SPSM
# (the mean of the joined treatment column is 0).
assert (
    df_sample_SPSM
    .merge(df_SPSM, left_on='id_2018', right_on='paper_id')
    ['treatment']
    .mean()
) == 0

# The per-stratum counts of the sampled 2018 papers equal those of the 2017 sample.
assert (
    df_sample_SPSM
    .merge(df_SPSM, left_on='id_2018', right_on='paper_id')
    .groupby(['strata'])['id_2018']
    .count()
    .to_dict()
) == dict_strata_count

# Every id_2017 comes from the 2017 sample and all of them are distinct.
assert df_sample_SPSM['id_2017'].isin(sample_2017_id).sum() == 100
assert len(set(df_sample_SPSM['id_2017'])) == 100

# Making final evaluation set

In [18]:
# Stack the pairs from the three methods into one frame.
# ignore_index=True gives every row its own index label; without it each label
# 0-99 would occur three times, making label-based selection or alignment on
# this frame ambiguous.
df_all_methods_sample = pd.concat(
    [df_sample_SPSM, df_sample_naive, df_sample_knn],
    ignore_index=True,
)

# Check for errors: each method should contribute the same number of pairs.
df_all_methods_sample.groupby(['method'])['unique_id'].count()

method
KNN      100
SPSM     100
naive    100
Name: unique_id, dtype: int64

In [19]:
# Lookup table of (paper_id, title) stacked from df_cluster and df_2018_titles.
df_all_paper_titles_and_id = pd.concat(
    [frame[['paper_id', 'title']] for frame in (df_cluster, df_2018_titles)]
)

In [20]:
#turn paper_id into actual titles
def get_titles(row):
    """Return ``(title_2017, title_2018)`` for one pair of paper ids.

    ``row`` must expose ``id_2017`` and ``id_2018`` attributes. Each id is
    looked up in the module-level ``df_all_paper_titles_and_id`` table and the
    first matching title is returned. An id with no match raises IndexError.
    """
    lookup = df_all_paper_titles_and_id

    def _first_title(paper_id):
        # First title among the rows whose paper_id equals the requested id.
        return lookup.loc[lookup['paper_id'] == paper_id, 'title'].values[0]

    return _first_title(row.id_2017), _first_title(row.id_2018)

# Add the two title columns; get_titles returns a (title_2017, title_2018)
# tuple per row and result_type='expand' spreads it over two columns.
df_all_methods_sample[['title_2017', 'title_2018']] = df_all_methods_sample.apply(
    get_titles,
    axis=1,
    result_type='expand',
)

In [21]:
# Check the resolved titles.

# 2017 side: re-drawing the seeded sample from df_cluster must give the same
# titles as the distinct title_2017 values.
assert sorted(
    df_cluster.sample(n=100, replace=False, random_state=rand_state)['title'].to_list()
) == sorted(set(df_all_methods_sample['title_2017'].to_list()))

# 2018 side: the number of df_2018_titles rows whose title is among the distinct
# title_2018 values must equal the number of distinct title_2018 values.
assert (
    df_2018_titles['title']
    .isin(list(set(df_all_methods_sample['title_2018'].to_list())))
    .sum()
) == len(set(df_all_methods_sample['title_2018'].to_list()))

In [22]:
# Shuffle the 300 pairs with the notebook seed, then order them by 2017 title.
df_evaluation_data_with_labels_DONT_LOOK = (
    df_all_methods_sample
    .sample(n=300, replace=False, random_state=rand_state)
    .sort_values(by=['title_2017'])
)

In [23]:
# Blinded view for evaluation: keep only the pair identifier and the two titles,
# leaving out the method and the other bookkeeping columns.
df_evaluation_data_no_labels = df_evaluation_data_with_labels_DONT_LOOK.loc[
    :, ['unique_id', 'title_2017', 'title_2018']
]

In [24]:
# Write both versions to disk without the index column:
# the labelled file keeps the method column, the blinded file does not.
df_evaluation_data_with_labels_DONT_LOOK.to_csv(
    path_or_buf='evaluation_set_with_labels_DONT_LOOK.csv',
    index=False,
)
df_evaluation_data_no_labels.to_csv(
    path_or_buf='evaluation_set.csv',
    index=False,
)

# Read in data to reveal method labels

In [512]:
# Pseudocode only (not executed): read a file keyed by unique_id and merge it with the label file on unique_id.
#pd.read_csv('test.csv',header=None).rename(columns={0:'unique_id'}).merge(pd.read_csv('test_labels.csv'), on='unique_id')