In [1]:
# Widen the notebook container to 75% of the page so wide DataFrame outputs fit.
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:75% !important; }</style>"))

In [2]:
import sys
import numpy as np
import pandas as pd

# Add the helper directory to the import path so preprocessing_utils can be imported below.
sys.path.append('/lfs/1/sahaana/enrichment/ember/utils')
from preprocessing_utils import compute_BM25, merge_columns, reindex_deepmatcher

# Enable the autoreload extension (mode 2) so edits to imported modules are picked up.
%load_ext autoreload
%autoreload 2

# deepmatcher

In [90]:
# Settings shared by the deepmatcher cells below.
# `path` is the directory that holds one sub-directory per dataset.
path = "/lfs/1/sahaana/enrichment/data/deepmatcher"
# Name of the merged text column and the separator handed to merge_columns.
merged_col, separator = "merged_all", "[SEP]"

In [7]:
# deepmatcher dataset directory names, keyed by their position (0..12) in this list.
datasets = dict(enumerate([
    "abt_buy_exp_data",
    "amazon_google_exp_data",
    "beer_exp_data",
    "company_exp_data",
    "dblp_acm_exp_data",
    "dblp_scholar_exp_data",
    "dirty_dblp_acm_exp_data",
    "dirty_dblp_scholar_exp_data",
    "dirty_itunes_amazon_exp_data",
    "dirty_walmart_amazon_exp_data",
    "fodors_zagat_exp_data",
    "itunes_amazon_exp_data",
    "walmart_amazon_exp_data",
]))

## Merge and BM25

In [61]:
# For each deepmatcher dataset: merge every non-id column of tableA (queries)
# and tableB (corpus) into the single `merged_col` column, then run BM25 on it.
for dataset_name in datasets.values():
    data_dir = f"{path}/{dataset_name}"

    # Paths passed to merge_columns (presumably where it stores the processed
    # tables — confirm in preprocessing_utils).
    query_path = f"{data_dir}/tableA_processed.pkl"
    corpus_path = f"{data_dir}/tableB_processed.pkl"

    query_df = pd.read_csv(f"{data_dir}/tableA.csv")
    corpus_df = pd.read_csv(f"{data_dir}/tableB.csv")

    # Merge everything except the row identifier; list.remove raises
    # ValueError if a table has no 'id' column.
    query_columns = list(query_df.columns)
    corpus_columns = list(corpus_df.columns)
    query_columns.remove('id')
    corpus_columns.remove('id')

    query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
    corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

    # Pass merged_col instead of repeating the "merged_all" literal so the
    # column created above and the column scored here cannot drift apart.
    compute_BM25(corpus_df, query_df, merged_col, dataset_name)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved abt_buy_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved amazon_google_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved beer_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved company_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dblp_acm_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dblp_scholar_exp_

## Reindex Train/Val/Test

In [89]:
# Re-index the labelled pair files of every dataset against tableA/tableB and
# write each result next to its source as <split>_updated.csv.
# (input file stem, output file stem) — note that valid.csv is written as val_updated.csv.
split_files = (("train", "train"), ("test", "test"), ("valid", "val"))

for dataset_name in datasets.values():
    dataset_dir = f"{path}/{dataset_name}"

    left_table = pd.read_csv(f"{dataset_dir}/tableA.csv")
    right_table = pd.read_csv(f"{dataset_dir}/tableB.csv")

    # Same order as before: read all splits, reindex all, then write all.
    raw_pairs = {
        out_stem: pd.read_csv(f"{dataset_dir}/{in_stem}.csv")
        for in_stem, out_stem in split_files
    }
    reindexed_pairs = {
        out_stem: reindex_deepmatcher(left_table, right_table, pairs)
        for out_stem, pairs in raw_pairs.items()
    }
    for out_stem, pairs in reindexed_pairs.items():
        pairs.to_csv(f"{dataset_dir}/{out_stem}_updated.csv", index=False)
    

## Count the number of unique keys in train/val/test, and the key overlap between train-val, train-test, and val-test

In [33]:
def _shared_key_count(first, second, key):
    """Return the number of distinct non-null `key` values present in both frames.

    Uses a membership test rather than an inner merge: merging on a key that
    repeats in both frames materialises every matching row pair just to count
    the distinct keys afterwards.
    """
    return first.loc[first[key].isin(second[key]), key].nunique()


# Per dataset: table sizes, split sizes, distinct ltable/rtable ids per split,
# and how many distinct ids each pair of splits has in common.
for d in datasets:
    base = f"{path}/{datasets[d]}"
    l_df = pd.read_csv(f"{base}/tableA.csv")
    r_df = pd.read_csv(f"{base}/tableB.csv")

    train = pd.read_csv(f"{base}/train.csv")
    test = pd.read_csv(f"{base}/test.csv")
    val = pd.read_csv(f"{base}/valid.csv")

    print(datasets[d])
    print(f"{len(l_df)} \t lsize")
    print(f"{len(r_df)} \t rsize")
    print(f"{len(train)} \t train size")
    print(f"{len(val)} \t val size")
    print(f"{len(test)} \t test size")
    print()

    # Distinct ids referenced by each split, left table first.
    for split_label, split_df in (("train", train), ("val", val), ("test", test)):
        for side in ("ltable", "rtable"):
            print(f"{split_df[f'{side}_id'].nunique()} \t uniques in {split_label} {side}")
    print()

    # Distinct ids shared between two splits; labels match the original output.
    split_pairs = (
        ("train/val", train, val),
        ("train/test", train, test),
        ("test/val", val, test),
    )
    for pair_label, first, second in split_pairs:
        for side in ("ltable", "rtable"):
            shared = _shared_key_count(first, second, f"{side}_id")
            print(f"{shared} \t {pair_label} {side} overlap")

    print()
    print()
    

abt_buy_exp_data
1081 	 lsize
1092 	 rsize
5743 	 train size
1916 	 val size
1916 	 test size

973 	 uniques in train ltable
956 	 uniques in train rtable
728 	 uniques in val ltable
702 	 uniques in val rtable
737 	 uniques in test ltable
700 	 uniques in test rtable

676 	 train/val ltable overlap
649 	 train/val rtable overlap
674 	 train/test ltable overlap
650 	 train/test rtable overlap
547 	 test/val ltable overlap
516 	 test/val rtable overlap


amazon_google_exp_data
1363 	 lsize
3226 	 rsize
6874 	 train size
2293 	 val size
2293 	 test size

1126 	 uniques in train ltable
1788 	 uniques in train rtable
772 	 uniques in val ltable
1107 	 uniques in val rtable
771 	 uniques in test ltable
1090 	 uniques in test rtable

673 	 train/val ltable overlap
903 	 train/val rtable overlap
674 	 train/test ltable overlap
888 	 train/test rtable overlap
531 	 test/val ltable overlap
641 	 test/val rtable overlap


beer_exp_data
4345 	 lsize
3000 	 rsize
268 	 train size
91 	 val size
91 

# MS MARCO (see Piecewise BM25 for MSMARCO)

## Merge and BM25

In [106]:
# Settings for the MS MARCO cells; these rebind the names used in the deepmatcher section.
path = "/lfs/1/sahaana/enrichment/data/MSMARCO/"
dataset = 'MARCO'
# Name of the merged text column and the separator handed to merge_columns.
merged_col, separator = "merged_all", "[SEP]"

In [92]:
# Passage collection: tab-separated, no header row; the first field (PID) becomes the index.
file = "collection.tsv"
collection = pd.read_csv(
    f"{path}/{file}",
    sep='\t',
    names=['PID', 'Passage'],
    header=None,
    index_col=0,
)
collection

  mask |= (ar1 == a)


Unnamed: 0_level_0,Passage
PID,Unnamed: 1_level_1
0,The presence of communication amid scientific ...
1,The Manhattan Project and its atomic bomb help...
2,Essay on The Manhattan Project - The Manhattan...
3,The Manhattan Project was the name for a proje...
4,versions of each volume as well as complementa...
...,...
8841818,When metal salts emit short wavelengths of vis...
8841819,Thousands of people across the United States w...
8841820,"The recipe that creates blue, for example, inc..."
8841821,"On Independence Days of yore, old-timey crowds..."


In [93]:
# Training queries: tab-separated, no header row; the first field (QID) becomes the index.
# The ".tsv" extension is appended when the path is built.
file = "queries.train"
queries = pd.read_csv(
    f"{path}/{file}.tsv",
    sep='\t',
    names=['QID', 'Query'],
    header=None,
    index_col=0,
)
queries

Unnamed: 0_level_0,Query
QID,Unnamed: 1_level_1
121352,define extreme
634306,what does chattel mean on credit history
920825,what was the great leap forward brainly
510633,tattoo fixers how much does it cost
737889,what is decentralization process.
...,...
633855,what does canada post regulations mean
1059728,wholesale lularoe price
210839,how can i watch the day after
908165,what to use instead of pgp in windows


In [94]:
# Candidate (query, passage) pairs: only the first two tab-separated fields are loaded.
file = "top1000.train.txt"
top1000 = pd.read_csv(
    f"{path}/{file}",
    sep='\t',
    names=['QID', 'PID'],
    usecols=[0, 1],
    header=None,
)
top1000

Unnamed: 0,QID,PID
0,965162,1000930
1,279558,1000930
2,279511,1000930
3,279498,1000930
4,739878,1000930
...,...,...
478002388,727506,999540
478002389,633883,999540
478002390,898757,999540
478002391,583095,999540


In [96]:
# One-column frames holding the distinct query ids and passage ids that occur in top1000.
unique_queries = pd.DataFrame({'QID': top1000['QID'].unique()})
unique_passages = pd.DataFrame({'PID': top1000['PID'].unique()})

In [97]:
# Keep only the queries and passages whose ids appear in top1000 (default inner join).
query_df = queries.merge(unique_queries, on='QID')
corpus_df = collection.merge(unique_passages, on='PID')

In [103]:
# Paths passed to merge_columns for the query and corpus tables.
query_path, corpus_path = f"{path}/tableA_processed.pkl", f"{path}/tableB_processed.pkl"

# Merge every column except each table's identifier column.
query_columns = query_df.columns.tolist()
query_columns.remove('QID')
corpus_columns = corpus_df.columns.tolist()
corpus_columns.remove('PID')

# NOTE(review): both frames are rebound here, so re-running this cell would see
# whatever columns merge_columns added to them.
query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

In [109]:
# Run BM25 over the merged MS MARCO text. merged_col is passed instead of
# repeating the "merged_all" literal so it always matches the column created by
# merge_columns. The recorded run of this cell ended in a KeyboardInterrupt.
compute_BM25(corpus_df, query_df, merged_col, dataset)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


KeyboardInterrupt: 

# SQuAD

In [4]:
# Settings for the SQuAD cells; these rebind the names used in the earlier sections.
path = "/lfs/1/sahaana/enrichment/data/SQuAD/"
dataset = 'SQuAD'
# Name of the merged text column and the separator handed to merge_columns.
merged_col, separator = "merged_all", "[SEP]"

## Sentence

In [9]:
# SQuAD, sentence-level corpus: merge all columns of the query table and of the
# sentence table into `merged_col`, then run BM25 over the merged text.
query_path = f"{path}/train_tableA_processed.pkl"
corpus_path = f"{path}/train_tableB_sent_processed.pkl"

query_df = pd.read_pickle(f"{path}/train_tableA.pkl")
corpus_df = pd.read_pickle(f"{path}/train_tableB_sent.pkl")

# Unlike the deepmatcher cell, no id column is removed: every column is merged.
query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass merged_col instead of repeating the "merged_all" literal so the column
# created above and the column scored here cannot drift apart.
bm25_sent = compute_BM25(corpus_df, query_df, merged_col, "SQuAD_sent", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved SQuAD_sent


In [10]:
# Put the query index in column 0, followed by bm25_sent[2], then make column 0 the
# frame index. (bm25_sent[2] is presumably the per-query argsort of BM25 scores,
# going by the output file name — confirm against compute_BM25.)
combined_bm25_sent = pd.DataFrame(
    np.column_stack([query_df.index.to_numpy(), bm25_sent[2]])
).set_index(0)
combined_bm25_sent.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/SQuAD_sent_argsort_indices.pkl')
combined_bm25_sent

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,60170,60171,60172,60173,60174,60175,60176,60177,60178,60179
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,204639,36501,38733,111237,70098,128885,215939,144864,179022,102518,...,135573,94037,127893,53939,19016,76127,38408,67168,54978,148554
1,38733,111237,102518,159393,150200,137107,65083,128885,127550,215939,...,94037,222620,216915,225200,594,1259,121314,57127,6879,84126
2,73705,115181,131688,58366,86138,97218,120773,39722,127817,183022,...,98,205,195,2,96,353,546,199,47,9
3,143023,137615,190980,103642,165099,149909,35231,59913,215413,159561,...,594,122491,22690,153055,66141,56315,21437,174196,50920,136415
4,38733,111237,215939,144864,102518,126554,143790,137107,135964,137762,...,65451,190821,29961,177183,96236,64792,14287,89023,60910,32761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86816,204639,36501,38733,111237,70098,144864,179022,102518,126554,137107,...,14287,22690,101804,92710,24604,136415,52117,116959,228265,227698
86817,36501,38733,111237,215939,144864,179022,102518,126554,143790,137107,...,71826,174102,16516,94037,107773,107318,107168,107914,107765,124240
86818,36501,38733,111237,126554,137107,127550,137762,174332,209485,205350,...,195276,70103,55205,228045,58515,161535,200544,227782,97071,227509
86819,36501,38733,111237,70098,144864,179022,102518,143790,137107,150200,...,37711,228224,227713,19311,227491,116959,228250,192318,65451,228265


## Paragraph

In [5]:
# SQuAD, paragraph-level corpus: merge all columns of the query table and of the
# paragraph table into `merged_col`, then run BM25 over the merged text.
query_path = f"{path}/train_tableA_processed.pkl"
corpus_path = f"{path}/train_tableB_para_processed.pkl"

query_df = pd.read_pickle(f"{path}/train_tableA.pkl")
corpus_df = pd.read_pickle(f"{path}/train_tableB_para.pkl")

# Unlike the deepmatcher cell, no id column is removed: every column is merged.
query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass merged_col instead of repeating the "merged_all" literal so the column
# created above and the column scored here cannot drift apart.
bm25_para = compute_BM25(corpus_df, query_df, merged_col, "SQuAD_para", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved SQuAD_para


In [6]:
# Put the query index in column 0, followed by bm25_para[2], then make column 0 the
# frame index. (bm25_para[2] is presumably the per-query argsort of BM25 scores,
# going by the output file name — confirm against compute_BM25.)
combined_bm25_para = pd.DataFrame(
    np.column_stack([query_df.index.to_numpy(), bm25_para[2]])
).set_index(0)
combined_bm25_para.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/SQuAD_para_argsort_indices.pkl')
combined_bm25_para

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,18868,18869,18870,18871,18872,18873,18874,18875,18876,18877
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12251,351,15189,18910,10178,4137,5399,8292,8602,17425,...,11323,5977,3921,5728,848,13777,1295,10808,15710,6025
1,16540,6801,765,18664,2383,5808,18877,15263,14172,14883,...,11248,17503,9418,4004,4435,7069,18760,10274,9371,9402
2,11485,7435,7415,11040,14697,18880,11406,12043,5876,6177,...,3685,16778,7,6,11250,0,11,1,8,36
3,15265,11456,7155,15871,11485,11513,8194,6545,11268,7415,...,466,4044,3003,7635,14679,5517,13151,2969,3921,3842
4,5399,13426,9821,195,16505,12394,10367,17130,9213,18910,...,951,3381,6462,10744,16409,5728,1871,3921,1627,16778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86816,17425,9821,1170,16505,13529,10367,1169,8597,9213,7097,...,11780,18968,18978,9193,18940,18979,5349,18945,3921,18938
86817,351,18910,12252,10178,4137,6853,5399,8292,6112,17425,...,3921,795,16061,10836,9500,5722,10844,14946,1576,9559
86818,9821,33,9000,7588,6220,308,11906,17279,17435,6576,...,18926,18933,18945,18925,18923,11479,18922,18935,18936,18938
86819,10890,5399,8602,17425,14448,9821,1150,195,12394,6296,...,18932,18934,18976,18930,18979,18968,18978,18940,18945,18939
