In [2]:
# Widen the notebook's `.container` element to 75% of the page.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [3]:
import sys
import numpy as np
import pandas as pd

sys.path.append('/lfs/1/sahaana/enrichment/ember/utils')
from preprocessing_utils import compute_BM25, merge_columns, reindex_deepmatcher

%load_ext autoreload
%autoreload 2

# deepmatcher

In [3]:
# Deepmatcher section settings.
# `path` is the root directory; each dataset lives in its own sub-directory.
path = "/lfs/1/sahaana/enrichment/data/deepmatcher"
# Third and fifth arguments handed to merge_columns below (presumably the name
# of the merged column and the token placed between merged fields).
merged_col, separator = "merged_all", "[SEP]"

In [4]:
# Deepmatcher dataset folder names, keyed by their position in this list (0-12).
datasets = dict(enumerate([
    "abt_buy_exp_data",
    "amazon_google_exp_data",
    "beer_exp_data",
    "company_exp_data",
    "dblp_acm_exp_data",
    "dblp_scholar_exp_data",
    "dirty_dblp_acm_exp_data",
    "dirty_dblp_scholar_exp_data",
    "dirty_itunes_amazon_exp_data",
    "dirty_walmart_amazon_exp_data",
    "fodors_zagat_exp_data",
    "itunes_amazon_exp_data",
    "walmart_amazon_exp_data",
]))

## Merge and BM25

In [8]:
# For every deepmatcher dataset: merge all columns of tableA (queries) and
# tableB (corpus) into `merged_col`, run BM25, and pickle the per-query result.
for d, dataset_name in datasets.items():
    dataset_dir = f"{path}/{dataset_name}"
    query_path = f"{dataset_dir}/tableA_processed.pkl"
    corpus_path = f"{dataset_dir}/tableB_processed.pkl"

    query_df = pd.read_csv(f"{dataset_dir}/tableA.csv").set_index('id')
    corpus_df = pd.read_csv(f"{dataset_dir}/tableB.csv").set_index('id')

    # Every remaining column (the id is now the index) takes part in the merge.
    query_columns = list(query_df.columns)
    corpus_columns = list(corpus_df.columns)

    # merge_columns presumably also saves the processed frame at the given
    # *_processed.pkl path -- TODO confirm in preprocessing_utils.
    query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
    corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

    # Pass `merged_col` rather than repeating the "merged_all" literal so the
    # column BM25 reads always matches the one merge_columns was asked for.
    bm25 = compute_BM25(corpus_df, query_df, merged_col, dataset_name, reindex=True)

    # bm25[2] is presumably the per-query ranking of corpus ids (the output file
    # is named *_argsort_indices) -- TODO confirm against compute_BM25.
    # Prepend the query ids as column 0, then promote that column to the index.
    combined_bm25 = pd.DataFrame(np.hstack([np.array(query_df.index)[:,None], bm25[2]]))
    combined_bm25 = combined_bm25.set_index(0)
    pd.to_pickle(combined_bm25, f'/lfs/1/sahaana/enrichment/data/Okapi25Queries/{dataset_name}_argsort_indices.pkl')

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved abt_buy_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved amazon_google_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved beer_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved company_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dblp_acm_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dblp_scholar_exp_

In [None]:
# BM25 for the dev set.
# NOTE(review): this cell loads the same tableA/tableB *_processed.pkl files that
# the "Merge and BM25" loop passes to merge_columns, and it pickles to the very
# same '{dataset}_argsort_indices.pkl' path as that loop, so running it
# overwrites that output. Confirm which inputs/outputs the dev run should use.
for d, dataset_name in datasets.items():
    query_df = pd.read_pickle(f"{path}/{dataset_name}/tableA_processed.pkl")
    corpus_df = pd.read_pickle(f"{path}/{dataset_name}/tableB_processed.pkl")

    # Use `merged_col` instead of a hard-coded "merged_all" so this stays in
    # sync with the column name given to merge_columns.
    bm25 = compute_BM25(corpus_df, query_df, merged_col, dataset_name, reindex=True)

    # Prepend the query ids as column 0, then promote that column to the index.
    combined_bm25 = pd.DataFrame(np.hstack([np.array(query_df.index)[:,None], bm25[2]]))
    combined_bm25 = combined_bm25.set_index(0)
    pd.to_pickle(combined_bm25, f'/lfs/1/sahaana/enrichment/data/Okapi25Queries/{dataset_name}_argsort_indices.pkl')

## Reindex Train/Val/Test

In [44]:
# Disabled code: the whole reindexing loop is wrapped in a string literal, so
# running this cell executes nothing and only echoes the text back as output.
# If enabled, the loop would pass tableA/tableB and each of train/test/valid
# through reindex_deepmatcher and write *_updated.csv files next to the inputs.
"""for d in datasets:
    l_df = pd.read_csv(f"{path}/{datasets[d]}/tableA.csv")
    r_df = pd.read_csv(f"{path}/{datasets[d]}/tableB.csv")
    
    train = pd.read_csv(f"{path}/{datasets[d]}/train.csv")
    test = pd.read_csv(f"{path}/{datasets[d]}/test.csv")
    val = pd.read_csv(f"{path}/{datasets[d]}/valid.csv")
    
    train_updated = reindex_deepmatcher(l_df, r_df, train)
    test_updated = reindex_deepmatcher(l_df, r_df, test)
    val_updated = reindex_deepmatcher(l_df, r_df, val)
    
    train_updated.to_csv(f"{path}/{datasets[d]}/train_updated.csv", index=False)
    test_updated.to_csv(f"{path}/{datasets[d]}/test_updated.csv", index=False)
    val_updated.to_csv(f"{path}/{datasets[d]}/val_updated.csv", index=False)
    """

'for d in datasets:\n    l_df = pd.read_csv(f"{path}/{datasets[d]}/tableA.csv")\n    r_df = pd.read_csv(f"{path}/{datasets[d]}/tableB.csv")\n    \n    train = pd.read_csv(f"{path}/{datasets[d]}/train.csv")\n    test = pd.read_csv(f"{path}/{datasets[d]}/test.csv")\n    val = pd.read_csv(f"{path}/{datasets[d]}/valid.csv")\n    \n    train_updated = reindex_deepmatcher(l_df, r_df, train)\n    test_updated = reindex_deepmatcher(l_df, r_df, test)\n    val_updated = reindex_deepmatcher(l_df, r_df, val)\n    \n    train_updated.to_csv(f"{path}/{datasets[d]}/train_updated.csv", index=False)\n    test_updated.to_csv(f"{path}/{datasets[d]}/test_updated.csv", index=False)\n    val_updated.to_csv(f"{path}/{datasets[d]}/val_updated.csv", index=False)\n    '

## Count the number of keys in train/val/test, and the key overlap between train–test, train–val, and val–test

In [33]:
def _count_shared_keys(left, right, key):
    """Return how many distinct non-null `key` values occur in both frames.

    Gives the same number as
    `pd.merge(left, right, on=key, how='inner')[key].nunique()`, but as a set
    intersection, so the many-to-many join (one output row per matching pair
    of rows) is never materialized.
    """
    return len(set(left[key].dropna()) & set(right[key].dropna()))


# Per dataset: table sizes, split sizes, distinct ids per split, and the number
# of ids shared between each pair of splits.
for d in datasets:
    l_df = pd.read_csv(f"{path}/{datasets[d]}/tableA.csv")
    r_df = pd.read_csv(f"{path}/{datasets[d]}/tableB.csv")

    train = pd.read_csv(f"{path}/{datasets[d]}/train.csv")
    test = pd.read_csv(f"{path}/{datasets[d]}/test.csv")
    val = pd.read_csv(f"{path}/{datasets[d]}/valid.csv")
    print(datasets[d])
    print(f"{len(l_df)} \t lsize")
    print(f"{len(r_df)} \t rsize")
    print(f"{len(train)} \t train size")
    print(f"{len(val)} \t val size")
    print(f"{len(test)} \t test size")
    print()
    print(f"{train['ltable_id'].nunique()} \t uniques in train ltable")
    print(f"{train['rtable_id'].nunique()} \t uniques in train rtable")

    print(f"{val['ltable_id'].nunique()} \t uniques in val ltable")
    print(f"{val['rtable_id'].nunique()} \t uniques in val rtable")

    print(f"{test['ltable_id'].nunique()} \t uniques in test ltable")
    print(f"{test['rtable_id'].nunique()} \t uniques in test rtable")
    print()
    # Distinct ids that appear in both splits of each pair.
    train_val_l = _count_shared_keys(train, val, 'ltable_id')
    train_val_r = _count_shared_keys(train, val, 'rtable_id')
    train_test_l = _count_shared_keys(train, test, 'ltable_id')
    train_test_r = _count_shared_keys(train, test, 'rtable_id')
    val_test_l = _count_shared_keys(val, test, 'ltable_id')
    val_test_r = _count_shared_keys(val, test, 'rtable_id')

    print(f"{train_val_l} \t train/val ltable overlap")
    print(f"{train_val_r} \t train/val rtable overlap")
    print(f"{train_test_l} \t train/test ltable overlap")
    print(f"{train_test_r} \t train/test rtable overlap")
    print(f"{val_test_l} \t test/val ltable overlap")
    print(f"{val_test_r} \t test/val rtable overlap")

    print()
    print()
    

abt_buy_exp_data
1081 	 lsize
1092 	 rsize
5743 	 train size
1916 	 val size
1916 	 test size

973 	 uniques in train ltable
956 	 uniques in train rtable
728 	 uniques in val ltable
702 	 uniques in val rtable
737 	 uniques in test ltable
700 	 uniques in test rtable

676 	 train/val ltable overlap
649 	 train/val rtable overlap
674 	 train/test ltable overlap
650 	 train/test rtable overlap
547 	 test/val ltable overlap
516 	 test/val rtable overlap


amazon_google_exp_data
1363 	 lsize
3226 	 rsize
6874 	 train size
2293 	 val size
2293 	 test size

1126 	 uniques in train ltable
1788 	 uniques in train rtable
772 	 uniques in val ltable
1107 	 uniques in val rtable
771 	 uniques in test ltable
1090 	 uniques in test rtable

673 	 train/val ltable overlap
903 	 train/val rtable overlap
674 	 train/test ltable overlap
888 	 train/test rtable overlap
531 	 test/val ltable overlap
641 	 test/val rtable overlap


beer_exp_data
4345 	 lsize
3000 	 rsize
268 	 train size
91 	 val size
91 

# MS MARCO (see "Piecewise BM25 for MSMARCO")

## Merge and BM25

In [106]:
# MS MARCO section settings (note that `path` already ends with a slash).
path = "/lfs/1/sahaana/enrichment/data/MSMARCO/"
# Name passed to compute_BM25 for this dataset.
dataset = 'MARCO'
# Arguments forwarded to merge_columns further down.
merged_col, separator = "merged_all", "[SEP]"

In [92]:
# Load the passage collection: a headerless tab-separated file whose two
# columns are named PID and Passage, with PID used as the index.
file = "collection.tsv"
collection = pd.read_csv(
    f"{path}/{file}",
    sep='\t',
    header=None,
    names=['PID', 'Passage'],
    index_col=0,
)
collection

  mask |= (ar1 == a)


Unnamed: 0_level_0,Passage
PID,Unnamed: 1_level_1
0,The presence of communication amid scientific ...
1,The Manhattan Project and its atomic bomb help...
2,Essay on The Manhattan Project - The Manhattan...
3,The Manhattan Project was the name for a proje...
4,versions of each volume as well as complementa...
...,...
8841818,When metal salts emit short wavelengths of vis...
8841819,Thousands of people across the United States w...
8841820,"The recipe that creates blue, for example, inc..."
8841821,"On Independence Days of yore, old-timey crowds..."


In [93]:
# Load the training queries: a headerless tab-separated file whose two columns
# are named QID and Query, with QID used as the index.
file = "queries.train"
queries = pd.read_csv(
    f"{path}/{file}.tsv",
    sep='\t',
    header=None,
    names=['QID', 'Query'],
    index_col=0,
)
queries

Unnamed: 0_level_0,Query
QID,Unnamed: 1_level_1
121352,define extreme
634306,what does chattel mean on credit history
920825,what was the great leap forward brainly
510633,tattoo fixers how much does it cost
737889,what is decentralization process.
...,...
633855,what does canada post regulations mean
1059728,wholesale lularoe price
210839,how can i watch the day after
908165,what to use instead of pgp in windows


In [94]:
# Load only the first two columns of the top-1000 candidate file and name
# them QID and PID.
file = "top1000.train.txt"
top1000 = pd.read_csv(
    f"{path}/{file}",
    sep='\t',
    header=None,
    usecols=[0,1],
    names=['QID', 'PID'],
)
top1000

Unnamed: 0,QID,PID
0,965162,1000930
1,279558,1000930
2,279511,1000930
3,279498,1000930
4,739878,1000930
...,...,...
478002388,727506,999540
478002389,633883,999540
478002390,898757,999540
478002391,583095,999540


In [96]:
# Distinct query ids and passage ids occurring in top1000, each wrapped in a
# single-column frame.
unique_queries = pd.DataFrame({'QID': top1000['QID'].unique()})
unique_passages = pd.DataFrame({'PID': top1000['PID'].unique()})

In [97]:
# Inner-join on the id so only queries / passages referenced by top1000 remain.
query_df = queries.merge(unique_queries, on='QID')
corpus_df = collection.merge(unique_passages, on='PID')

In [103]:
# Paths handed to merge_columns for the processed query / corpus tables.
query_path, corpus_path = f"{path}/tableA_processed.pkl", f"{path}/tableB_processed.pkl"

# Merge every column except the id column of each table.
query_columns = query_df.columns.tolist()
query_columns.remove('QID')

corpus_columns = corpus_df.columns.tolist()
corpus_columns.remove('PID')

query_df = merge_columns(
    query_df, query_columns, merged_col, query_path, separator
)
corpus_df = merge_columns(
    corpus_df, corpus_columns, merged_col, corpus_path, separator
)

In [None]:
# TODO: load the MS MARCO dev data here. The original cell was an unfinished
# `dev_df =` assignment (a SyntaxError); an explicit placeholder keeps the
# notebook runnable until the real source is filled in.
dev_df = None

In [109]:
# BM25 over the MS MARCO queries and passages. Uses `merged_col` (not a repeated
# "merged_all" literal) so the column matches the one given to merge_columns.
# Unlike the other sections, this call does not pass reindex=True.
compute_BM25(corpus_df, query_df, merged_col, dataset)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


KeyboardInterrupt: 

# SQuAD

In [4]:
# SQuAD section settings (note that `path` already ends with a slash).
path = "/lfs/1/sahaana/enrichment/data/SQuAD/"
dataset = 'SQuAD'
# Arguments forwarded to merge_columns in the cells below.
merged_col, separator = "merged_all", "[SEP]"

## Sentence

In [9]:
# SQuAD train split, sentence corpus: merge all columns of train_tableA
# (queries) and train_tableB_sent (corpus) into `merged_col`, then run BM25.
query_path = f"{path}/train_tableA_processed.pkl"
corpus_path = f"{path}/train_tableB_sent_processed.pkl"

query_df = pd.read_pickle(f"{path}/train_tableA.pkl")
corpus_df = pd.read_pickle(f"{path}/train_tableB_sent.pkl")

query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass `merged_col` instead of a hard-coded "merged_all" so BM25 always reads
# the column merge_columns was asked to create.
bm25_sent = compute_BM25(corpus_df, query_df, merged_col, "SQuAD_sent", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved SQuAD_sent


In [10]:
# Put the query ids in front of the BM25 result matrix as column 0, turn that
# column into the index, pickle the frame, and display it.
combined_bm25_sent = pd.DataFrame(
    np.hstack([np.array(query_df.index).reshape(-1, 1), bm25_sent[2]])
).set_index(0)
combined_bm25_sent.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/SQuAD_sent_argsort_indices.pkl')
combined_bm25_sent

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,60170,60171,60172,60173,60174,60175,60176,60177,60178,60179
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,204639,36501,38733,111237,70098,128885,215939,144864,179022,102518,...,135573,94037,127893,53939,19016,76127,38408,67168,54978,148554
1,38733,111237,102518,159393,150200,137107,65083,128885,127550,215939,...,94037,222620,216915,225200,594,1259,121314,57127,6879,84126
2,73705,115181,131688,58366,86138,97218,120773,39722,127817,183022,...,98,205,195,2,96,353,546,199,47,9
3,143023,137615,190980,103642,165099,149909,35231,59913,215413,159561,...,594,122491,22690,153055,66141,56315,21437,174196,50920,136415
4,38733,111237,215939,144864,102518,126554,143790,137107,135964,137762,...,65451,190821,29961,177183,96236,64792,14287,89023,60910,32761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86816,204639,36501,38733,111237,70098,144864,179022,102518,126554,137107,...,14287,22690,101804,92710,24604,136415,52117,116959,228265,227698
86817,36501,38733,111237,215939,144864,179022,102518,126554,143790,137107,...,71826,174102,16516,94037,107773,107318,107168,107914,107765,124240
86818,36501,38733,111237,126554,137107,127550,137762,174332,209485,205350,...,195276,70103,55205,228045,58515,161535,200544,227782,97071,227509
86819,36501,38733,111237,70098,144864,179022,102518,143790,137107,150200,...,37711,228224,227713,19311,227491,116959,228250,192318,65451,228265


In [62]:
# SQuAD dev split, sentence corpus: merge all columns of dev_tableA (queries)
# and dev_tableB_sent (corpus) into `merged_col`, then run BM25.
query_path = f"{path}/dev_tableA_processed.pkl"
corpus_path = f"{path}/dev_tableB_sent_processed.pkl"

query_df = pd.read_pickle(f"{path}/dev_tableA.pkl")
corpus_df = pd.read_pickle(f"{path}/dev_tableB_sent.pkl")

query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass `merged_col` instead of a hard-coded "merged_all" so BM25 always reads
# the column merge_columns was asked to create.
bm25_sent = compute_BM25(corpus_df, query_df, merged_col, "dev_SQuAD_sent", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_SQuAD_sent


In [63]:
# Dev split: query ids become column 0 of the BM25 result matrix and then the
# index; the frame is pickled and displayed.
combined_bm25_sent = pd.DataFrame(
    np.hstack([np.array(query_df.index).reshape(-1, 1), bm25_sent[2]])
).set_index(0)
combined_bm25_sent.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/dev_SQuAD_sent_argsort_indices.pkl')
combined_bm25_sent

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,4361,4362,4363,4364,4365,4366,4367,4368,4369,4370
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
86821,237176,238497,239249,230110,239261,239264,243056,232665,236458,236663,...,237096,231001,241596,234148,230978,241573,229354,241485,229107,229361
86822,237176,237147,237936,231567,234542,237115,234547,238148,237888,237881,...,241482,241511,229109,241573,229361,228956,229041,241485,229021,228945
86823,237176,236663,237147,237936,237862,231567,234542,231829,241862,234547,...,237764,241840,237774,229361,241573,238245,241485,228983,241478,228990
86824,237176,236663,237147,237888,237936,237862,234542,231567,231829,244164,...,236223,241478,241482,241511,236180,229361,241573,241485,231448,228990
86825,237176,236663,237147,237888,237936,237862,231567,234542,231829,244164,...,228956,229493,228951,239020,231849,241511,234916,229109,231167,240267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92744,237176,236663,237147,237888,244164,241738,237788,237115,234547,238148,...,233482,239278,235542,240258,229493,229703,244210,243266,240842,244298
92745,237176,236663,237147,237888,231567,244164,241738,237788,236519,240386,...,241511,236081,244102,229361,244298,243875,240409,241485,241573,229493
92746,237176,236519,240386,237201,240851,234175,229580,236663,234497,237033,...,229493,231766,229978,231972,242867,231771,240409,244130,244298,244300
92747,237176,236663,230099,232529,237467,238557,233805,239835,230036,236519,...,242293,243761,243783,242934,237810,242867,240842,244300,244102,244304


## Paragraph

In [5]:
# SQuAD train split, paragraph corpus: merge all columns of train_tableA
# (queries) and train_tableB_para (corpus) into `merged_col`, then run BM25.
query_path = f"{path}/train_tableA_processed.pkl"
corpus_path = f"{path}/train_tableB_para_processed.pkl"

query_df = pd.read_pickle(f"{path}/train_tableA.pkl")
corpus_df = pd.read_pickle(f"{path}/train_tableB_para.pkl")

query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass `merged_col` instead of a hard-coded "merged_all" so BM25 always reads
# the column merge_columns was asked to create.
bm25_para = compute_BM25(corpus_df, query_df, merged_col, "SQuAD_para", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved SQuAD_para


In [6]:
# Put the query ids in front of the paragraph-level BM25 result matrix as
# column 0, turn that column into the index, pickle the frame, and display it.
combined_bm25_para = pd.DataFrame(
    np.hstack([np.array(query_df.index).reshape(-1, 1), bm25_para[2]])
).set_index(0)
combined_bm25_para.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/SQuAD_para_argsort_indices.pkl')
combined_bm25_para

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,18868,18869,18870,18871,18872,18873,18874,18875,18876,18877
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12251,351,15189,18910,10178,4137,5399,8292,8602,17425,...,11323,5977,3921,5728,848,13777,1295,10808,15710,6025
1,16540,6801,765,18664,2383,5808,18877,15263,14172,14883,...,11248,17503,9418,4004,4435,7069,18760,10274,9371,9402
2,11485,7435,7415,11040,14697,18880,11406,12043,5876,6177,...,3685,16778,7,6,11250,0,11,1,8,36
3,15265,11456,7155,15871,11485,11513,8194,6545,11268,7415,...,466,4044,3003,7635,14679,5517,13151,2969,3921,3842
4,5399,13426,9821,195,16505,12394,10367,17130,9213,18910,...,951,3381,6462,10744,16409,5728,1871,3921,1627,16778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86816,17425,9821,1170,16505,13529,10367,1169,8597,9213,7097,...,11780,18968,18978,9193,18940,18979,5349,18945,3921,18938
86817,351,18910,12252,10178,4137,6853,5399,8292,6112,17425,...,3921,795,16061,10836,9500,5722,10844,14946,1576,9559
86818,9821,33,9000,7588,6220,308,11906,17279,17435,6576,...,18926,18933,18945,18925,18923,11479,18922,18935,18936,18938
86819,10890,5399,8602,17425,14448,9821,1150,195,12394,6296,...,18932,18934,18976,18930,18979,18968,18978,18940,18945,18939


In [64]:
# SQuAD dev split, paragraph corpus: merge all columns of dev_tableA (queries)
# and dev_tableB_para (corpus) into `merged_col`, then run BM25.
query_path = f"{path}/dev_tableA_processed.pkl"
corpus_path = f"{path}/dev_tableB_para_processed.pkl"

query_df = pd.read_pickle(f"{path}/dev_tableA.pkl")
corpus_df = pd.read_pickle(f"{path}/dev_tableB_para.pkl")

query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass `merged_col` instead of a hard-coded "merged_all" so BM25 always reads
# the column merge_columns was asked to create.
bm25_para = compute_BM25(corpus_df, query_df, merged_col, "dev_SQuAD_para", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_SQuAD_para


In [65]:
# Dev split: query ids become column 0 of the paragraph-level BM25 result
# matrix and then the index; the frame is pickled and displayed.
combined_bm25_para = pd.DataFrame(
    np.hstack([np.array(query_df.index).reshape(-1, 1), bm25_para[2]])
).set_index(0)
combined_bm25_para.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/dev_SQuAD_para_argsort_indices.pkl')
combined_bm25_para

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1195,1196,1197,1198,1199,1200,1201,1202,1203,1204
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
86821,19046,19815,19801,19704,20191,19820,19897,19369,19805,19053,...,19609,20074,20021,19742,19049,19108,19077,19244,20015,19076
86822,19684,19685,19486,19431,19148,19683,19675,19112,19434,20062,...,19049,20021,19043,19053,19045,19058,19041,20015,19035,19042
86823,19684,19685,19112,19088,19148,19736,19486,19431,20054,19683,...,19722,19052,19508,20014,19037,20052,19964,20015,19040,19039
86824,19684,19685,19112,19736,19486,19431,19088,19148,19683,19170,...,20021,19108,19609,19039,20014,19037,19052,19272,20015,19040
86825,19684,19685,19112,19736,19486,19431,19088,19148,20054,19683,...,19051,19042,19260,20015,19307,19035,20075,19231,19913,19068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92744,19684,19685,19736,19683,19486,19148,19112,19571,20054,19487,...,20228,19831,19076,20015,19733,19958,19631,19116,20063,20238
92745,19684,19685,19200,19728,19479,19088,19683,19478,19778,20033,...,19609,20218,19076,19116,19108,20021,19077,19089,20015,20238
92746,19684,19685,19148,19359,19683,19431,19173,19193,19361,19787,...,20234,19108,19312,19316,20015,20123,19076,20220,20218,20238
92747,19684,19685,19359,19481,19153,19361,19128,19159,19148,19121,...,19076,20195,20223,20234,20087,20217,20203,19925,20218,20238


# IMDB Wiki Movie

In [13]:
# IMDB / Wiki section settings.
path = "/lfs/1/sahaana/enrichment/data/imdb_wiki"
# Arguments forwarded to merge_columns in the cells below.
merged_col, separator = "merged_all", "[SEP]"

In [9]:
# IMDB/Wiki train split: merge all columns of imdb_train (queries) and
# wiki_train (corpus) into `merged_col`, then run BM25.
query_path = f"{path}/train_tableA_processed.pkl"
corpus_path = f"{path}/train_tableB_processed.pkl"

query_df = pd.read_pickle(f"{path}/imdb_train.pkl")
corpus_df = pd.read_pickle(f"{path}/wiki_train.pkl")

query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass `merged_col` instead of a hard-coded "merged_all" so BM25 always reads
# the column merge_columns was asked to create.
bm25 = compute_BM25(corpus_df, query_df, merged_col, "imdb_wiki", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved imdb_wiki


In [11]:
# Put the query ids in front of the BM25 result matrix as column 0, turn that
# column into the index, pickle the frame, and display it.
combined_bm25 = pd.DataFrame(
    np.hstack([np.array(query_df.index).reshape(-1, 1), bm25[2]])
).set_index(0)
combined_bm25.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/imdb_wiki_argsort_indices.pkl')
combined_bm25

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,38241,38242,38243,38244,38245,38246,38247,38248,38249,38250
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt2825120,Q7710892,Q5537686,Q1216269,Q3469484,Q7968392,Q632362,Q36479,Q5307057,Q20649432,Q21584247,...,Q14639047,Q6544727,Q357086,Q16249578,Q7059878,Q4633000,Q3498669,Q18538354,Q498218,Q17112644
tt0403455,Q7710892,Q3469484,Q7968392,Q5537686,Q20649432,Q632362,Q1216269,Q5307057,Q36479,Q162182,...,Q498218,Q33520346,Q2298281,Q20814728,Q17112644,Q28496667,Q21428189,Q895137,Q7750926,Q2120775
tt0418460,Q7710892,Q5537686,Q1216269,Q3469484,Q7968392,Q20649432,Q632362,Q5307057,Q21584247,Q162182,...,Q7421007,Q5880282,Q2620294,Q47460729,Q4241485,Q4633000,Q498218,Q17112644,Q7750926,Q3413700
tt0477139,Q5537686,Q1216269,Q7710892,Q3469484,Q7968392,Q20649432,Q632362,Q5307057,Q36479,Q684150,...,Q4633000,Q7059878,Q7317889,Q498218,Q17112644,Q33520346,Q165685,Q18538354,Q2002073,Q7750926
tt0189630,Q7710892,Q5537686,Q1216269,Q3469484,Q7968392,Q20649432,Q632362,Q36479,Q5307057,Q21584247,...,Q357086,Q3843872,Q7059878,Q18538354,Q16249578,Q7750926,Q4633000,Q498218,Q17112644,Q3191800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt0154420,Q7710892,Q5537686,Q3469484,Q7968392,Q632362,Q18914954,Q1216269,Q162182,Q401976,Q18464203,...,Q841958,Q357086,Q16249578,Q4633000,Q18538354,Q17112644,Q7750926,Q7059878,Q1103638,Q498218
tt9358106,Q7710892,Q5537686,Q1216269,Q3469484,Q7968392,Q632362,Q36479,Q5307057,Q21584247,Q162182,...,Q58030445,Q21561875,Q2620294,Q4633000,Q3966479,Q17112644,Q18538354,Q498218,Q7750926,Q64768058
tt6284256,Q7710892,Q5537686,Q7968392,Q1216269,Q3469484,Q632362,Q20649432,Q36479,Q5307057,Q212965,...,Q21428189,Q2620294,Q47460729,Q7059878,Q498218,Q18538354,Q77733171,Q4633000,Q17112644,Q7750926
tt0997274,Q7710892,Q5537686,Q1216269,Q3469484,Q7968392,Q20649432,Q632362,Q684150,Q18914954,Q107940,...,Q20814728,Q4838270,Q33520346,Q7059878,Q17112644,Q18538354,Q4633000,Q21428189,Q7750926,Q498218


In [29]:
# IMDB/Wiki dev run: the inputs are the imdb_test / wiki_test pickles, while
# the processed outputs and the BM25 name use the "dev" prefix.
query_path = f"{path}/dev_tableA_processed.pkl"
corpus_path = f"{path}/dev_tableB_processed.pkl"

query_df = pd.read_pickle(f"{path}/imdb_test.pkl")
corpus_df = pd.read_pickle(f"{path}/wiki_test.pkl")

query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass `merged_col` instead of a hard-coded "merged_all" so BM25 always reads
# the column merge_columns was asked to create.
bm25_dev = compute_BM25(corpus_df, query_df, merged_col, "dev_imdb_wiki", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_imdb_wiki


In [31]:
# Dev run: query ids become column 0 of the BM25 result matrix and then the
# index; the frame is pickled and displayed.
combined_bm25_dev = pd.DataFrame(
    np.hstack([np.array(query_df.index).reshape(-1, 1), bm25_dev[2]])
).set_index(0)
combined_bm25_dev.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/dev_imdb_wiki_argsort_indices.pkl')
combined_bm25_dev

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,9554,9555,9556,9557,9558,9559,9560,9561,9562,9563
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0038427,Q91540,Q7737002,Q2826044,Q5373107,Q3520498,Q18488691,Q210756,Q55973412,Q834327,Q20762668,...,Q1138132,Q24053292,Q3283616,Q5432300,Q2383025,Q2582903,Q9379924,Q3827406,Q5242981,Q547323
tt1613092,Q7737002,Q91540,Q1357820,Q2826044,Q5500522,Q18488691,Q182212,Q746733,Q162997,Q1402152,...,Q3032438,Q3284333,Q4657248,Q56561552,Q5652429,Q5652897,Q20022628,Q28682695,Q89400860,Q5242981
tt1327820,Q7737002,Q91540,Q1357820,Q5500522,Q3418150,Q18488691,Q182212,Q746733,Q21001674,Q2826044,...,Q16254103,Q675937,Q8076592,Q89400860,Q29404015,Q15052313,Q1406079,Q6937761,Q5242981,Q992227
tt5462602,Q91540,Q1357820,Q5373107,Q2115481,Q2826044,Q18488691,Q14878580,Q19363674,Q6690192,Q222041,...,Q3032438,Q4290698,Q3223206,Q185776,Q2281513,Q56561552,Q20022628,Q89400860,Q5242981,Q24901880
tt0047167,Q91540,Q7737002,Q2826044,Q179673,Q5500522,Q162997,Q18488691,Q21001674,Q182212,Q246283,...,Q4657248,Q6948746,Q1188701,Q3053644,Q7044342,Q7637494,Q1764347,Q5242981,Q3961762,Q1091180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt0060865,Q7737002,Q2826044,Q91540,Q1357820,Q21001674,Q3418150,Q18488691,Q179673,Q746733,Q5500522,...,Q7162359,Q18331149,Q2902275,Q675937,Q8076592,Q15052313,Q1406079,Q323820,Q6937761,Q5242981
tt0081111,Q7737002,Q91540,Q1357820,Q3418150,Q18488691,Q162997,Q179673,Q746733,Q2826044,Q5500522,...,Q18703894,Q24055688,Q21427163,Q5432300,Q3389586,Q29652900,Q5242981,Q14323819,Q6748794,Q3961762
tt0410400,Q91540,Q21001674,Q3520498,Q7737002,Q1357820,Q3418150,Q465646,Q2296376,Q18488691,Q246283,...,Q3280348,Q42871878,Q7091335,Q3284333,Q3032438,Q56561552,Q20022628,Q89400860,Q3549360,Q5242981
tt0107611,Q7737002,Q1357820,Q2826044,Q5500522,Q3418150,Q18488691,Q91540,Q21001674,Q222023,Q23781155,...,Q3226141,Q3201912,Q250628,Q165156,Q12126323,Q10332948,Q3961762,Q56561552,Q5242981,Q2525108


## Harder Dev

In [39]:
# Settings for the "Harder Dev" IMDB / Wiki run (same values as the IMDB Wiki
# Movie section; repeated so this section can be run on its own).
path = "/lfs/1/sahaana/enrichment/data/imdb_wiki"
merged_col, separator = "merged_all", "[SEP]"

In [40]:
# HARD IMDB/Wiki train split: merge all columns of HARD_imdb_train (queries)
# and HARD_wiki_train (corpus) into `merged_col`, then run BM25.
query_path = f"{path}/HARD_train_tableA_processed.pkl"
corpus_path = f"{path}/HARD_train_tableB_processed.pkl"

query_df = pd.read_pickle(f"{path}/HARD_imdb_train.pkl")
corpus_df = pd.read_pickle(f"{path}/HARD_wiki_train.pkl")

query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass `merged_col` instead of a hard-coded "merged_all" so BM25 always reads
# the column merge_columns was asked to create.
bm25 = compute_BM25(corpus_df, query_df, merged_col, "HARD_imdb_wiki", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved HARD_imdb_wiki


In [41]:
# HARD train split: query ids become column 0 of the BM25 result matrix and
# then the index; the frame is pickled and displayed.
combined_bm25 = pd.DataFrame(
    np.hstack([np.array(query_df.index).reshape(-1, 1), bm25[2]])
).set_index(0)
combined_bm25.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/HARD_imdb_wiki_argsort_indices.pkl')
combined_bm25

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,43022,43023,43024,43025,43026,43027,43028,43029,43030,43031
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0100990,Q7710892,Q5537686,Q1216269,Q91540,Q3469484,Q7968392,Q36479,Q21584247,Q632362,Q2414630,...,Q56887198,Q5577149,Q5968383,Q7564768,Q776472,Q11107704,Q498218,Q7750926,Q5242981,Q6170075
tt0179757,Q7710892,Q5537686,Q20649432,Q21584247,Q7968392,Q21001674,Q91540,Q3520498,Q10298666,Q221236,...,Q17112644,Q12124710,Q3498669,Q357086,Q7059878,Q4532625,Q16249578,Q498218,Q5242981,Q3020754
tt0072351,Q7710892,Q1216269,Q3469484,Q7968392,Q36479,Q20649432,Q632362,Q511591,Q5307057,Q1357820,...,Q5356859,Q4633000,Q498218,Q3498669,Q3474027,Q17112644,Q7899422,Q7750926,Q5242981,Q7899447
tt2331047,Q7710892,Q5537686,Q7968392,Q3469484,Q1216269,Q20649432,Q632362,Q91540,Q107940,Q5307057,...,Q4838270,Q7059878,Q28180481,Q33520346,Q89400860,Q21428189,Q498218,Q7750926,Q28101385,Q5242981
tt1810710,Q7710892,Q5537686,Q1216269,Q7968392,Q3469484,Q632362,Q171048,Q20649432,Q18464203,Q511591,...,Q12129243,Q8773893,Q4633000,Q7750926,Q6946380,Q17112644,Q7728995,Q1428462,Q5242981,Q6946539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt0047192,Q7710892,Q3469484,Q7968392,Q5537686,Q1216269,Q20649432,Q511591,Q5307057,Q632362,Q21584247,...,Q357086,Q16249578,Q7059878,Q4241485,Q498218,Q5242981,Q4633000,Q7750926,Q17112644,Q6675874
tt0073442,Q7710892,Q5537686,Q7968392,Q3469484,Q1216269,Q632362,Q20649432,Q5307057,Q36479,Q162182,...,Q1546727,Q16249578,Q7059878,Q4633000,Q357086,Q17112644,Q1964207,Q5242981,Q498218,Q4315419
tt0365957,Q7710892,Q1216269,Q7968392,Q5537686,Q91540,Q20649432,Q632362,Q36479,Q3469484,Q1357820,...,Q3498669,Q357086,Q934144,Q33520346,Q16249578,Q498218,Q7059878,Q7750926,Q5242981,Q1754386
tt1922612,Q7710892,Q5537686,Q18914954,Q3469484,Q7968392,Q91540,Q5307057,Q26720927,Q913078,Q162182,...,Q952217,Q4633000,Q16249038,Q3521338,Q17112644,Q498218,Q1028548,Q7750926,Q5242981,Q1186557


In [42]:
# Merge the columns of the HARD IMDb (query) and Wiki (corpus) tables and run BM25 over them.
# NOTE(review): the outputs are labelled "dev" while the inputs are the *_test.pkl files -
# confirm this naming is intentional.
query_path = f"{path}/HARD_dev_tableA_processed.pkl"
corpus_path = f"{path}/HARD_dev_tableB_processed.pkl"

query_df = pd.read_pickle(f"{path}/HARD_imdb_test.pkl")
corpus_df = pd.read_pickle(f"{path}/HARD_wiki_test.pkl")

# Every column of each table is handed to merge_columns.
query_columns = list(query_df.columns)
corpus_columns = list(corpus_df.columns)

# merge_columns receives the target column name, an output path and the separator.
query_df = merge_columns(query_df, query_columns, merged_col, query_path, separator)
corpus_df = merge_columns(corpus_df, corpus_columns, merged_col, corpus_path, separator)

# Pass the same `merged_col` that merge_columns was given, rather than a second
# hard-coded copy of the name, so the two steps cannot drift apart.
bm25_dev = compute_BM25(corpus_df, query_df, merged_col, "HARD_dev_imdb_wiki", reindex=True)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved HARD_dev_imdb_wiki


In [43]:
# Put every query's index value in column 0, next to its row of bm25_dev[2], then
# promote that column to the DataFrame index before pickling.
# NOTE(review): bm25_dev[2] looks like the per-query ranked corpus ids - confirm against compute_BM25.
query_ids = np.array(query_df.index)[:, None]
combined_bm25_dev = pd.DataFrame(np.hstack([query_ids, bm25_dev[2]])).set_index(0)
combined_bm25_dev.to_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/HARD_dev_imdb_wiki_argsort_indices.pkl')
combined_bm25_dev

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,4773,4774,4775,4776,4777,4778,4779,4780,4781,4782
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0144618,Q7737002,Q1092278,Q4682732,Q3418150,Q864592,Q22251894,Q1192981,Q170564,Q155653,Q189505,...,Q3213829,Q16838897,Q47063628,Q1991730,Q42947819,Q4656804,Q7763656,Q6373380,Q476133,Q278560
tt0048281,Q4682732,Q7737002,Q3418150,Q1092278,Q864592,Q320588,Q232009,Q170564,Q1192981,Q743203,...,Q1770003,Q2162060,Q47460729,Q776934,Q1056797,Q2298281,Q679429,Q6544727,Q18538354,Q794306
tt0074236,Q1192981,Q4682732,Q7737002,Q1092278,Q27959891,Q3418150,Q541707,Q7750035,Q189054,Q191753,...,Q23013778,Q485336,Q14948585,Q5042500,Q36491820,Q679429,Q3481952,Q18538354,Q3821751,Q1853197
tt0090793,Q7737002,Q3418150,Q1192981,Q1092278,Q4682732,Q170564,Q320588,Q232009,Q622380,Q156497,...,Q6544727,Q3761844,Q3481952,Q232053,Q7760394,Q18538354,Q14477212,Q1613381,Q47460729,Q44408
tt0144294,Q7737002,Q170564,Q4682732,Q182254,Q3418150,Q235347,Q622380,Q164963,Q156497,Q7750035,...,Q33101621,Q1590837,Q36092,Q6654640,Q6899887,Q3210537,Q18538354,Q641668,Q2731768,Q47460729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt0262699,Q7737002,Q3418150,Q1092278,Q170564,Q1192981,Q7750035,Q320588,Q156497,Q235347,Q232009,...,Q617969,Q2298281,Q6544727,Q3392009,Q2308797,Q2413394,Q18538354,Q679429,Q3821751,Q3225798
tt0082405,Q7737002,Q4682732,Q170564,Q1192981,Q1092278,Q864592,Q743203,Q191753,Q156497,Q4920204,...,Q7735006,Q2511284,Q18538354,Q167022,Q4877068,Q7720977,Q8025669,Q663294,Q3821751,Q1757188
tt0116756,Q7737002,Q3418150,Q1192981,Q232009,Q622380,Q156497,Q186341,Q7752371,Q170564,Q182254,...,Q7245851,Q3285122,Q2511329,Q3989054,Q47460729,Q2298281,Q679429,Q6544727,Q18538354,Q3137753
tt2263944,Q7737002,Q1092278,Q1192981,Q3418150,Q4682732,Q232009,Q182254,Q7752371,Q320588,Q186341,...,Q6544727,Q4468674,Q43303207,Q11180478,Q3110692,Q47460729,Q3481952,Q132471,Q18538354,Q1150595


# Blocked BM25 (tables already processed)

In [4]:
# Root directory; each dataset below lives in its own sub-directory of it.
path = "/lfs/1/sahaana/enrichment/data/dm_blocked"
# Merged-text column name and field separator, as passed to merge_columns
# earlier in the notebook.
merged_col = "merged_all"
separator = "[SEP]"

# Dataset directory names keyed by their position (0..12).
datasets = dict(enumerate([
    "blocked_abt_buy_exp_data",
    "blocked_amazon_google_exp_data",
    "blocked_beer_exp_data",
    "blocked_company_exp_data",
    "blocked_dblp_acm_exp_data",
    "blocked_dblp_scholar_exp_data",
    "blocked_dirty_dblp_acm_exp_data",
    "blocked_dirty_dblp_scholar_exp_data",
    "blocked_dirty_itunes_amazon_exp_data",
    "blocked_dirty_walmart_amazon_exp_data",
    "blocked_fodors_zagat_exp_data",
    "blocked_itunes_amazon_exp_data",
    "blocked_walmart_amazon_exp_data",
]))

## BM25 over the train tables (loaded from already-processed pickles, so no merge step here)
for d in datasets:
    # Locations of the processed query (tableA) and corpus (tableB) pickles.
    query_path = f"{path}/{datasets[d]}/train_tableA_processed.pkl"
    corpus_path = f"{path}/{datasets[d]}/train_tableB_processed.pkl"
    query_df, corpus_df = pd.read_pickle(query_path), pd.read_pickle(corpus_path)

    bm25 = compute_BM25(corpus_df, query_df, "merged_all", datasets[d], reindex=True)

    # Column 0 holds the query index values; the remaining columns are bm25[2]
    # (presumably the ranked corpus ids - confirm against compute_BM25).
    query_ids = np.array(query_df.index)[:, None]
    combined_bm25 = pd.DataFrame(np.hstack([query_ids, bm25[2]])).set_index(0)
    combined_bm25.to_pickle(f'/lfs/1/sahaana/enrichment/data/Okapi25Queries/{datasets[d]}_argsort_indices.pkl')

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved blocked_abt_buy_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved blocked_amazon_google_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved blocked_beer_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved blocked_company_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved blocked_dblp_acm_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main proc

In [5]:
# Display the table still bound after the loop above - presumably the one built
# for the last entry of `datasets`; pandas elides the middle rows/columns.
combined_bm25

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,566,567,568,569,570,571,572,573,574,575
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2050,4787,2578,7145,4576,5479,994,4783,3626,15462,3273,...,7445,3254,8175,3566,18698,19119,10273,6362,843,21912
2052,7445,9858,13173,4576,4783,15462,3273,14384,13251,11884,...,11211,10923,18372,16173,8391,15247,5560,18620,5842,6286
5,7445,430,4984,6601,4787,19026,3273,1249,2578,7145,...,2750,14869,16791,6808,18980,4900,19119,4362,485,20932
7,4787,6601,7445,7145,2578,13173,5479,4576,9858,15462,...,4984,4162,3254,691,8175,6970,6872,6572,3566,1303
2061,7445,4576,9858,4783,15462,3273,14384,13251,11884,10905,...,2593,11953,3254,20805,7824,8175,3566,20108,11537,352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2028,7445,430,4984,19026,4787,3273,2578,7145,5479,9858,...,11350,8455,1120,9097,713,15426,5991,12024,10048,12891
2030,7445,430,19026,3273,4984,1249,4787,13173,4576,994,...,18727,19119,16634,17534,1354,1903,11759,20843,20844,13167
2037,4787,6601,7445,7145,2578,5479,4783,15462,3626,14384,...,4759,15071,4424,1274,5207,884,10481,12995,8519,21543
2039,430,4984,4787,19026,3273,6601,7145,2578,1249,9858,...,5415,20212,11440,17687,11428,11438,20411,15415,3471,7443


In [6]:
# BM25 for the dev set: reads the test_* processed tables and prefixes every
# output name with "dev_".
for d in datasets:
    query_df = pd.read_pickle(f"{path}/{datasets[d]}/test_tableA_processed.pkl")
    corpus_df = pd.read_pickle(f"{path}/{datasets[d]}/test_tableB_processed.pkl")

    # The tables are already processed, so no column lists are needed here
    # (the unused query_columns / corpus_columns assignments were dropped).
    bm25 = compute_BM25(corpus_df, query_df, "merged_all", "dev_" + datasets[d], reindex=True)

    # Column 0 holds the query index values; the remaining columns are bm25[2]
    # (presumably the ranked corpus ids - confirm against compute_BM25).
    combined_bm25 = pd.DataFrame(np.hstack([np.array(query_df.index)[:, None], bm25[2]]))
    combined_bm25 = combined_bm25.set_index(0)
    pd.to_pickle(combined_bm25, f'/lfs/1/sahaana/enrichment/data/Okapi25Queries/dev_{datasets[d]}_argsort_indices.pkl')

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_blocked_abt_buy_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_blocked_amazon_google_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_blocked_beer_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_blocked_company_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_blocked_dblp_acm_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data b

In [7]:
# Display the dev table still bound after the loop above - presumably the one
# built for the last entry of `datasets`; pandas elides the middle rows/columns.
combined_bm25

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,184,185,186,187,188,189,190,191,192,193
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1024,8070,10507,7630,12647,10206,17810,6998,13767,13174,17103,...,11625,13583,10874,2645,19753,13845,16059,9890,8386,12674
6,10507,10206,7630,13767,13174,10558,12582,13282,16298,2645,...,11143,17810,4637,5013,6990,14916,19592,6974,13023,9015
2054,9890,8070,10507,10206,12647,2671,13767,16832,17810,6998,...,7630,12093,14873,19353,21373,19006,5577,2438,21192,7601
518,10507,7630,12647,10206,13767,10558,13174,12582,2522,3155,...,2671,577,18464,11625,12700,13583,11223,2645,10874,2138
523,8070,10507,12647,10206,13767,2671,2522,12582,3155,13282,...,17494,15891,13318,594,20070,7601,11143,7637,20443,7630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011,9890,10507,10206,12647,6998,16832,17810,13767,18197,8500,...,11435,13422,21483,6990,15278,6828,21196,13677,2645,13413
1010,8070,10507,10206,17810,3155,17483,10813,8500,12582,17103,...,18626,21976,8739,2671,2668,1891,15015,9085,1090,15003
502,9890,10507,12647,7630,10206,13767,2671,10558,12582,3155,...,11683,594,7601,11143,13413,2645,13677,11625,13422,11435
1014,9890,10507,7630,12647,10206,13767,10558,13174,12582,3155,...,12700,11214,21942,1375,19353,594,11143,7601,2805,7028


# Blocked, all-joined BM25 (tables already processed)

In [4]:
# Root directory; each dataset below lives in its own sub-directory of it.
path = "/lfs/1/sahaana/enrichment/data/dm_blocked"
# Merged-text column name and field separator, as passed to merge_columns
# earlier in the notebook.
merged_col = "merged_all"
separator = "[SEP]"

# Dataset directory names keyed by their position (0..12).
datasets = dict(enumerate([
    "joined_abt_buy_exp_data",
    "joined_amazon_google_exp_data",
    "joined_beer_exp_data",
    "joined_company_exp_data",
    "joined_dblp_acm_exp_data",
    "joined_dblp_scholar_exp_data",
    "joined_dirty_dblp_acm_exp_data",
    "joined_dirty_dblp_scholar_exp_data",
    "joined_dirty_itunes_amazon_exp_data",
    "joined_dirty_walmart_amazon_exp_data",
    "joined_fodors_zagat_exp_data",
    "joined_itunes_amazon_exp_data",
    "joined_walmart_amazon_exp_data",
]))

## BM25 over the joined tables (loaded from already-processed pickles, so no merge step here)
for d in datasets:
    # Locations of the processed query (tableA) and corpus (tableB) pickles.
    query_path = f"{path}/{datasets[d]}/tableA_processed.pkl"
    corpus_path = f"{path}/{datasets[d]}/tableB_processed.pkl"
    query_df, corpus_df = pd.read_pickle(query_path), pd.read_pickle(corpus_path)

    bm25 = compute_BM25(corpus_df, query_df, "merged_all", datasets[d], reindex=True)

    # Column 0 holds the query index values; the remaining columns are bm25[2]
    # (presumably the ranked corpus ids - confirm against compute_BM25).
    query_ids = np.array(query_df.index)[:, None]
    combined_bm25 = pd.DataFrame(np.hstack([query_ids, bm25[2]])).set_index(0)
    combined_bm25.to_pickle(f'/lfs/1/sahaana/enrichment/data/Okapi25Queries/{datasets[d]}_argsort_indices.pkl')

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved joined_abt_buy_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved joined_amazon_google_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved joined_beer_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved joined_company_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved joined_dblp_acm_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process a

In [5]:
# Display the table still bound after the loop above - presumably the one built
# for the last entry of `datasets`; pandas elides the middle rows/columns.
combined_bm25

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,22065,22066,22067,22068,22069,22070,22071,22072,22073,22074
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,7071,2079,14095,4114,9670,14009,3773,10992,14094,4398,...,7201,5165,19334,15121,11421,11188,16614,11420,18469,15155
1,4114,7071,2079,3773,14095,9670,14009,15838,14094,11908,...,13160,11136,1177,13161,3172,3878,17828,7147,16447,16720
2,7071,4114,3773,2079,14095,9670,4398,15838,2538,10992,...,15625,16798,18029,18322,18321,17376,18949,16496,18028,16495
3,7071,2079,14095,4114,9670,14009,3773,14094,10992,4398,...,1241,4376,7210,16404,13216,13214,12449,15161,21424,4378
4,7071,2079,14095,4114,3773,9670,14009,10992,14094,4398,...,6171,2472,8300,201,19758,22064,22048,10084,21159,9809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2549,7071,4114,3773,2079,14095,9670,14009,10992,14094,15838,...,14193,17704,20960,18325,16808,18036,19119,14360,7860,21539
2550,7071,2079,3773,14095,4114,9670,14009,10992,14094,4398,...,1463,4943,7868,4946,15838,20213,21389,70,3025,15807
2551,4114,2079,3773,7071,9670,14009,14095,11908,2538,4441,...,4396,15584,14448,6489,10429,10414,5061,4397,4394,21869
2552,7071,2079,4114,9670,14095,14009,4398,3773,11908,20267,...,8291,379,8304,18623,10281,2325,14222,15224,2320,8306


In [6]:
# BM25 for the dev set: every output name is prefixed with "dev_".
# NOTE(review): this loop reads the same tableA_processed.pkl / tableB_processed.pkl
# files as the non-dev loop earlier in this section, so the dev_ outputs appear to
# duplicate those results - confirm whether separate dev/test tables were intended.
for d in datasets:
    query_df = pd.read_pickle(f"{path}/{datasets[d]}/tableA_processed.pkl")
    corpus_df = pd.read_pickle(f"{path}/{datasets[d]}/tableB_processed.pkl")

    # The tables are already processed, so no column lists are needed here
    # (the unused query_columns / corpus_columns assignments were dropped).
    bm25 = compute_BM25(corpus_df, query_df, "merged_all", "dev_" + datasets[d], reindex=True)

    # Column 0 holds the query index values; the remaining columns are bm25[2]
    # (presumably the ranked corpus ids - confirm against compute_BM25).
    combined_bm25 = pd.DataFrame(np.hstack([np.array(query_df.index)[:, None], bm25[2]]))
    combined_bm25 = combined_bm25.set_index(0)
    pd.to_pickle(combined_bm25, f'/lfs/1/sahaana/enrichment/data/Okapi25Queries/dev_{datasets[d]}_argsort_indices.pkl')

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_joined_abt_buy_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_joined_amazon_google_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_joined_beer_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_joined_company_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Saved dev_joined_dblp_acm_exp_data
INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data betwee

In [7]:
# Display the dev table still bound after the loop above - presumably the one
# built for the last entry of `datasets`; pandas elides the middle rows/columns.
combined_bm25

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,22065,22066,22067,22068,22069,22070,22071,22072,22073,22074
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,7071,2079,14095,4114,9670,14009,3773,10992,14094,4398,...,7201,5165,19334,15121,11421,11188,16614,11420,18469,15155
1,4114,7071,2079,3773,14095,9670,14009,15838,14094,11908,...,13160,11136,1177,13161,3172,3878,17828,7147,16447,16720
2,7071,4114,3773,2079,14095,9670,4398,15838,2538,10992,...,15625,16798,18029,18322,18321,17376,18949,16496,18028,16495
3,7071,2079,14095,4114,9670,14009,3773,14094,10992,4398,...,1241,4376,7210,16404,13216,13214,12449,15161,21424,4378
4,7071,2079,14095,4114,3773,9670,14009,10992,14094,4398,...,6171,2472,8300,201,19758,22064,22048,10084,21159,9809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2549,7071,4114,3773,2079,14095,9670,14009,10992,14094,15838,...,14193,17704,20960,18325,16808,18036,19119,14360,7860,21539
2550,7071,2079,3773,14095,4114,9670,14009,10992,14094,4398,...,1463,4943,7868,4946,15838,20213,21389,70,3025,15807
2551,4114,2079,3773,7071,9670,14009,14095,11908,2538,4441,...,4396,15584,14448,6489,10429,10414,5061,4397,4394,21869
2552,7071,2079,4114,9670,14095,14009,4398,3773,11908,20267,...,8291,379,8304,18623,10281,2325,14222,15224,2320,8306
