In [3]:
# Widen the notebook container to 75% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# `IPython.core.display` is a deprecated location for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [4]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict 

In [5]:
# Root directory that the data-file and config-file paths in the cells below are built from.
path_base = '/lfs/1/sahaana/enrichment/'

def save_config(config_path, config=None):
    """Write a config mapping to ``config_path`` as JSON indented by 4 spaces.

    Args:
        config_path: Destination file path; the file is created or overwritten.
        config: Mapping to serialize. When omitted, the module/notebook-level
            global named ``config`` is used, which is what this function
            always did before the parameter existed.

    Raises:
        NameError: If ``config`` is omitted and no global ``config`` exists.
        TypeError: If the mapping holds a value that JSON cannot encode.
    """
    if config is None:
        # Backward-compatible fallback to the notebook-level `config`.
        try:
            config = globals()['config']
        except KeyError:
            raise NameError("name 'config' is not defined") from None
    # Serialize before opening the file so that a failure here cannot leave an
    # existing config file truncated or half-written.
    payload = json.dumps(config, indent=4)
    with open(config_path, 'w') as fp:
        fp.write(payload)
        
def load_config(config_path):
    """Parse the JSON file at ``config_path`` and return the resulting object."""
    with open(config_path) as handle:
        return json.load(handle)

# MLM Pretraining Configs

## Base

## DBFT-cased-masked-ALL-BM25

In [None]:
# Build the config whose model_name resolves to "DBFT-cased-masked-ALL-BM25",
# write it as JSON, print the launch command, and read the file back as the
# cell's displayed result.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + 'data/imdb/final_table_nogenre.pkl',
    datapath_r=path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    bm25_argsort_path=path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='cased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,  # author's note: imdb_l, summaries_r = 9
    mlm_masking='ALL',  # author's listed options: ALL BEFORE AFTER
    # Dataset sizes and the column to read.
    num_train=40000,
    num_test=7823,
    data_column='merged_nogenre',
)
# model_name is derived from the casing and masking values stored above.
config['model_name'] = f"DBFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
# Disabled override kept for reference:
#config['model_outdir'] = '../models/DBFT-cased-masked-ALL-BM25'
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)

## DBFT-uncased-masked-ALL-BM25

In [None]:
# Build the config whose model_name resolves to "DBFT-uncased-masked-ALL-BM25",
# write it as JSON, print the launch command, and read the file back as the
# cell's displayed result.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + 'data/imdb/final_table_nogenre.pkl',
    datapath_r=path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    bm25_argsort_path=path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,  # author's note: imdb_l, summaries_r = 9
    mlm_masking='ALL',  # author's listed options: ALL BEFORE AFTER
    # Dataset sizes and the column to read.
    num_train=40000,
    num_test=7823,
    data_column='merged_nogenre',
)
# model_name is derived from the casing and masking values stored above.
config['model_name'] = f"DBFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)

## BFT-cased-masked-ALL-BM25

In [None]:
# Build the config whose model_name resolves to "BFT-cased-masked-ALL-BM25"
# (model_type 'bert'), write it as JSON, print the launch command, and read
# the file back as the cell's displayed result.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + 'data/imdb/final_table_nogenre.pkl',
    datapath_r=path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    bm25_argsort_path=path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
    # Model selection.
    model_type='bert',
    tokenizer_casing='cased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,  # author's note: imdb_l, summaries_r = 9
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
    # Dataset sizes and the column to read.
    num_train=40000,
    num_test=7823,
    data_column='merged_nogenre',
)
# model_name is derived from the casing and masking values stored above.
config['model_name'] = f"BFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
# Disabled override kept for reference (note it still names the DBFT model):
#config['model_outdir'] = '../models/DBFT-cased-masked-ALL-BM25'
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)

## BFT-uncased-masked-ALL-BM25

In [None]:
# Build the config whose model_name resolves to "BFT-uncased-masked-ALL-BM25"
# (model_type 'bert'), write it as JSON, print the launch command, and read
# the file back as the cell's displayed result.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + 'data/imdb/final_table_nogenre.pkl',
    datapath_r=path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    bm25_argsort_path=path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
    # Model selection.
    model_type='bert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,  # author's note: imdb_l, summaries_r = 9
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
    # Dataset sizes and the column to read.
    num_train=40000,
    num_test=7823,
    data_column='merged_nogenre',
)
# model_name is derived from the casing and masking values stored above.
config['model_name'] = f"BFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)

## DeepMatcher: DATA-uncased-masked-ALL-BM25

In [6]:
# DeepMatcher dataset directory names, keyed by their position (0..12).
dm_data = dict(enumerate([
    "abt_buy_exp_data",
    "amazon_google_exp_data",
    "beer_exp_data",
    "company_exp_data",
    "dblp_acm_exp_data",
    "dblp_scholar_exp_data",
    "dirty_dblp_acm_exp_data",
    "dirty_dblp_scholar_exp_data",
    "dirty_itunes_amazon_exp_data",
    "dirty_walmart_amazon_exp_data",
    "fodors_zagat_exp_data",
    "itunes_amazon_exp_data",
    "walmart_amazon_exp_data",
]))

In [30]:
# Write one MLM pretraining config per DeepMatcher dataset listed in dm_data,
# printing the config path and launch command for each.
for i in dm_data:
    config = defaultdict(dict)

    config.update(
        # Input file paths for this dataset.
        datapath_l=path_base + f'data/deepmatcher/{dm_data[i]}/tableA_processed.pkl',
        datapath_r=path_base + f'data/deepmatcher/{dm_data[i]}/tableB_processed.pkl',
        bm25_argsort_path=path_base + f'data/Okapi25Queries/{dm_data[i]}_argsort_indices.pkl',
        # Model selection.
        model_type='distilbert',
        tokenizer_casing='uncased',
        from_scratch=False,
        # Masked-language-model settings.
        mlm_probability=0.15,
        mlm_num_seps=None,  # author's note: imdb_l, summaries_r = 9
        mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
    )
    config.update(
        # num_train is len() of the object unpickled from datapath_l.
        num_train=len(pd.read_pickle(config['datapath_l'])),
        num_test=0,
        data_column='merged_all',
    )
    config['model_name'] = f"{dm_data[i]}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
    config.update(train_epochs=20, batch_size=8)

    config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
    print(
        config_path,
        f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
        sep='\n',
    )
    save_config(config_path)
    load_config(config_path)
    print()
    # NOTE(review): the loop stops after key 3 (company_exp_data), so only the
    # first four datasets get a config — confirm the other nine are meant to be skipped.
    if i == 3:
        break

/lfs/1/sahaana/enrichment/ember/pretraining/configs/abt_buy_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/abt_buy_exp_data-uncased-masked-ALL-BM25.json

/lfs/1/sahaana/enrichment/ember/pretraining/configs/amazon_google_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/amazon_google_exp_data-uncased-masked-ALL-BM25.json

/lfs/1/sahaana/enrichment/ember/pretraining/configs/beer_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/beer_exp_data-uncased-masked-ALL-BM25.json

/lfs/1/sahaana/enrichment/ember/pretraining/configs/company_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/company_exp_data-uncased-masked-ALL-BM25.json



In [34]:
# Display the pickled tableA_processed object for company_exp_data (the datapath_l file for that dataset).
pd.read_pickle('/lfs/1/sahaana/enrichment/data/deepmatcher/company_exp_data/tableA_processed.pkl')

Unnamed: 0_level_0,content,merged_all
id,Unnamed: 1_level_1,Unnamed: 2_level_1
http://www.wikidata.org/entity/Q6843249_wiki,and the outdoors type private founded 1977 hea...,[SEP] content and the outdoors type private ...
http://www.wikidata.org/entity/Q7243708_wiki,verification . please help improve this articl...,[SEP] content verification . please help imp...
http://www.wikidata.org/entity/Q4047803_wiki,filename extension . bik . bk2 type code bink ...,[SEP] content filename extension . bik . bk2...
http://www.wikidata.org/entity/Q834077_wiki,might be slanted towards recent events . pleas...,[SEP] content might be slanted towards recen...
http://www.wikidata.org/entity/Q4836856_wiki,sources . please help improve this article by ...,[SEP] content sources . please help improve ...
...,...,...
http://www.wikidata.org/entity/Q5059736_wiki,knowledge societies type private industry expe...,[SEP] content knowledge societies type priva...
http://www.wikidata.org/entity/Q5973259_wiki,the subject in a subjective manner without imp...,[SEP] content the subject in a subjective ma...
http://www.wikidata.org/entity/Q20644661_wiki,founded june 2009 headquarters cambridge canad...,[SEP] content founded june 2009 headquarters...
http://www.wikidata.org/entity/Q7394750_wiki,please help improve it or discuss these issues...,[SEP] content please help improve it or disc...


In [35]:
# Display tableA_processed for company_exp_data again; this is the same call as the preceding cell (In [34]).
pd.read_pickle('/lfs/1/sahaana/enrichment/data/deepmatcher/company_exp_data/tableA_processed.pkl')

Unnamed: 0_level_0,content,merged_all
id,Unnamed: 1_level_1,Unnamed: 2_level_1
http://www.wikidata.org/entity/Q6843249_wiki,and the outdoors type private founded 1977 hea...,[SEP] content and the outdoors type private ...
http://www.wikidata.org/entity/Q7243708_wiki,verification . please help improve this articl...,[SEP] content verification . please help imp...
http://www.wikidata.org/entity/Q4047803_wiki,filename extension . bik . bk2 type code bink ...,[SEP] content filename extension . bik . bk2...
http://www.wikidata.org/entity/Q834077_wiki,might be slanted towards recent events . pleas...,[SEP] content might be slanted towards recen...
http://www.wikidata.org/entity/Q4836856_wiki,sources . please help improve this article by ...,[SEP] content sources . please help improve ...
...,...,...
http://www.wikidata.org/entity/Q5059736_wiki,knowledge societies type private industry expe...,[SEP] content knowledge societies type priva...
http://www.wikidata.org/entity/Q5973259_wiki,the subject in a subjective manner without imp...,[SEP] content the subject in a subjective ma...
http://www.wikidata.org/entity/Q20644661_wiki,founded june 2009 headquarters cambridge canad...,[SEP] content founded june 2009 headquarters...
http://www.wikidata.org/entity/Q7394750_wiki,please help improve it or discuss these issues...,[SEP] content please help improve it or disc...


In [36]:
# Display the company_exp_data argsort-indices pickle (the bm25_argsort_path file for that dataset).
pd.read_pickle('/lfs/1/sahaana/enrichment/data/Okapi25Queries/company_exp_data_argsort_indices.pkl')

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,28191,28192,28193,28194,28195,28196,28197,28198,28199,28200
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.wikidata.org/entity/Q6843249_wiki,http://www.wikidata.org/entity/Q2013404_official,http://www.wikidata.org/entity/Q7956107_official,http://www.wikidata.org/entity/Q5206553_official,http://www.wikidata.org/entity/Q7766549_official,http://www.wikidata.org/entity/Q5089247_official,http://www.wikidata.org/entity/Q6714268_official,http://www.wikidata.org/entity/Q3192653_official,http://www.wikidata.org/entity/Q689327_official,http://www.wikidata.org/entity/Q2550716_official,http://www.wikidata.org/entity/Q1914214_official,...,http://www.wikidata.org/entity/Q5043331_official,http://www.wikidata.org/entity/Q5882605_official,http://www.wikidata.org/entity/Q5614797_official,http://www.wikidata.org/entity/Q6627148_official,http://www.wikidata.org/entity/Q7989044_official,http://www.wikidata.org/entity/Q5579967_official,http://www.wikidata.org/entity/Q2740638_official,http://www.wikidata.org/entity/Q7311031_official,http://www.wikidata.org/entity/Q8037611_official,http://www.wikidata.org/entity/Q6843249_official
http://www.wikidata.org/entity/Q7243708_wiki,http://www.wikidata.org/entity/Q5766832_official,http://www.wikidata.org/entity/Q4584533_official,http://www.wikidata.org/entity/Q3536932_official,http://www.wikidata.org/entity/Q7844499_official,http://www.wikidata.org/entity/Q6996012_official,http://www.wikidata.org/entity/Q7956107_official,http://www.wikidata.org/entity/Q5206553_official,http://www.wikidata.org/entity/Q583268_official,http://www.wikidata.org/entity/Q5463207_official,http://www.wikidata.org/entity/Q7766549_official,...,http://www.wikidata.org/entity/Q7061450_official,http://www.wikidata.org/entity/Q6546636_official,http://www.wikidata.org/entity/Q7102891_official,http://www.wikidata.org/entity/Q3432521_official,http://www.wikidata.org/entity/Q803873_official,http://www.wikidata.org/entity/Q17096512_official,http://www.wikidata.org/entity/Q5514600_official,http://www.wikidata.org/entity/Q7839926_official,http://www.wikidata.org/entity/Q7243708_official,http://www.wikidata.org/entity/Q7320811_official
http://www.wikidata.org/entity/Q4047803_wiki,http://www.wikidata.org/entity/Q6979332_official,http://www.wikidata.org/entity/Q3536932_official,http://www.wikidata.org/entity/Q294205_official,http://www.wikidata.org/entity/Q2013404_official,http://www.wikidata.org/entity/Q7956107_official,http://www.wikidata.org/entity/Q5206553_official,http://www.wikidata.org/entity/Q7766549_official,http://www.wikidata.org/entity/Q5463207_official,http://www.wikidata.org/entity/Q6714268_official,http://www.wikidata.org/entity/Q2550716_official,...,http://www.wikidata.org/entity/Q5306251_official,http://www.wikidata.org/entity/Q3002349_official,http://www.wikidata.org/entity/Q5519771_official,http://www.wikidata.org/entity/Q7950497_official,http://www.wikidata.org/entity/Q3875063_official,http://www.wikidata.org/entity/Q122741_official,http://www.wikidata.org/entity/Q638864_official,http://www.wikidata.org/entity/Q19865721_official,http://www.wikidata.org/entity/Q6457560_official,http://www.wikidata.org/entity/Q4047803_official
http://www.wikidata.org/entity/Q834077_wiki,http://www.wikidata.org/entity/Q626459_official,http://www.wikidata.org/entity/Q7766549_official,http://www.wikidata.org/entity/Q5089247_official,http://www.wikidata.org/entity/Q6714268_official,http://www.wikidata.org/entity/Q6094914_official,http://www.wikidata.org/entity/Q3192653_official,http://www.wikidata.org/entity/Q2550716_official,http://www.wikidata.org/entity/Q1914214_official,http://www.wikidata.org/entity/Q16974104_official,http://www.wikidata.org/entity/Q7548230_official,...,http://www.wikidata.org/entity/Q76615_official,http://www.wikidata.org/entity/Q7989470_official,http://www.wikidata.org/entity/Q5082088_official,http://www.wikidata.org/entity/Q6035376_official,http://www.wikidata.org/entity/Q7061450_official,http://www.wikidata.org/entity/Q185608_official,http://www.wikidata.org/entity/Q7849303_official,http://www.wikidata.org/entity/Q7311031_official,http://www.wikidata.org/entity/Q1192667_official,http://www.wikidata.org/entity/Q4040329_official
http://www.wikidata.org/entity/Q4836856_wiki,http://www.wikidata.org/entity/Q626459_official,http://www.wikidata.org/entity/Q6996012_official,http://www.wikidata.org/entity/Q1327775_official,http://www.wikidata.org/entity/Q5206553_official,http://www.wikidata.org/entity/Q7956107_official,http://www.wikidata.org/entity/Q7766549_official,http://www.wikidata.org/entity/Q5010813_official,http://www.wikidata.org/entity/Q5089247_official,http://www.wikidata.org/entity/Q7543889_official,http://www.wikidata.org/entity/Q2550716_official,...,http://www.wikidata.org/entity/Q3108958_official,http://www.wikidata.org/entity/Q674138_official,http://www.wikidata.org/entity/Q375261_official,http://www.wikidata.org/entity/Q7798864_official,http://www.wikidata.org/entity/Q18224_official,http://www.wikidata.org/entity/Q1775875_official,http://www.wikidata.org/entity/Q1273336_official,http://www.wikidata.org/entity/Q7834817_official,http://www.wikidata.org/entity/Q4863947_official,http://www.wikidata.org/entity/Q4836856_official
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
http://www.wikidata.org/entity/Q5059736_wiki,http://www.wikidata.org/entity/Q3536932_official,http://www.wikidata.org/entity/Q294205_official,http://www.wikidata.org/entity/Q1327775_official,http://www.wikidata.org/entity/Q2013404_official,http://www.wikidata.org/entity/Q5206553_official,http://www.wikidata.org/entity/Q7956107_official,http://www.wikidata.org/entity/Q5089247_official,http://www.wikidata.org/entity/Q2550716_official,http://www.wikidata.org/entity/Q3192653_official,http://www.wikidata.org/entity/Q7543889_official,...,http://www.wikidata.org/entity/Q1786345_official,http://www.wikidata.org/entity/Q4674790_official,http://www.wikidata.org/entity/Q17055715_official,http://www.wikidata.org/entity/Q4651287_official,http://www.wikidata.org/entity/Q7598420_official,http://www.wikidata.org/entity/Q7511771_official,http://www.wikidata.org/entity/Q7191297_official,http://www.wikidata.org/entity/Q5264294_official,http://www.wikidata.org/entity/Q16002572_official,http://www.wikidata.org/entity/Q5059736_official
http://www.wikidata.org/entity/Q5973259_wiki,http://www.wikidata.org/entity/Q5766832_official,http://www.wikidata.org/entity/Q294205_official,http://www.wikidata.org/entity/Q2013404_official,http://www.wikidata.org/entity/Q7956107_official,http://www.wikidata.org/entity/Q6714268_official,http://www.wikidata.org/entity/Q7543889_official,http://www.wikidata.org/entity/Q2550716_official,http://www.wikidata.org/entity/Q3192653_official,http://www.wikidata.org/entity/Q16974104_official,http://www.wikidata.org/entity/Q5323694_official,...,http://www.wikidata.org/entity/Q7115148_official,http://www.wikidata.org/entity/Q7598420_official,http://www.wikidata.org/entity/Q3819976_official,http://www.wikidata.org/entity/Q7934971_official,http://www.wikidata.org/entity/Q5616895_official,http://www.wikidata.org/entity/Q5008097_official,http://www.wikidata.org/entity/Q4896433_official,http://www.wikidata.org/entity/Q7061450_official,http://www.wikidata.org/entity/Q7511771_official,http://www.wikidata.org/entity/Q20311499_official
http://www.wikidata.org/entity/Q20644661_wiki,http://www.wikidata.org/entity/Q7228679_official,http://www.wikidata.org/entity/Q7844499_official,http://www.wikidata.org/entity/Q17032198_official,http://www.wikidata.org/entity/Q5089247_official,http://www.wikidata.org/entity/Q6094914_official,http://www.wikidata.org/entity/Q3192653_official,http://www.wikidata.org/entity/Q2550716_official,http://www.wikidata.org/entity/Q7543889_official,http://www.wikidata.org/entity/Q16974104_official,http://www.wikidata.org/entity/Q1602881_official,...,http://www.wikidata.org/entity/Q6048317_official,http://www.wikidata.org/entity/Q4816811_official,http://www.wikidata.org/entity/Q128356_official,http://www.wikidata.org/entity/Q4904717_official,http://www.wikidata.org/entity/Q19877982_official,http://www.wikidata.org/entity/Q7957512_official,http://www.wikidata.org/entity/Q1024012_official,http://www.wikidata.org/entity/Q7572201_official,http://www.wikidata.org/entity/Q7533666_official,http://www.wikidata.org/entity/Q20644661_official
http://www.wikidata.org/entity/Q7394750_wiki,http://www.wikidata.org/entity/Q4584533_official,http://www.wikidata.org/entity/Q5766832_official,http://www.wikidata.org/entity/Q3536932_official,http://www.wikidata.org/entity/Q3716842_official,http://www.wikidata.org/entity/Q7886359_official,http://www.wikidata.org/entity/Q7844499_official,http://www.wikidata.org/entity/Q1327775_official,http://www.wikidata.org/entity/Q583268_official,http://www.wikidata.org/entity/Q1466184_official,http://www.wikidata.org/entity/Q5089247_official,...,http://www.wikidata.org/entity/Q7012543_official,http://www.wikidata.org/entity/Q991188_official,http://www.wikidata.org/entity/Q6732036_official,http://www.wikidata.org/entity/Q4217792_official,http://www.wikidata.org/entity/Q6840399_official,http://www.wikidata.org/entity/Q5196733_official,http://www.wikidata.org/entity/Q7623717_official,http://www.wikidata.org/entity/Q671356_official,http://www.wikidata.org/entity/Q4688976_official,http://www.wikidata.org/entity/Q7246604_official


In [32]:
# Largest value stored in the .npy argsort array of the first DeepMatcher dataset (dm_data[0]).
np.load(path_base + f'data/Okapi25Queries/{dm_data[0]}_argsort.npy').max()

1091

## MS MARCO: MARCO-uncased-masked-ALL-BM25 (running BM25 over the already BM25-ed top 1k)

In [105]:
# Dataset tag; the next cell interpolates it into bm25_argsort_path and model_name.
data = 'MARCO'

In [106]:
# Build the MS MARCO config with BM25 argsort indices, named "<data>-uncased-masked-ALL-BM25",
# write it as JSON, print the launch command, and read the file back.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/MSMARCO/tableA_processed.pkl',
    datapath_r=path_base + f'data/MSMARCO/tableB_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_1k_argsort_indices.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from datapath_l.
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/MARCO-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/MARCO-uncased-masked-ALL-BM25.json



## MS MARCO (No BM25 scratch): MARCO-1K-uncased-masked-ALL

In [40]:
# Dataset tag; the following two config cells use it as the prefix of model_name.
data = 'MARCO-1K'

In [48]:
# Build the MS MARCO config that uses a 'supervision' file in place of a BM25 argsort path
# (5 epochs), write it as JSON, print the launch command, and read the file back.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/MSMARCO/tableA_processed.pkl',
    datapath_r=path_base + f'data/MSMARCO/tableB_processed.pkl',
    supervision=path_base + 'data/MSMARCO/top1000.pidqid.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from the supervision file.
    num_train=len(pd.read_pickle(config['supervision'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}"
config.update(train_epochs=5, batch_size=8)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/MARCO-1K-uncased-masked-ALL.json
python scripts/pretrain_MLM.py -c configs/MARCO-1K-uncased-masked-ALL.json



In [49]:
# One-epoch variant of the supervision-based MS MARCO config: model_name gains a
# "-1-epoch" suffix and train_epochs is 1. Writes the JSON, prints the launch
# command, and reads the file back.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/MSMARCO/tableA_processed.pkl',
    datapath_r=path_base + f'data/MSMARCO/tableB_processed.pkl',
    supervision=path_base + 'data/MSMARCO/top1000.pidqid.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from the supervision file.
    num_train=len(pd.read_pickle(config['supervision'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-1-epoch"
config.update(
    train_epochs=1,  # changed from the preceding cell, which uses 5
    batch_size=8,
)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/MARCO-1K-uncased-masked-ALL-1-epoch.json
python scripts/pretrain_MLM.py -c configs/MARCO-1K-uncased-masked-ALL-1-epoch.json



## SQuAD Sent

In [71]:
# Dataset tag; the next cell interpolates it into bm25_argsort_path and model_name.
data = 'SQuAD_sent'

In [72]:
# Build the SQuAD config that reads train_tableB_sent_processed.pkl, write it as
# JSON, print the launch command, and read the file back.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/SQuAD/train_tableA_processed.pkl',
    datapath_r=path_base + f'data/SQuAD/train_tableB_sent_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from datapath_l.
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/SQuAD_sent-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/SQuAD_sent-uncased-masked-ALL-BM25.json



## SQuAD Para

In [37]:
# Dataset tag; the next cell interpolates it into bm25_argsort_path and model_name.
data = 'SQuAD_para'

In [63]:
# Build the SQuAD config that reads train_tableB_para_processed.pkl, write it as
# JSON, print the launch command, and read the file back.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/SQuAD/train_tableA_processed.pkl',
    datapath_r=path_base + f'data/SQuAD/train_tableB_para_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from datapath_l.
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/SQuAD_para-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/SQuAD_para-uncased-masked-ALL-BM25.json



## IMDB_Wiki

In [6]:
# Dataset tag; the next cell interpolates it into the data paths, bm25_argsort_path and model_name.
data = 'imdb_wiki'

In [7]:
# Build the config for the dataset named by `data` (paths live under data/<data>/),
# write it as JSON, print the launch command, and read the file back.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/{data}/train_tableA_processed.pkl',
    datapath_r=path_base + f'data/{data}/train_tableB_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from datapath_l.
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/imdb_wiki-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/imdb_wiki-uncased-masked-ALL-BM25.json



## small IMDB_fuzzy

In [14]:
# Dataset tag; the next cell interpolates it into the data paths, bm25_argsort_path and model_name.
data = 'small_imdb_fuzzy'

In [19]:
# Build the config for the dataset named by `data` (paths live under data/<data>/),
# write it as JSON, print the launch command, and show the reloaded config as
# the cell's result.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/{data}/train_tableA_processed.pkl',
    datapath_r=path_base + f'data/{data}/train_tableB_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from datapath_l.
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)


/lfs/1/sahaana/enrichment/ember/pretraining/configs/small_imdb_fuzzy-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/small_imdb_fuzzy-uncased-masked-ALL-BM25.json


{'datapath_l': '/lfs/1/sahaana/enrichment/data/small_imdb_fuzzy/train_tableA_processed.pkl',
 'datapath_r': '/lfs/1/sahaana/enrichment/data/small_imdb_fuzzy/train_tableB_processed.pkl',
 'bm25_argsort_path': '/lfs/1/sahaana/enrichment/data/Okapi25Queries/small_imdb_fuzzy_argsort_indices.pkl',
 'model_type': 'distilbert',
 'tokenizer_casing': 'uncased',
 'from_scratch': False,
 'mlm_probability': 0.15,
 'mlm_num_seps': None,
 'mlm_masking': 'ALL',
 'num_train': 40000,
 'num_test': 0,
 'data_column': 'merged_all',
 'model_name': 'small_imdb_fuzzy-uncased-masked-ALL-BM25',
 'train_epochs': 20,
 'batch_size': 8}

## hard IMDB_fuzzy

In [24]:
# Dataset tag; the next cell interpolates it into the data paths, bm25_argsort_path and model_name.
data = 'hard_imdb_fuzzy'

In [25]:
# Build the config for the dataset named by `data` (paths live under data/<data>/),
# write it as JSON, print the launch command, and show the reloaded config as
# the cell's result.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/{data}/train_tableA_processed.pkl',
    datapath_r=path_base + f'data/{data}/train_tableB_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from datapath_l.
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)


/lfs/1/sahaana/enrichment/ember/pretraining/configs/hard_imdb_fuzzy-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/hard_imdb_fuzzy-uncased-masked-ALL-BM25.json


{'datapath_l': '/lfs/1/sahaana/enrichment/data/hard_imdb_fuzzy/train_tableA_processed.pkl',
 'datapath_r': '/lfs/1/sahaana/enrichment/data/hard_imdb_fuzzy/train_tableB_processed.pkl',
 'bm25_argsort_path': '/lfs/1/sahaana/enrichment/data/Okapi25Queries/hard_imdb_fuzzy_argsort_indices.pkl',
 'model_type': 'distilbert',
 'tokenizer_casing': 'uncased',
 'from_scratch': False,
 'mlm_probability': 0.15,
 'mlm_num_seps': None,
 'mlm_masking': 'ALL',
 'num_train': 40000,
 'num_test': 0,
 'data_column': 'merged_all',
 'model_name': 'hard_imdb_fuzzy-uncased-masked-ALL-BM25',
 'train_epochs': 20,
 'batch_size': 8}

## small IMDB_fuzzy

In [14]:
# Dataset tag; the next cell interpolates it into the data paths, bm25_argsort_path and model_name.
# NOTE(review): same assignment and execution count (In [14]) as the earlier small_imdb_fuzzy cell — likely a duplicated cell.
data = 'small_imdb_fuzzy'

In [19]:
# Build the config for the dataset named by `data` (paths live under data/<data>/),
# write it as JSON, print the launch command, and show the reloaded config as
# the cell's result.
# NOTE(review): this cell carries the same execution count (In [19]) and the same
# statements as the earlier small_imdb_fuzzy config cell — likely a duplicated cell.
config = defaultdict(dict)

config.update(
    # Input file paths.
    datapath_l=path_base + f'data/{data}/train_tableA_processed.pkl',
    datapath_r=path_base + f'data/{data}/train_tableB_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
    # Model selection.
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
    # Masked-language-model settings.
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',  # author's listed options: ALL, BEFORE, AFTER
)
config.update(
    # num_train is len() of the object unpickled from datapath_l.
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
)
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update(train_epochs=20, batch_size=8)

config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)
save_config(config_path)
load_config(config_path)


/lfs/1/sahaana/enrichment/ember/pretraining/configs/small_imdb_fuzzy-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/small_imdb_fuzzy-uncased-masked-ALL-BM25.json


{'datapath_l': '/lfs/1/sahaana/enrichment/data/small_imdb_fuzzy/train_tableA_processed.pkl',
 'datapath_r': '/lfs/1/sahaana/enrichment/data/small_imdb_fuzzy/train_tableB_processed.pkl',
 'bm25_argsort_path': '/lfs/1/sahaana/enrichment/data/Okapi25Queries/small_imdb_fuzzy_argsort_indices.pkl',
 'model_type': 'distilbert',
 'tokenizer_casing': 'uncased',
 'from_scratch': False,
 'mlm_probability': 0.15,
 'mlm_num_seps': None,
 'mlm_masking': 'ALL',
 'num_train': 40000,
 'num_test': 0,
 'data_column': 'merged_all',
 'model_name': 'small_imdb_fuzzy-uncased-masked-ALL-BM25',
 'train_epochs': 20,
 'batch_size': 8}

## very hard IMDB_fuzzy

In [39]:
# Dataset name; the config cell below interpolates it into the data paths,
# the BM25 argsort path, and the model name.
data = 'very_hard_imdb_fuzzy'

In [40]:
# Build the MLM pretraining config for the dataset named in `data`.
# Keys are added in a fixed order so the saved JSON layout stays stable.
config = defaultdict(dict)

# Input pickles for this dataset: two processed training tables and the
# BM25 argsort indices.
config.update(
    datapath_l=path_base + f'data/{data}/train_tableA_processed.pkl',
    datapath_r=path_base + f'data/{data}/train_tableB_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
)

# Model / tokenizer selection.
config.update(
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
)

# Masked-language-model settings. mlm_masking options: ALL, BEFORE, AFTER
config.update(
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',
)

# Training-run settings. num_train is the length of the object pickled at
# datapath_l; model_name is derived from the casing and masking set above.
config.update(
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
    model_name=f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25",
    train_epochs=20,
    batch_size=8,
)

# Show where the config is written and the command that consumes it.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)

# Persist the global `config` as JSON, then read it back; the reloaded dict
# is the cell's displayed result.
save_config(config_path)
load_config(config_path)


/lfs/1/sahaana/enrichment/ember/pretraining/configs/very_hard_imdb_fuzzy-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/very_hard_imdb_fuzzy-uncased-masked-ALL-BM25.json


{'datapath_l': '/lfs/1/sahaana/enrichment/data/very_hard_imdb_fuzzy/train_tableA_processed.pkl',
 'datapath_r': '/lfs/1/sahaana/enrichment/data/very_hard_imdb_fuzzy/train_tableB_processed.pkl',
 'bm25_argsort_path': '/lfs/1/sahaana/enrichment/data/Okapi25Queries/very_hard_imdb_fuzzy_argsort_indices.pkl',
 'model_type': 'distilbert',
 'tokenizer_casing': 'uncased',
 'from_scratch': False,
 'mlm_probability': 0.15,
 'mlm_num_seps': None,
 'mlm_masking': 'ALL',
 'num_train': 40000,
 'num_test': 0,
 'data_column': 'merged_all',
 'model_name': 'very_hard_imdb_fuzzy-uncased-masked-ALL-BM25',
 'train_epochs': 20,
 'batch_size': 8}

## IMDB_fuzzy

In [12]:
# Dataset name; the config cell below interpolates it into the data paths,
# the BM25 argsort path, and the model name.
data = 'imdb_fuzzy'

In [13]:
# Build the MLM pretraining config for the dataset named in `data`.
# Keys are added in a fixed order so the saved JSON layout stays stable.
config = defaultdict(dict)

# Input pickles for this dataset: two processed training tables and the
# BM25 argsort indices.
config.update(
    datapath_l=path_base + f'data/{data}/train_tableA_processed.pkl',
    datapath_r=path_base + f'data/{data}/train_tableB_processed.pkl',
    bm25_argsort_path=path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
)

# Model / tokenizer selection.
config.update(
    model_type='distilbert',
    tokenizer_casing='uncased',
    from_scratch=False,
)

# Masked-language-model settings. mlm_masking options: ALL, BEFORE, AFTER
config.update(
    mlm_probability=0.15,
    mlm_num_seps=None,
    mlm_masking='ALL',
)

# Training-run settings. num_train is the length of the object pickled at
# datapath_l; model_name is derived from the casing and masking set above.
config.update(
    num_train=len(pd.read_pickle(config['datapath_l'])),
    num_test=0,
    data_column='merged_all',
    model_name=f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25",
    train_epochs=20,
    batch_size=8,
)

# Show where the config is written and the command that consumes it.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(
    config_path,
    f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json",
    sep='\n',
)

# Persist the global `config` as JSON and read it back as a round-trip check.
# The trailing print() is the last statement, so the reloaded dict is not
# displayed; only a blank line is emitted.
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/imdb_fuzzy-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/imdb_fuzzy-uncased-masked-ALL-BM25.json

