In [3]:
# Widen the notebook's `.container` element to 75% width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [4]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict 

In [5]:
# Root directory for every data and config path in this notebook. Paths are built
# by plain string concatenation (path_base + '...'), so the trailing slash is required.
path_base = '/lfs/1/sahaana/enrichment/'

def save_config(config_path, config=None):
    """Write a config mapping to ``config_path`` as JSON with 4-space indentation.

    Args:
        config_path: Destination file path; an existing file is overwritten.
        config: Mapping to serialize. When omitted (``None``), the module-level
            variable named ``config`` is used, which keeps the original
            one-argument call ``save_config(config_path)`` working.

    Raises:
        NameError: If ``config`` is omitted and no module-level ``config`` exists.
    """
    if config is None:
        # Resolve the fallback before opening the file, so a missing global
        # does not leave a truncated, empty file behind.
        try:
            config = globals()['config']
        except KeyError:
            raise NameError("name 'config' is not defined") from None
    with open(config_path, 'w') as fp:
        json.dump(config, fp, indent=4)
        
def load_config(config_path):
    """Parse the JSON file at ``config_path`` and return the resulting object."""
    with open(config_path) as cfg_file:
        return json.load(cfg_file)

# MLM Pretraining Configs

## Base

## DBFT-cased-masked-ALL-BM25

In [None]:
# Config for the DBFT-cased-masked-ALL-BM25 run; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/imdb/final_table_nogenre.pkl',
    'datapath_r': path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    'bm25_argsort_path': path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'cased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# Train/test counts and the data column.
config.update({
    'num_train': 40000,
    'num_test': 7823,
    'data_column': 'merged_nogenre',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"DBFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
# NOTE: 'model_outdir' is not set; its assignment ('../models/DBFT-cased-masked-ALL-BM25') was commented out.
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# then read it back so the cell displays what was written.
config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)

## DBFT-uncased-masked-ALL-BM25

In [None]:
# Config for the DBFT-uncased-masked-ALL-BM25 run; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/imdb/final_table_nogenre.pkl',
    'datapath_r': path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    'bm25_argsort_path': path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# Train/test counts and the data column.
config.update({
    'num_train': 40000,
    'num_test': 7823,
    'data_column': 'merged_nogenre',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"DBFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# then read it back so the cell displays what was written.
config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)

## BFT-cased-masked-ALL-BM25

In [None]:
# Config for the BFT-cased-masked-ALL-BM25 run; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/imdb/final_table_nogenre.pkl',
    'datapath_r': path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    'bm25_argsort_path': path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'bert',
    'tokenizer_casing': 'cased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# Train/test counts and the data column.
config.update({
    'num_train': 40000,
    'num_test': 7823,
    'data_column': 'merged_nogenre',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"BFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
# NOTE: 'model_outdir' is not set; its commented-out assignment pointed at
# '../models/DBFT-cased-masked-ALL-BM25' (a DBFT path, not this BFT run).
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# then read it back so the cell displays what was written.
config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)

## BFT-uncased-masked-ALL-BM25

In [None]:
# Config for the BFT-uncased-masked-ALL-BM25 run; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/imdb/final_table_nogenre.pkl',
    'datapath_r': path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    'bm25_argsort_path': path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'bert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# Train/test counts and the data column.
config.update({
    'num_train': 40000,
    'num_test': 7823,
    'data_column': 'merged_nogenre',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"BFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# then read it back so the cell displays what was written.
config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)

## DeepMatcher: DATA-uncased-masked-ALL-BM25

In [6]:
# DeepMatcher dataset directory names, keyed by their position (0-12) in this list.
dm_data = dict(enumerate([
    "abt_buy_exp_data",
    "amazon_google_exp_data",
    "beer_exp_data",
    "company_exp_data",
    "dblp_acm_exp_data",
    "dblp_scholar_exp_data",
    "dirty_dblp_acm_exp_data",
    "dirty_dblp_scholar_exp_data",
    "dirty_itunes_amazon_exp_data",
    "dirty_walmart_amazon_exp_data",
    "fodors_zagat_exp_data",
    "itunes_amazon_exp_data",
    "walmart_amazon_exp_data",
]))

In [9]:
# Write one config JSON per DeepMatcher dataset listed in dm_data.
for i, dm_name in dm_data.items():
    config = defaultdict(dict)

    # Per-dataset file locations, all rooted at path_base.
    config.update({
        'datapath_l': path_base + f'data/deepmatcher/{dm_name}/tableA_processed.pkl',
        'datapath_r': path_base + f'data/deepmatcher/{dm_name}/tableB_processed.pkl',
        'bm25_argsort_path': path_base + f'data/Okapi25Queries/{dm_name}_argsort_indices.pkl',
    })

    # Model and tokenizer selection.
    config.update({
        'model_type': 'distilbert',
        'tokenizer_casing': 'uncased',
        'from_scratch': False,
    })

    # Masked-language-modelling settings.
    config.update({
        'mlm_probability': 0.15,
        'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
        'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
    })

    # num_train is the length of the object unpickled from datapath_l.
    config.update({
        'num_train': len(pd.read_pickle(config['datapath_l'])),
        'num_test': 0,
        'data_column': 'merged_all',
    })
    # model_name is derived from keys set above, so it is assigned after them.
    config['model_name'] = f"{dm_name}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
    config.update({'train_epochs': 20, 'batch_size': 8})

    # Print the target path and the matching training command, write the JSON,
    # read it back (result discarded), then print a blank separator line.
    config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
    print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
    save_config(config_path)
    load_config(config_path)
    print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/abt_buy_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/abt_buy_exp_data-uncased-masked-ALL-BM25.json

/lfs/1/sahaana/enrichment/ember/pretraining/configs/amazon_google_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/amazon_google_exp_data-uncased-masked-ALL-BM25.json

/lfs/1/sahaana/enrichment/ember/pretraining/configs/beer_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/beer_exp_data-uncased-masked-ALL-BM25.json

/lfs/1/sahaana/enrichment/ember/pretraining/configs/company_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/company_exp_data-uncased-masked-ALL-BM25.json

/lfs/1/sahaana/enrichment/ember/pretraining/configs/dblp_acm_exp_data-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/dblp_acm_exp_data-uncased-masked-ALL-BM25.json

/lfs/1/sahaana/enrichment/ember/pretraining/configs/dblp_

In [36]:
# Inspect table B of the first DeepMatcher dataset. The path is built from
# path_base, like every other path in this notebook, instead of repeating the
# absolute prefix; with the path_base defined above it resolves to the same file.
pd.read_pickle(path_base + f'data/deepmatcher/{dm_data[0]}/tableB_processed.pkl')

Unnamed: 0,id,name,description,price,merged_all
0,0,linksys etherfast ezxs88w ethernet switch ezxs88w,linksys etherfast 8-port 10/100 switch ( new/w...,,[SEP] name linksys etherfast ezxs88w etherne...
1,1,linksys etherfast ezxs55w ethernet switch,5 x 10/100base-tx lan,,[SEP] name linksys etherfast ezxs55w etherne...
2,2,netgear prosafe fs105 ethernet switch fs105na,netgear fs105 prosafe 5 port 10/100 desktop sw...,,[SEP] name netgear prosafe fs105 ethernet sw...
3,3,belkin pro series high integrity vga/svga moni...,1 x hd-15 1 x hd-15 10ft beige,,[SEP] name belkin pro series high integrity ...
4,4,netgear prosafe jfs516 ethernet switch,netgear prosafe 16 port 10/100 rackmount switc...,,[SEP] name netgear prosafe jfs516 ethernet s...
...,...,...,...,...,...
1087,1087,sony vaio fw378j/b notebook vgnfw378j/b,intel centrino 2 core 2 duo p8600 2.4 ghz 16.4...,,[SEP] name sony vaio fw378j/b notebook vgnfw...
1088,1088,sennheiser cx380 sennheiser cx 380 sport ii gr...,,,[SEP] name sennheiser cx380 sennheiser cx 38...
1089,1089,iwork 09 retail-int dvd mb942z/a,,,[SEP] name iwork 09 retail-int dvd mb942z/a ...
1090,1090,iwork 09 family pack-int dvd mb943z/a,,,[SEP] name iwork 09 family pack-int dvd mb94...


In [32]:
# Largest value stored in the first DeepMatcher dataset's argsort array.
np.load(path_base + f'data/Okapi25Queries/{dm_data[0]}_argsort.npy').max()

1091

## MS MARCO: MARCO-uncased-masked-ALL-BM25 (running BM25 over the top-1k candidates that were already retrieved with BM25)

In [105]:
# Dataset tag; the next cell interpolates it into the BM25 argsort filename and the model name.
data = 'MARCO'

In [106]:
# Config for the MS MARCO run named from `data`; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/MSMARCO/tableA_processed.pkl',
    'datapath_r': path_base + 'data/MSMARCO/tableB_processed.pkl',
    'bm25_argsort_path': path_base + f'data/Okapi25Queries/{data}_1k_argsort_indices.pkl',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# num_train is the length of the object unpickled from datapath_l.
config.update({
    'num_train': len(pd.read_pickle(config['datapath_l'])),
    'num_test': 0,
    'data_column': 'merged_all',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# read it back (result discarded), then print a blank line.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/MARCO-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/MARCO-uncased-masked-ALL-BM25.json



## MS MARCO (No BM25 scratch): MARCO-1K-uncased-masked-ALL

In [40]:
# Dataset tag; the two config cells below interpolate it into the model name (and so into the config filename).
data = 'MARCO-1K'

In [48]:
# Config for the MS MARCO run that uses a 'supervision' file instead of a BM25 argsort path;
# saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/MSMARCO/tableA_processed.pkl',
    'datapath_r': path_base + 'data/MSMARCO/tableB_processed.pkl',
    'supervision': path_base + 'data/MSMARCO/top1000.pidqid.pkl',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# num_train is the length of the object unpickled from the supervision file.
config.update({
    'num_train': len(pd.read_pickle(config['supervision'])),
    'num_test': 0,
    'data_column': 'merged_all',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}"
config.update({'train_epochs': 5, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# read it back (result discarded), then print a blank line.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/MARCO-1K-uncased-masked-ALL.json
python scripts/pretrain_MLM.py -c configs/MARCO-1K-uncased-masked-ALL.json



In [49]:
# One-epoch variant of the MS MARCO 'supervision' config: the model name gains a
# '-1-epoch' suffix and train_epochs is 1. Saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/MSMARCO/tableA_processed.pkl',
    'datapath_r': path_base + 'data/MSMARCO/tableB_processed.pkl',
    'supervision': path_base + 'data/MSMARCO/top1000.pidqid.pkl',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# num_train is the length of the object unpickled from the supervision file.
config.update({
    'num_train': len(pd.read_pickle(config['supervision'])),
    'num_test': 0,
    'data_column': 'merged_all',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-1-epoch"
config.update({
    'train_epochs': 1,  # changed from the previous MARCO-1K config
    'batch_size': 8,
})

# Print the target path and the matching training command, write the JSON,
# read it back (result discarded), then print a blank line.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/MARCO-1K-uncased-masked-ALL-1-epoch.json
python scripts/pretrain_MLM.py -c configs/MARCO-1K-uncased-masked-ALL-1-epoch.json



## SQuAD Sent

In [71]:
# Dataset tag; the next cell interpolates it into the BM25 argsort filename and the model name.
data = 'SQuAD_sent'

In [72]:
# Config for the SQuAD run over the '_sent' table B; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/SQuAD/train_tableA_processed.pkl',
    'datapath_r': path_base + 'data/SQuAD/train_tableB_sent_processed.pkl',
    'bm25_argsort_path': path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# num_train is the length of the object unpickled from datapath_l.
config.update({
    'num_train': len(pd.read_pickle(config['datapath_l'])),
    'num_test': 0,
    'data_column': 'merged_all',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# read it back (result discarded), then print a blank line.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/SQuAD_sent-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/SQuAD_sent-uncased-masked-ALL-BM25.json



## SQuAD Para

In [37]:
# Dataset tag; the next cell interpolates it into the BM25 argsort filename and the model name.
data = 'SQuAD_para'

In [63]:
# Config for the SQuAD run over the '_para' table B; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/SQuAD/train_tableA_processed.pkl',
    'datapath_r': path_base + 'data/SQuAD/train_tableB_para_processed.pkl',
    'bm25_argsort_path': path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# num_train is the length of the object unpickled from datapath_l.
config.update({
    'num_train': len(pd.read_pickle(config['datapath_l'])),
    'num_test': 0,
    'data_column': 'merged_all',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# read it back (result discarded), then print a blank line.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/SQuAD_para-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/SQuAD_para-uncased-masked-ALL-BM25.json



## IMDB_Wiki

In [6]:
# Dataset tag; the next cell interpolates it into the data directory, the BM25 argsort filename and the model name.
data = 'imdb_wiki'

In [7]:
# Config for the dataset named by `data`; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base and keyed on `data`.
config.update({
    'datapath_l': path_base + f'data/{data}/train_tableA_processed.pkl',
    'datapath_r': path_base + f'data/{data}/train_tableB_processed.pkl',
    'bm25_argsort_path': path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# num_train is the length of the object unpickled from datapath_l.
config.update({
    'num_train': len(pd.read_pickle(config['datapath_l'])),
    'num_test': 0,
    'data_column': 'merged_all',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# read it back (result discarded), then print a blank line.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/imdb_wiki-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/imdb_wiki-uncased-masked-ALL-BM25.json



## small IMDB_fuzzy

In [10]:
# Dataset tag; the next cell interpolates it into the data directory, the BM25 argsort filename and the model name.
data = 'small_imdb_fuzzy'

In [11]:
# Config for the dataset named by `data`; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base and keyed on `data`.
config.update({
    'datapath_l': path_base + f'data/{data}/train_tableA_processed.pkl',
    'datapath_r': path_base + f'data/{data}/train_tableB_processed.pkl',
    'bm25_argsort_path': path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# num_train is the length of the object unpickled from datapath_l.
config.update({
    'num_train': len(pd.read_pickle(config['datapath_l'])),
    'num_test': 0,
    'data_column': 'merged_all',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# read it back (result discarded), then print a blank line.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/small_imdb_fuzzy-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/small_imdb_fuzzy-uncased-masked-ALL-BM25.json



## IMDB_fuzzy

In [12]:
# Dataset tag; the next cell interpolates it into the data directory, the BM25 argsort filename and the model name.
data = 'imdb_fuzzy'

In [13]:
# Config for the dataset named by `data`; saved as JSON at the end of the cell.
config = defaultdict(dict)

# Data file locations, all rooted at path_base and keyed on `data`.
config.update({
    'datapath_l': path_base + f'data/{data}/train_tableA_processed.pkl',
    'datapath_r': path_base + f'data/{data}/train_tableB_processed.pkl',
    'bm25_argsort_path': path_base + f'data/Okapi25Queries/{data}_argsort_indices.pkl',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masked-language-modelling settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    'mlm_masking': 'ALL',  # author's note lists ALL, BEFORE, AFTER
})

# num_train is the length of the object unpickled from datapath_l.
config.update({
    'num_train': len(pd.read_pickle(config['datapath_l'])),
    'num_test': 0,
    'data_column': 'merged_all',
})
# model_name is derived from keys set above, so it is assigned after them.
config['model_name'] = f"{data}-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

# Print the target path and the matching training command, write the JSON,
# read it back (result discarded), then print a blank line.
config_path = path_base + f"ember/pretraining/configs/{config['model_name']}.json"
print(config_path, f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json", sep="\n")
save_config(config_path)
load_config(config_path)
print()

/lfs/1/sahaana/enrichment/ember/pretraining/configs/imdb_fuzzy-uncased-masked-ALL-BM25.json
python scripts/pretrain_MLM.py -c configs/imdb_fuzzy-uncased-masked-ALL-BM25.json

