In [1]:
# Set the width of elements with CSS class `container` to 75% for this notebook page.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [24]:
import json
from collections import defaultdict 

In [40]:
# Root directory that the data and config paths below are built from by plain
# string concatenation (`path_base + '...'`), so the trailing slash is required.
path_base = '/lfs/1/sahaana/enrichment/'

def save_config(config_path, config=None):
    """Write a config mapping to ``config_path`` as 4-space-indented JSON.

    Args:
        config_path: Destination file path; an existing file is overwritten.
        config: Object to serialize. When omitted, the notebook-level
            ``config`` variable is used, so existing ``save_config(path)``
            calls keep working unchanged.

    Raises:
        NameError: If ``config`` is omitted and no notebook-level ``config``
            variable exists.
        TypeError: If the object contains values ``json`` cannot serialize.
    """
    if config is None:
        # Backward-compatible fallback to the global the cells assign.
        try:
            config = globals()['config']
        except KeyError:
            raise NameError("name 'config' is not defined") from None
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization error must happen first to leave an existing config intact.
    payload = json.dumps(config, indent=4)
    with open(config_path, 'w') as fp:
        fp.write(payload)
        
def load_config(config_path):
    """Read the JSON file at ``config_path`` and return the decoded object."""
    with open(config_path) as handle:
        return json.load(handle)

# MLM Pretraining Configs

## Base

## DBFT-cased-masked-ALL-BM25

In [None]:
config = defaultdict(dict)

# Input locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/imdb/final_table_nogenre.pkl',
    'datapath_r': path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    'bm25_argsort_path': path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'cased',
    'from_scratch': False,
})

# Masking settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
    'mlm_masking': 'ALL',  # values noted by the author: ALL, BEFORE, AFTER
})

# Data settings.
config.update({
    'num_train': 40000,
    'num_test': 7823,
    'data_column': 'merged_nogenre',
})

# The name is derived from the casing and masking values set above, and it
# also determines the config file name below.
# 'model_outdir' is not written to this config; its old assignment was disabled.
config['model_name'] = f"DBFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(config_path)
# Print the command line that refers to this config file.
print(f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json")
save_config(config_path)
# Last expression of the cell: read the saved file back so it is shown as output.
load_config(config_path)

## DBFT-uncased-masked-ALL-BM25

In [None]:
config = defaultdict(dict)

# Input locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/imdb/final_table_nogenre.pkl',
    'datapath_r': path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    'bm25_argsort_path': path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masking settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
    'mlm_masking': 'ALL',  # values noted by the author: ALL, BEFORE, AFTER
})

# Data settings.
config.update({
    'num_train': 40000,
    'num_test': 7823,
    'data_column': 'merged_nogenre',
})

# The name is derived from the casing and masking values set above, and it
# also determines the config file name below.
config['model_name'] = f"DBFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(config_path)
# Print the command line that refers to this config file.
print(f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json")
save_config(config_path)
# Last expression of the cell: read the saved file back so it is shown as output.
load_config(config_path)

## BFT-cased-masked-ALL-BM25

In [None]:
config = defaultdict(dict)

# Input locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/imdb/final_table_nogenre.pkl',
    'datapath_r': path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    'bm25_argsort_path': path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'bert',
    'tokenizer_casing': 'cased',
    'from_scratch': False,
})

# Masking settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
    'mlm_masking': 'ALL',  # values noted by the author: ALL, BEFORE, AFTER
})

# Data settings.
config.update({
    'num_train': 40000,
    'num_test': 7823,
    'data_column': 'merged_nogenre',
})

# The name is derived from the casing and masking values set above, and it
# also determines the config file name below.
# 'model_outdir' is not written to this config; its old assignment was disabled.
config['model_name'] = f"BFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(config_path)
# Print the command line that refers to this config file.
print(f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json")
save_config(config_path)
# Last expression of the cell: read the saved file back so it is shown as output.
load_config(config_path)

## BFT-uncased-masked-ALL-BM25

In [None]:
config = defaultdict(dict)

# Input locations, all rooted at path_base.
config.update({
    'datapath_l': path_base + 'data/imdb/final_table_nogenre.pkl',
    'datapath_r': path_base + 'data/wikidata/wikidata-processor/data/queries/movies/final_table_nogenre.pkl',
    'bm25_argsort_path': path_base + 'data/Okapi25Queries/imdb_bm25Okapi_summaries_nogenre_argsort.npy',
})

# Model and tokenizer selection.
config.update({
    'model_type': 'bert',
    'tokenizer_casing': 'uncased',
    'from_scratch': False,
})

# Masking settings.
config.update({
    'mlm_probability': 0.15,
    'mlm_num_seps': None,  # author's note: imdb_l, summaries_r = 9
    'mlm_masking': 'ALL',  # values noted by the author: ALL, BEFORE, AFTER
})

# Data settings.
config.update({
    'num_train': 40000,
    'num_test': 7823,
    'data_column': 'merged_nogenre',
})

# The name is derived from the casing and masking values set above, and it
# also determines the config file name below.
config['model_name'] = f"BFT-{config['tokenizer_casing']}-masked-{config['mlm_masking']}-BM25"
config.update({'train_epochs': 20, 'batch_size': 8})

config_path = path_base + f"context-enrichment/pretraining/configs/{config['model_name']}.json"
print(config_path)
# Print the command line that refers to this config file.
print(f"python scripts/pretrain_MLM.py -c configs/{config['model_name']}.json")
save_config(config_path)
# Last expression of the cell: read the saved file back so it is shown as output.
load_config(config_path)