In [8]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [9]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict 

In [10]:
path_base = '/lfs/1/sahaana/enrichment/'

def save_config(config_path):
    with open(config_path, 'w') as fp:
        json.dump(config, fp, indent=4)
        
def load_config(config_path):
    with open(config_path) as fp:
        config = json.load(fp)
    return config

# EMBER all

## Base

## MS MARCO (running with BM25 over the already 25-ed 1k)

In [11]:
data = 'MSMARCO'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['train_supervision'] = path_base + f'data/{data}/qidpidtriples.train.full.2.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['test_supervision'] = path_base + f'data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/MARCO-uncased-masked-ALL-BM25' # change manuaallyyy~
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/100)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

/lfs/1/sahaana/enrichment/ember/embedding/configs/MSMARCO-uncased-masked-ALL-BM25-single-triplet-3977686-1.json
python scripts/train_embedding.py -c configs/MSMARCO-uncased-masked-ALL-BM25-single-triplet-3977686-1.json


{'data': 'MSMARCO',
 'datapath_l': '/lfs/1/sahaana/enrichment/data/MSMARCO/tableA_processed.pkl',
 'datapath_r': '/lfs/1/sahaana/enrichment/data/MSMARCO/tableB_processed.pkl',
 'train_supervision': '/lfs/1/sahaana/enrichment/data/MSMARCO/qidpidtriples.train.full.2.pkl',
 'eval_datapath_l': '/lfs/1/sahaana/enrichment/data/MSMARCO/dev_tableA_processed.pkl',
 'eval_datapath_r': '/lfs/1/sahaana/enrichment/data/MSMARCO/tableB_processed.pkl',
 'test_supervision': '/lfs/1/sahaana/enrichment/data/MSMARCO/supervision_test.pkl',
 'arch': 'single-triplet',
 'bert_path': '/lfs/1/sahaana/enrichment//ember/pretraining/models/MARCO-uncased-masked-ALL-BM25',
 'column': 'merged_all',
 'tokenizer': 'distilbert-base-uncased',
 'train_size': 3977686,
 'epochs': 1,
 'batch_size': 8,
 'final_size': 200,
 'lr': 1e-05,
 'loss': 'triplet',
 'tl_margin': 1.0,
 'tl_p': 2,
 'pool_type': 'CLS',
 'tokenizer_max_length': 512,
 'knn_k': 300,
 'model_name': 'MSMARCO-uncased-masked-ALL-BM25-single-triplet-3977686-1'}

## SQuAD Sent

In [5]:
data = 'SQuAD_sent'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/SQuAD/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/SQuAD/train_tableB_sent_processed.pkl'
config['train_supervision'] = path_base + f'data/SQuAD/train_sent_triplets.pkl'

config['eval_datapath_l'] = path_base + f'data/SQuAD/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/SQuAD/dev_tableB_sent_processed.pkl'
config['test_supervision'] = path_base + f'data/SQuAD/dev_sent_labels.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

/lfs/1/sahaana/enrichment/ember/embedding/configs/SQuAD_sent-uncased-masked-ALL-BM25-single-triplet-260004-1.json
python scripts/train_embedding.py -c configs/SQuAD_sent-uncased-masked-ALL-BM25-single-triplet-260004-1.json


{'data': 'SQuAD_sent',
 'datapath_l': '/lfs/1/sahaana/enrichment/data/SQuAD/train_tableA_processed.pkl',
 'datapath_r': '/lfs/1/sahaana/enrichment/data/SQuAD/train_tableB_sent_processed.pkl',
 'train_supervision': '/lfs/1/sahaana/enrichment/data/SQuAD/train_sent_triplets.pkl',
 'eval_datapath_l': '/lfs/1/sahaana/enrichment/data/SQuAD/dev_tableA_processed.pkl',
 'eval_datapath_r': '/lfs/1/sahaana/enrichment/data/SQuAD/dev_tableB_sent_processed.pkl',
 'test_supervision': '/lfs/1/sahaana/enrichment/data/SQuAD/dev_sent_labels.pkl',
 'arch': 'single-triplet',
 'bert_path': '/lfs/1/sahaana/enrichment//ember/pretraining/models/SQuAD_sent-uncased-masked-ALL-BM25',
 'column': 'merged_all',
 'tokenizer': 'distilbert-base-uncased',
 'train_size': 260004,
 'epochs': 1,
 'batch_size': 8,
 'final_size': 200,
 'lr': 1e-05,
 'loss': 'triplet',
 'tl_margin': 1.0,
 'tl_p': 2,
 'pool_type': 'CLS',
 'tokenizer_max_length': 512,
 'knn_k': 300,
 'model_name': 'SQuAD_sent-uncased-masked-ALL-BM25-single-tripl

## IMDB_Wiki

In [6]:
data = 'imdb_wiki'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/imdb_wiki/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/imdb_wiki/dev_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

/lfs/1/sahaana/enrichment/ember/embedding/configs/imdb_wiki-uncased-masked-ALL-BM25-single-triplet-38250-1.json
python scripts/train_embedding.py -c configs/imdb_wiki-uncased-masked-ALL-BM25-single-triplet-38250-1.json


{'data': 'imdb_wiki',
 'datapath_l': '/lfs/1/sahaana/enrichment/data/imdb_wiki/train_tableA_processed.pkl',
 'datapath_r': '/lfs/1/sahaana/enrichment/data/imdb_wiki/train_tableB_processed.pkl',
 'train_supervision': '/lfs/1/sahaana/enrichment//data/imdb_wiki/supervision_train.pkl',
 'eval_datapath_l': '/lfs/1/sahaana/enrichment/data/imdb_wiki/dev_tableA_processed.pkl',
 'eval_datapath_r': '/lfs/1/sahaana/enrichment/data/imdb_wiki/dev_tableB_processed.pkl',
 'test_supervision': '/lfs/1/sahaana/enrichment//data/imdb_wiki/supervision_test.pkl',
 'arch': 'single-triplet',
 'bert_path': '/lfs/1/sahaana/enrichment//ember/pretraining/models/imdb_wiki-uncased-masked-ALL-BM25',
 'column': 'merged_all',
 'tokenizer': 'distilbert-base-uncased',
 'train_size': 38250,
 'epochs': 1,
 'batch_size': 8,
 'final_size': 200,
 'lr': 1e-05,
 'loss': 'triplet',
 'tl_margin': 1.0,
 'tl_p': 2,
 'pool_type': 'CLS',
 'tokenizer_max_length': 512,
 'knn_k': 300,
 'model_name': 'imdb_wiki-uncased-masked-ALL-BM25-s

## Fuzzy Main

In [7]:
data = 'main_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

FileNotFoundError: [Errno 2] No such file or directory: '/lfs/1/sahaana/enrichment//data/main_fuzzy/supervision_train.pkl'

## Fuzzy Hard

In [None]:
data = 'hard_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Easy

In [None]:
data = 'easy_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## DM_Joined

In [36]:
dm_data  = {0:"joined_abt_buy_exp_data", 
            1:"joined_amazon_google_exp_data", 
            2:"joined_beer_exp_data", 
            3:"joined_company_exp_data", 
            4:"joined_dblp_acm_exp_data", 
            5:"joined_dblp_scholar_exp_data", 
            6:"joined_dirty_dblp_acm_exp_data", 
            7:"joined_dirty_dblp_scholar_exp_data", 
            8:"joined_dirty_itunes_amazon_exp_data", 
            9:"joined_dirty_walmart_amazon_exp_data", 
            10:"joined_fodors_zagat_exp_data", 
            11:"joined_itunes_amazon_exp_data", 
            12:"joined_walmart_amazon_exp_data"}
data = 'dm_blocked'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'single-triplet'
    config['bert_path']= path_base + f'/ember/pretraining/models/{dm_data[i]}-uncased-masked-ALL-BM25'
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    train_supervision = pd.read_pickle(config['train_supervision'])
    config['train_size'] = int(len(train_supervision)/1)

    config['epochs'] = 1
    config['batch_size'] = 8
    config['final_size'] = 200
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 30

    config['model_name'] = f"{dm_data[i]}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()

python scripts/train_embedding.py -c configs/joined_abt_buy_exp_data-uncased-masked-ALL-BM25-single-triplet-611-1.json ;
python scripts/train_embedding.py -c configs/joined_amazon_google_exp_data-uncased-masked-ALL-BM25-single-triplet-631-1.json ;
python scripts/train_embedding.py -c configs/joined_beer_exp_data-uncased-masked-ALL-BM25-single-triplet-40-1.json ;
python scripts/train_embedding.py -c configs/joined_company_exp_data-uncased-masked-ALL-BM25-single-triplet-16859-1.json ;
python scripts/train_embedding.py -c configs/joined_dblp_acm_exp_data-uncased-masked-ALL-BM25-single-triplet-1332-1.json ;
python scripts/train_embedding.py -c configs/joined_dblp_scholar_exp_data-uncased-masked-ALL-BM25-single-triplet-1860-1.json ;
python scripts/train_embedding.py -c configs/joined_dirty_dblp_acm_exp_data-uncased-masked-ALL-BM25-single-triplet-1332-1.json ;
python scripts/train_embedding.py -c configs/joined_dirty_dblp_scholar_exp_data-uncased-masked-ALL-BM25-single-triplet-1860-1.json ;


In [None]:
dm_data  = {0:"joined_abt_buy_exp_data", 
            1:"joined_amazon_google_exp_data", 
            2:"joined_beer_exp_data", 
            3:"joined_company_exp_data", 
            4:"joined_dblp_acm_exp_data", 
            5:"joined_dblp_scholar_exp_data", 
            6:"joined_dirty_dblp_acm_exp_data", 
            7:"joined_dirty_dblp_scholar_exp_data", 
            8:"joined_dirty_itunes_amazon_exp_data", 
            9:"joined_dirty_walmart_amazon_exp_data", 
            10:"joined_fodors_zagat_exp_data", 
            11:"joined_itunes_amazon_exp_data", 
            12:"joined_walmart_amazon_exp_data"}
data = 'dm_blocked'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'single-triplet'
    config['bert_path']= path_base + f'/ember/pretraining/models/{dm_data[i]}-uncased-masked-ALL-BM25'
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    train_supervision = pd.read_pickle(config['train_supervision'])
    config['train_size'] = int(len(train_supervision)*15)

    config['epochs'] = 1
    config['batch_size'] = 8
    config['final_size'] = 200
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 300

    config['model_name'] = f"{dm_data[i]}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()

# Pretrained BERT (no fine tuning)

## MS MARCO (running with BM25 over the already 25-ed 1k)

In [None]:
data = 'MSMARCO'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['train_supervision'] = path_base + f'data/{data}/qidpidtriples.train.full.2.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['test_supervision'] = path_base + f'data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## SQuAD Sent

In [None]:
data = 'SQuAD_sent'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/SQuAD/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/SQuAD/train_tableB_sent_processed.pkl'
config['train_supervision'] = path_base + f'data/SQuAD/train_sent_triplets.pkl'

config['eval_datapath_l'] = path_base + f'data/SQuAD/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/SQuAD/dev_tableB_sent_processed.pkl'
config['test_supervision'] = path_base + f'data/SQuAD/dev_sent_labels.pkl'

config['arch'] = 'pretrained'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## IMDB_Wiki

In [None]:
data = 'imdb_wiki'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/imdb_wiki/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/imdb_wiki/dev_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Main

In [None]:
data = 'main_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Hard

In [None]:
data = 'hard_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Easy

In [None]:
data = 'easy_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## DM_joined

In [None]:
dm_data  = {0:"joined_abt_buy_exp_data", 
            1:"joined_amazon_google_exp_data", 
            2:"joined_beer_exp_data", 
            3:"joined_company_exp_data", 
            4:"joined_dblp_acm_exp_data", 
            5:"joined_dblp_scholar_exp_data", 
            6:"joined_dirty_dblp_acm_exp_data", 
            7:"joined_dirty_dblp_scholar_exp_data", 
            8:"joined_dirty_itunes_amazon_exp_data", 
            9:"joined_dirty_walmart_amazon_exp_data", 
            10:"joined_fodors_zagat_exp_data", 
            11:"joined_itunes_amazon_exp_data", 
            12:"joined_walmart_amazon_exp_data"}
data = 'dm_blocked'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'pretrained'
    config['bert_path']= 'distilbert-base-uncased' 
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    config['train_size'] = 1

    config['epochs'] = 1
    config['batch_size'] = 8
    config['final_size'] = 200 #useless
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 300

    config['model_name'] = f"{dm_data[i]}-{config['arch']}-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()

# Double Tower 

## MS MARCO (running with BM25 over the already 25-ed 1k)

In [None]:
data = 'MSMARCO'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['train_supervision'] = path_base + f'data/{data}/qidpidtriples.train.full.2.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['test_supervision'] = path_base + f'data/{data}/supervision_test.pkl'

config['arch'] = 'double-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/MARCO-uncased-masked-ALL-BM25' # change manuaallyyy~
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/100)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## SQuAD Sent

In [None]:
data = 'SQuAD_sent'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/SQuAD/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/SQuAD/train_tableB_sent_processed.pkl'
config['train_supervision'] = path_base + f'data/SQuAD/train_sent_triplets.pkl'

config['eval_datapath_l'] = path_base + f'data/SQuAD/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/SQuAD/dev_tableB_sent_processed.pkl'
config['test_supervision'] = path_base + f'data/SQuAD/dev_sent_labels.pkl'

config['arch'] = 'double-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## IMDB_Wiki

In [None]:
data = 'imdb_wiki'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/imdb_wiki/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/imdb_wiki/dev_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'double-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 4
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Main

In [None]:
data = 'main_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'double-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Hard

In [None]:
data = 'hard_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'double-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Easy

In [None]:
data = 'easy_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'double-triplet'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

##  DM_joined MANUALLY CHANGE COMPANY TO BATCH SIZE OF 4

In [34]:
dm_data  = {0:"joined_abt_buy_exp_data", 
            1:"joined_amazon_google_exp_data", 
            2:"joined_beer_exp_data", 
            3:"joined_company_exp_data", 
            4:"joined_dblp_acm_exp_data", 
            5:"joined_dblp_scholar_exp_data", 
            6:"joined_dirty_dblp_acm_exp_data", 
            7:"joined_dirty_dblp_scholar_exp_data", 
            8:"joined_dirty_itunes_amazon_exp_data", 
            9:"joined_dirty_walmart_amazon_exp_data", 
            10:"joined_fodors_zagat_exp_data", 
            11:"joined_itunes_amazon_exp_data", 
            12:"joined_walmart_amazon_exp_data"}
data = 'dm_blocked'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'double-triplet'
    config['bert_path']= path_base + f'/ember/pretraining/models/{dm_data[i]}-uncased-masked-ALL-BM25'
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    train_supervision = pd.read_pickle(config['train_supervision'])
    config['train_size'] = int(len(train_supervision)/1)

    config['epochs'] = 1
    if 'company' in dm_data[i]:
        config['batch_size'] = 4
    else:
        config['batch_size'] = 8
    config['final_size'] = 200
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 30

    config['model_name'] = f"{dm_data[i]}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()

python scripts/train_embedding.py -c configs/joined_abt_buy_exp_data-uncased-masked-ALL-BM25-double-triplet-611-1.json ;
python scripts/train_embedding.py -c configs/joined_amazon_google_exp_data-uncased-masked-ALL-BM25-double-triplet-631-1.json ;
python scripts/train_embedding.py -c configs/joined_beer_exp_data-uncased-masked-ALL-BM25-double-triplet-40-1.json ;
python scripts/train_embedding.py -c configs/joined_company_exp_data-uncased-masked-ALL-BM25-double-triplet-16859-1.json ;
python scripts/train_embedding.py -c configs/joined_dblp_acm_exp_data-uncased-masked-ALL-BM25-double-triplet-1332-1.json ;
python scripts/train_embedding.py -c configs/joined_dblp_scholar_exp_data-uncased-masked-ALL-BM25-double-triplet-1860-1.json ;
python scripts/train_embedding.py -c configs/joined_dirty_dblp_acm_exp_data-uncased-masked-ALL-BM25-double-triplet-1332-1.json ;
python scripts/train_embedding.py -c configs/joined_dirty_dblp_scholar_exp_data-uncased-masked-ALL-BM25-double-triplet-1860-1.json ;


In [None]:
dm_data  = {0:"joined_abt_buy_exp_data", 
            1:"joined_amazon_google_exp_data", 
            2:"joined_beer_exp_data", 
            3:"joined_company_exp_data", 
            4:"joined_dblp_acm_exp_data", 
            5:"joined_dblp_scholar_exp_data", 
            6:"joined_dirty_dblp_acm_exp_data", 
            7:"joined_dirty_dblp_scholar_exp_data", 
            8:"joined_dirty_itunes_amazon_exp_data", 
            9:"joined_dirty_walmart_amazon_exp_data", 
            10:"joined_fodors_zagat_exp_data", 
            11:"joined_itunes_amazon_exp_data", 
            12:"joined_walmart_amazon_exp_data"}
data = 'dm_blocked'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'double-triplet'
    config['bert_path']= path_base + f'/ember/pretraining/models/{dm_data[i]}-uncased-masked-ALL-BM25'
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    train_supervision = pd.read_pickle(config['train_supervision'])
    config['train_size'] = int(len(train_supervision)*15)

    config['epochs'] = 1
    if 'company' in dm_data[i]:
        config['batch_size'] = 4
    else:
        config['batch_size'] = 8
    config['final_size'] = 200
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 300

    config['model_name'] = f"{dm_data[i]}-uncased-masked-ALL-BM25-{config['arch']}-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()

# Pretrained BERT + Fine Tuning

## MS MARCO (running with BM25 over the already 25-ed 1k)

In [None]:
data = 'MSMARCO'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['train_supervision'] = path_base + f'data/{data}/qidpidtriples.train.full.2.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['test_supervision'] = path_base + f'data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']='distilbert-base-uncased' 
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/100)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-distilbert-base-uncased-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## SQuAD Sent

In [None]:
data = 'SQuAD_sent'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/SQuAD/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/SQuAD/train_tableB_sent_processed.pkl'
config['train_supervision'] = path_base + f'data/SQuAD/train_sent_triplets.pkl'

config['eval_datapath_l'] = path_base + f'data/SQuAD/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/SQuAD/dev_tableB_sent_processed.pkl'
config['test_supervision'] = path_base + f'data/SQuAD/dev_sent_labels.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-distilbert-base-uncased-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## IMDB_Wiki

In [None]:
data = 'imdb_wiki'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/imdb_wiki/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/imdb_wiki/dev_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-distilbert-base-uncased-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Main Fuzzy

In [None]:
data = 'main_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-distilbert-base-uncased-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Hard Fuzzy

In [None]:
data = 'hard_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-distilbert-base-uncased-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Easy Fuzzy

In [None]:
data = 'easy_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'single-triplet'
config['bert_path']= 'distilbert-base-uncased'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

train_supervision = pd.read_pickle(config['train_supervision'])
config['train_size'] = int(len(train_supervision)/1)

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-distilbert-base-uncased-{config['arch']}-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## DM_joined

In [33]:
dm_data  = {0:"joined_abt_buy_exp_data", 
            1:"joined_amazon_google_exp_data", 
            2:"joined_beer_exp_data", 
            3:"joined_company_exp_data", 
            4:"joined_dblp_acm_exp_data", 
            5:"joined_dblp_scholar_exp_data", 
            6:"joined_dirty_dblp_acm_exp_data", 
            7:"joined_dirty_dblp_scholar_exp_data", 
            8:"joined_dirty_itunes_amazon_exp_data", 
            9:"joined_dirty_walmart_amazon_exp_data", 
            10:"joined_fodors_zagat_exp_data", 
            11:"joined_itunes_amazon_exp_data", 
            12:"joined_walmart_amazon_exp_data"}
data = 'dm_blocked'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'single-triplet'
    config['bert_path']= 'distilbert-base-uncased'
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    train_supervision = pd.read_pickle(config['train_supervision'])
    config['train_size'] = int(len(train_supervision)/1)

    config['epochs'] = 1
    config['batch_size'] = 8
    config['final_size'] = 200
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 30

    config['model_name'] = f"{dm_data[i]}-distilbert-base-uncased-{config['arch']}-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()

python scripts/train_embedding.py -c configs/joined_abt_buy_exp_data-distilbert-base-uncased-single-triplet-611-1.json ;
python scripts/train_embedding.py -c configs/joined_amazon_google_exp_data-distilbert-base-uncased-single-triplet-631-1.json ;
python scripts/train_embedding.py -c configs/joined_beer_exp_data-distilbert-base-uncased-single-triplet-40-1.json ;
python scripts/train_embedding.py -c configs/joined_company_exp_data-distilbert-base-uncased-single-triplet-16859-1.json ;
python scripts/train_embedding.py -c configs/joined_dblp_acm_exp_data-distilbert-base-uncased-single-triplet-1332-1.json ;
python scripts/train_embedding.py -c configs/joined_dblp_scholar_exp_data-distilbert-base-uncased-single-triplet-1860-1.json ;
python scripts/train_embedding.py -c configs/joined_dirty_dblp_acm_exp_data-distilbert-base-uncased-single-triplet-1332-1.json ;
python scripts/train_embedding.py -c configs/joined_dirty_dblp_scholar_exp_data-distilbert-base-uncased-single-triplet-1860-1.json ;


In [None]:
dm_data  = {0:"joined_abt_buy_exp_data", 
            1:"joined_amazon_google_exp_data", 
            2:"joined_beer_exp_data", 
            3:"joined_company_exp_data", 
            4:"joined_dblp_acm_exp_data", 
            5:"joined_dblp_scholar_exp_data", 
            6:"joined_dirty_dblp_acm_exp_data", 
            7:"joined_dirty_dblp_scholar_exp_data", 
            8:"joined_dirty_itunes_amazon_exp_data", 
            9:"joined_dirty_walmart_amazon_exp_data", 
            10:"joined_fodors_zagat_exp_data", 
            11:"joined_itunes_amazon_exp_data", 
            12:"joined_walmart_amazon_exp_data"}
data = 'dm_blocked'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'single-triplet'
    config['bert_path']= 'distilbert-base-uncased'
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    train_supervision = pd.read_pickle(config['train_supervision'])
    config['train_size'] = int(len(train_supervision)*15)

    config['epochs'] = 1
    config['batch_size'] = 8
    config['final_size'] = 200
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 300

    config['model_name'] = f"{dm_data[i]}-distilbert-base-uncased-{config['arch']}-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()

# BM25-MLM BERT (no fine tuning)

## DeepMatcher

In [None]:
dm_data  = {0:"abt_buy_exp_data", 
            1:"amazon_google_exp_data", 
            2:"beer_exp_data", 
            3:"company_exp_data", 
            4:"dblp_acm_exp_data", 
            5:"dblp_scholar_exp_data", 
            6:"dirty_dblp_acm_exp_data", 
            7:"dirty_dblp_scholar_exp_data", 
            8:"dirty_itunes_amazon_exp_data", 
            9:"dirty_walmart_amazon_exp_data", 
            10:"fodors_zagat_exp_data", 
            11:"itunes_amazon_exp_data", 
            12:"walmart_amazon_exp_data"}
data = 'deepmatcher'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/deepmatcher/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/deepmatcher/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/deepmatcher/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/deepmatcher/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/deepmatcher/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/deepmatcher/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'pretrained'
    config['bert_path']= path_base + f'/ember/pretraining/models/{dm_data[i]}-uncased-masked-ALL-BM25'
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    config['train_size'] = 1

    config['epochs'] = 1
    config['batch_size'] = 8
    config['final_size'] = 200 #useless
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 300

    config['model_name'] = f"{dm_data[i]}-{config['arch']}-MLMBM25-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()

## MS MARCO (running with BM25 over the already 25-ed 1k)

In [None]:
data = 'MSMARCO'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['train_supervision'] = path_base + f'data/{data}/qidpidtriples.train.full.2.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/tableB_processed.pkl'
config['test_supervision'] = path_base + f'data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= path_base + f'/ember/pretraining/models/MARCO-uncased-masked-ALL-BM25' # change manuaallyyy~
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-MLMBM25-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## SQuAD Sent

In [None]:
data = 'SQuAD_sent'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/SQuAD/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/SQuAD/train_tableB_sent_processed.pkl'
config['train_supervision'] = path_base + f'data/SQuAD/train_sent_triplets.pkl'

config['eval_datapath_l'] = path_base + f'data/SQuAD/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/SQuAD/dev_tableB_sent_processed.pkl'
config['test_supervision'] = path_base + f'data/SQuAD/dev_sent_labels.pkl'

config['arch'] = 'pretrained'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-MLMBM25-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## IMDB_Wiki

In [None]:
data = 'imdb_wiki'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/imdb_wiki/dev_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/imdb_wiki/dev_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-MLMBM25-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Main

In [None]:
data = 'main_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-MLMBM25-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Hard

In [None]:
data = 'hard_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-MLMBM25-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## Fuzzy Easy

In [None]:
data = 'easy_fuzzy'

config = defaultdict(dict) 

config['data'] = data
config['datapath_l'] = path_base + f'data/{data}/train_tableA_processed.pkl'
config['datapath_r'] = path_base + f'data/{data}/train_tableB_processed.pkl'
config['train_supervision'] = path_base + f'/data/{data}/supervision_train.pkl'

config['eval_datapath_l'] = path_base + f'data/{data}/test_tableA_processed.pkl'
config['eval_datapath_r'] = path_base + f'data/{data}/test_tableB_processed.pkl'
config['test_supervision'] = path_base + f'/data/{data}/supervision_test.pkl'

config['arch'] = 'pretrained'
config['bert_path']= path_base + f'/ember/pretraining/models/{data}-uncased-masked-ALL-BM25'
config['column'] = "merged_all"
config['tokenizer'] = 'distilbert-base-uncased'

config['train_size'] = 1

config['epochs'] = 1
config['batch_size'] = 8
config['final_size'] = 200
config['lr'] = .00001
config['loss'] = 'triplet'
config['tl_margin'] = 1.0
config['tl_p'] = 2
config['pool_type'] = "CLS"
config['tokenizer_max_length'] = 512

config['knn_k'] = 300

config['model_name'] = f"{data}-{config['arch']}-MLMBM25-{config['train_size']}-{config['epochs']}"

config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
print(config_path)
print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json")
save_config(config_path)
load_config(config_path)

## DM_joined

In [None]:
dm_data  = {0:"joined_abt_buy_exp_data", 
            1:"joined_amazon_google_exp_data", 
            2:"joined_beer_exp_data", 
            3:"joined_company_exp_data", 
            4:"joined_dblp_acm_exp_data", 
            5:"joined_dblp_scholar_exp_data", 
            6:"joined_dirty_dblp_acm_exp_data", 
            7:"joined_dirty_dblp_scholar_exp_data", 
            8:"joined_dirty_itunes_amazon_exp_data", 
            9:"joined_dirty_walmart_amazon_exp_data", 
            10:"joined_fodors_zagat_exp_data", 
            11:"joined_itunes_amazon_exp_data", 
            12:"joined_walmart_amazon_exp_data"}
data = 'dm_blocked'

for i in dm_data:
    config = defaultdict(dict) 

    config['data'] = data
    config['datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['train_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_train.pkl'

    config['eval_datapath_l'] = path_base + f'data/{data}/{dm_data[i]}/tableA_processed.pkl'
    config['eval_datapath_r'] = path_base + f'data/{data}/{dm_data[i]}/tableB_processed.pkl'
    config['test_supervision'] = path_base + f'data/{data}/{dm_data[i]}/supervision_test.pkl'

    config['arch'] = 'pretrained'
    config['bert_path']= path_base + f'/ember/pretraining/models/{dm_data[i]}-uncased-masked-ALL-BM25'
    config['column'] = "merged_all"
    config['tokenizer'] = 'distilbert-base-uncased'

    config['train_size'] = 1

    config['epochs'] = 1
    config['batch_size'] = 8
    config['final_size'] = 200 #useless
    config['lr'] = .00001
    config['loss'] = 'triplet'
    config['tl_margin'] = 1.0
    config['tl_p'] = 2
    config['pool_type'] = "CLS"
    config['tokenizer_max_length'] = 512

    config['knn_k'] = 300

    config['model_name'] = f"{dm_data[i]}-{config['arch']}-MLMBM25-{config['train_size']}-{config['epochs']}"

    config_path = path_base + f"ember/embedding/configs/{config['model_name']}.json"
    #print(config_path)
    print(f"python scripts/train_embedding.py -c configs/{config['model_name']}.json ;")
    save_config(config_path)
    load_config(config_path)
    #print()