In [1]:
# Widen the notebook container so wide outputs are easier to read.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [2]:
import os

# Configure CUDA device ordering / visibility through environment variables
# (set here, ahead of the torch import further down the notebook).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "1",
})

In [3]:
import wandb
# Log in to Weights & Biases up front; the experiment cells call wandb.init().
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msahaana[0m (use `wandb login --relogin` to force relogin)


True

# Imports 

In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict

import matplotlib.pyplot as plt
from transformers import AutoTokenizer, DistilBertModel

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [45]:
import sys
# Project helper modules are imported straight from a utils directory that is
# appended to sys.path. The commented-out lines below are the earlier
# location / module names, kept for reference.
#sys.path.append('/lfs/1/sahaana/enrichment/enrich/utils')
sys.path.append('/lfs/1/sahaana/enrichment/ember/utils')


from embedding_datasets import DeepMatcherDataset, EmberEvalDataset
from embedding_models import TripletSingleBERTModel
from embedding_utils import param_header, tokenize_batch  
from embedding_runner import train_model, eval_model
# Earlier module names for the imports above:
#from model_utils import MatchedDatasetTriplets, param_header_bert, tokenize_batch   
#from models import BatchedTripletSingleTowerModel, BatchedTripletSingleBERTModel
#from model_runner import train_model, eval_model
from knn_utils import FaissKNeighbors, knn_top_1_PRFS, knn_deepmatcher_recall #, knn_matching_accuracy, find_perfect_recall

In [6]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# BUMBUM

In [7]:
# DeepMatcher dataset directory names, keyed by their position (0..12) so the
# experiment loops can address either all of them or an integer sub-range.
datasets = dict(enumerate([
    "abt_buy_exp_data",
    "amazon_google_exp_data",
    "beer_exp_data",
    "company_exp_data",
    "dblp_acm_exp_data",
    "dblp_scholar_exp_data",
    "dirty_dblp_acm_exp_data",
    "dirty_dblp_scholar_exp_data",
    "dirty_itunes_amazon_exp_data",
    "dirty_walmart_amazon_exp_data",
    "fodors_zagat_exp_data",
    "itunes_amazon_exp_data",
    "walmart_amazon_exp_data",
]))

In [9]:
# Fine-tune one TripletSingleBERTModel per DeepMatcher dataset and print the
# top-1 kNN precision / recall / F1 measured against the test split.

# --- Loop-invariant configuration, set once instead of on every iteration ---
data_root = '/lfs/1/sahaana/enrichment/data/deepmatcher'
pretrained_root = '/lfs/1/sahaana/enrichment/ember/pretraining/models'

epochs = 1
batch_size = 8
final_size = 200
lr = .00001
tl_margin = 1.0
tl_p = 2
pool_type = "CLS"
column = "merged_all"
shuffle = True
tokenizer_max_length = 512
compute_val = False
# The tokenizer does not depend on the dataset, so it is loaded a single time.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

for d in datasets:
    dataset_name = datasets[d]
    print(dataset_name)
    data_dir = f'{data_root}/{dataset_name}'

    # Processed left/right tables and the labelled pair splits.
    left = pd.read_pickle(f'{data_dir}/tableA_processed.pkl')
    right = pd.read_pickle(f'{data_dir}/tableB_processed.pkl')

    train_df = pd.read_csv(f'{data_dir}/train_updated.csv')
    val_df = pd.read_csv(f'{data_dir}/val_updated.csv')
    test_df = pd.read_csv(f'{data_dir}/test_updated.csv')

    # A quarter of the training rows; handed to DeepMatcherDataset below
    # (presumably as the number of examples to use -- confirm in that class).
    train_size = int(len(train_df)/4)

    # Dataset-specific pretrained checkpoint. The path is passed directly to
    # TripletSingleBERTModel; the separate DistilBertModel that used to be
    # loaded here was never referenced and has been dropped.
    bert_path = f'{pretrained_root}/{dataset_name}-uncased-masked-ALL-BM25'

    model_name = f'{dataset_name}-uncased-masked-ALL-BM25-{train_size}'

    train_data = DataLoader(DeepMatcherDataset(left, right, train_size, column, train_df),
                            batch_size=batch_size,
                            shuffle=shuffle)

    val_data = DataLoader(DeepMatcherDataset(left, right, len(val_df), column, val_df),
                          batch_size=batch_size,
                          shuffle=False)

    triplet_loss = nn.TripletMarginLoss(margin=tl_margin, p=tl_p)
    losses = []
    val_losses = []
    model = TripletSingleBERTModel(final_size, pool_type, bert_path)
    optimizer = optim.AdamW(model.parameters(), lr=lr)  # previously tried: optim.SGD(model.parameters(), lr=lr)

    save_dir = param_header(batch_size, final_size, lr, pool_type, epochs, train_size)
    save_dir = f'models/{model_name}/{save_dir}/'

    # One wandb project per dataset / train-size combination.
    wandb.init(project=model_name)

    train_model(model,
                tokenizer,
                tokenize_batch,
                train_data,
                val_data,
                triplet_loss,
                optimizer,
                epochs,
                losses,
                val_losses,
                save_dir,
                compute_val,
                tokenizer_max_length=tokenizer_max_length)

    # Embed both tables with the fine-tuned model (order preserved: shuffle=False).
    left_data = DataLoader(EmberEvalDataset(left, column),
                           batch_size=batch_size,
                           shuffle=False)
    right_data = DataLoader(EmberEvalDataset(right, column),
                            batch_size=batch_size,
                            shuffle=False)

    # Use the same max length as training rather than a second hard-coded 512.
    left_embeddings = eval_model(model, tokenizer, left_data, tokenizer_max_length=tokenizer_max_length)
    right_embeddings = eval_model(model, tokenizer, right_data, tokenizer_max_length=tokenizer_max_length)

    # Index the right-table embeddings and query them with the left-table ones.
    knn = FaissKNeighbors(k=30)
    knn.fit(right_embeddings)
    neib = knn.kneighbors(left_embeddings)
    print(dataset_name)
    print(f"precision, recall, F1, support: {knn_top_1_PRFS(neib[0], neib[1], test_df)}")
    print()
    print()
    print()


abt_buy_exp_data


In [9]:
#testing for the script
# Same pipeline as the full experiment loop, but reading the pickled
# supervision files, skipping validation, and stopping after the first dataset.

# --- Loop-invariant configuration, set once instead of on every iteration ---
data_root = '/lfs/1/sahaana/enrichment/data/deepmatcher'
pretrained_root = '/lfs/1/sahaana/enrichment/ember/pretraining/models'

epochs = 1
batch_size = 8
final_size = 200
lr = .00001
tl_margin = 1.0
tl_p = 2
pool_type = "CLS"
column = "merged_all"
shuffle = True
tokenizer_max_length = 512
compute_val = False
# The tokenizer does not depend on the dataset, so it is loaded a single time.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

for d in datasets:
    dataset_name = datasets[d]
    print(dataset_name)
    data_dir = f'{data_root}/{dataset_name}'

    # Processed left/right tables and the pickled supervision splits.
    left = pd.read_pickle(f'{data_dir}/tableA_processed.pkl')
    right = pd.read_pickle(f'{data_dir}/tableB_processed.pkl')

    train_df = pd.read_pickle(f'{data_dir}/supervision_train.pkl')
    test_df = pd.read_pickle(f'{data_dir}/supervision_test.pkl')

    # A quarter of the training rows; handed to DeepMatcherDataset below
    # (presumably as the number of examples to use -- confirm in that class).
    train_size = int(len(train_df)/4)

    # Dataset-specific pretrained checkpoint. The path is passed directly to
    # TripletSingleBERTModel; the separate DistilBertModel that used to be
    # loaded here was never referenced and has been dropped.
    bert_path = f'{pretrained_root}/{dataset_name}-uncased-masked-ALL-BM25'

    model_name = f'{dataset_name}-uncased-masked-ALL-BM25-{train_size}'

    train_data = DataLoader(DeepMatcherDataset(left, right, train_size, column, train_df),
                            batch_size=batch_size,
                            shuffle=shuffle)

    # No validation split in this variant (compute_val is False as well).
    val_data = None

    triplet_loss = nn.TripletMarginLoss(margin=tl_margin, p=tl_p)
    losses = []
    val_losses = []
    model = TripletSingleBERTModel(final_size, pool_type, bert_path)
    optimizer = optim.AdamW(model.parameters(), lr=lr)  # previously tried: optim.SGD(model.parameters(), lr=lr)

    save_dir = param_header(batch_size, final_size, lr, pool_type, epochs, train_size)
    save_dir = f'models/{model_name}/{save_dir}/'

    wandb.init(project=model_name)

    train_model(model,
                tokenizer,
                tokenize_batch,
                train_data,
                val_data,
                triplet_loss,
                optimizer,
                epochs,
                losses,
                val_losses,
                save_dir,
                compute_val,
                tokenizer_max_length=tokenizer_max_length)

    left_data = DataLoader(EmberEvalDataset(left, column, indexed=True),
                           batch_size=batch_size,
                           shuffle=False)
    right_data = DataLoader(EmberEvalDataset(right, column, indexed=True),
                            batch_size=batch_size,
                            shuffle=False)

    # With indexed=True datasets, eval_model is unpacked elsewhere in this
    # notebook as an (index, embeddings) pair, so do the same here instead of
    # binding the whole pair to *_embeddings and handing it to knn.fit.
    # NOTE(review): confirm against the current eval_model implementation.
    left_index, left_embeddings = eval_model(model, tokenizer, left_data,
                                             tokenizer_max_length=tokenizer_max_length)
    right_index, right_embeddings = eval_model(model, tokenizer, right_data,
                                               tokenizer_max_length=tokenizer_max_length)

    # Index the right-table embeddings and query them with the left-table ones.
    knn = FaissKNeighbors(k=30)
    knn.fit(right_embeddings)
    neib = knn.kneighbors(left_embeddings)
    print(dataset_name)
    print(f"precision, recall, F1, support: {knn_top_1_PRFS(neib[0], neib[1], test_df, left_index, right_index)}")
    print()
    print()
    print()
    # Script smoke test: only the first dataset is processed.
    break

abt_buy_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,180.0
_runtime,32.0
_timestamp,1613879593.0
train batch loss,0.52706


0,1
Epoch,▁
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train batch loss,▇██▅▆▄▄▆▅▆▅▃▂▃▅▆▃▄▅▄▂▃▄▂▅▂▄▂▃▂▂▁▂▂▁▂▂▂▁▅


[34m[1mwandb[0m: wandb version 0.10.19 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Saved Model: models/abt_buy_exp_data-uncased-masked-ALL-BM25-1435/batch_size-8-final_size-200-opt_lr-1e-05-pooling-CLS-epochs-1-train-1435/19-54-20-02-21
abt_buy_exp_data
precision, recall, F1, support: ((0.9285714285714286, 0.8203883495145631, 0.8711340206185566, None), [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0

In [23]:
    # Re-embed both tables with index tracking, then rebuild the kNN structure
    # over the right-table embeddings and query it with the left-table ones.
    # Relies on `left`, `right`, `column`, `batch_size`, `model` and `tokenizer`
    # still being bound from the training cell.
    left_data, right_data = (
        DataLoader(EmberEvalDataset(table, column, indexed=True),
                   batch_size=batch_size,
                   shuffle=False)
        for table in (left, right)
    )

    # Each call is unpacked as an (index, embeddings) pair.
    left_index, left_embeddings = eval_model(
        model, tokenizer, left_data, tokenizer_max_length=512)
    right_index, right_embeddings = eval_model(
        model, tokenizer, right_data, tokenizer_max_length=512)

    knn = FaissKNeighbors(k=30)
    knn.fit(right_embeddings)
    neib = knn.kneighbors(left_embeddings)

In [24]:
# Inspect the left-table embeddings produced by eval_model in the previous cell.
left_embeddings

array([[ 0.2700619 ,  0.05895563, -0.48198986, ...,  0.26641858,
        -0.19231737, -0.6018808 ],
       [ 1.0032096 ,  0.28362516, -0.6229593 , ..., -0.17669171,
         0.05642968, -0.29127803],
       [-0.147342  , -0.30044255, -0.32882872, ..., -0.08050206,
         0.32331964, -0.45460165],
       ...,
       [-0.12334809, -0.13174953,  0.10212716, ..., -0.1475614 ,
         0.17826308, -0.02833558],
       [ 0.16046405, -0.00719314, -0.3470052 , ..., -0.22556528,
         0.1236966 , -0.36079985],
       [-0.31106687,  0.34422   , -0.8377285 , ...,  0.16381264,
         0.34641707, -0.47452974]], dtype=float32)

In [27]:
# Inspect the index array returned alongside the right-table embeddings.
right_index

array([   0,    1,    2, ..., 1089, 1090, 1091])

In [35]:
# The knn_top_1_TEST definition in this notebook takes `k`; k=1 keeps, for each
# row, every neighbour whose distance ties with the closest one.
knn_top_1_TEST(neib[0], neib[1], test_df, left_index, right_index, k=1)

NameError: name 'precision_recall_fscore_support' is not defined

In [39]:
# neib[0] is what gets passed as `dists` to the knn_* helpers. The slice [:, :1]
# is a single column, so this row-wise minimum is simply each row's first entry.
np.min(neib[0][:,:1], axis=1)

array([ 4.7910423, 17.559998 ,  7.18272  , ...,  7.7182236, 11.385677 ,
        5.847275 ], dtype=float32)

In [40]:
# First element of the kneighbors() result (used as the `dists` argument below).
neib[0]

array([[ 4.7910423, 11.013393 , 14.370218 , ..., 20.057037 , 20.16292  ,
        20.249737 ],
       [17.559998 , 17.996239 , 19.441633 , ..., 22.850372 , 22.924067 ,
        22.931057 ],
       [ 7.18272  , 11.802906 , 16.052406 , ..., 20.770567 , 20.799702 ,
        20.914497 ],
       ...,
       [ 7.7182236, 17.346403 , 18.087666 , ..., 22.572784 , 22.642424 ,
        22.656689 ],
       [11.385677 , 14.969315 , 15.236919 , ..., 20.900738 , 21.362034 ,
        22.040407 ],
       [ 5.847275 ,  7.9882774,  8.280563 , ..., 23.93537  , 24.098015 ,
        24.328302 ]], dtype=float32)

In [43]:
# Top-1 precision/recall/F1/support against the test supervision, passing the
# left/right index arrays explicitly.
# NOTE(review): the result also carries a long per-pair 0/1 list; consider
# displaying only the metrics tuple to keep the notebook readable.
knn_top_1_PRFS(neib[0], neib[1], test_df, left_index, right_index)

((0.9285714285714286, 0.8203883495145631, 0.8711340206185566, None),
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
 

In [73]:
# Threshold-based variant of the evaluation.
# NOTE(review): thresh = 9 is an unexplained magic number -- presumably a
# distance cut-off on neib[0]; confirm in knn_utils and document the choice.
knn_deepmatcher_recall(neib[0], neib[1], test_df, left_index, right_index, thresh = 9)

(0.640495867768595,
 0.7524271844660194,
 0.6919642857142857,
 None,
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
 

In [32]:
# Same evaluation without the explicit index arrays; the recorded metrics match
# the indexed call elsewhere in this notebook.
knn_top_1_PRFS(neib[0], neib[1], test_df)

((0.9285714285714286, 0.8203883495145631, 0.8711340206185566, None),
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
 

In [57]:
# Scratch check with k = 2: returns, per left row, the set of right ids whose
# distance is within that row's two smallest distances.
knn_top_1_TEST(neib[0], neib[1], test_df, left_index, right_index, k = 2)

defaultdict(set,
            {0: {53, 710},
             1: {138, 154},
             2: {33, 121},
             3: {58, 195},
             4: {114, 153},
             5: {191, 389},
             6: {213, 223},
             7: {161, 1056},
             8: {37, 338},
             9: {57, 179},
             10: {34, 35},
             11: {34, 35},
             12: {58, 197},
             13: {58, 197},
             14: {44, 595},
             15: {56, 640},
             16: {376, 378},
             17: {223, 845},
             18: {104, 211},
             19: {39, 40},
             20: {39, 40},
             21: {196, 208},
             22: {62, 221},
             23: {483, 491},
             24: {42, 453},
             25: {20, 21},
             26: {201, 755},
             27: {47, 71},
             28: {47, 48},
             29: {41, 43},
             30: {41, 43},
             31: {643, 886},
             32: {50, 177},
             33: {198, 361},
             34: {169, 324},
       

In [56]:
def knn_top_1_TEST(dists: np.array, 
                   neibs: np.array, 
                   supervision: pd.DataFrame,
                   left_indexing: np.array,
                   right_indexing: np.array,
                   k: int = None,
                   thresh: float = None):
    """Collect, for every left row, the set of selected right-side neighbour ids.

    Scratch/debug helper: it returns the neighbour sets themselves rather than
    precision/recall numbers.

    Args:
        dists: 2-D array of neighbour distances, one row per left item.
        neibs: 2-D integer array, same shape as ``dists``, holding positions
            into ``right_indexing``.
        supervision: Accepted for signature compatibility with the other
            ``knn_*`` helpers; not used, because this helper only returns the
            neighbour sets.
        left_indexing: Maps a row position of ``dists`` to the left item's id;
            the returned mapping is keyed by these ids.
        right_indexing: Maps a neighbour position to the right item's id.
        k: If given, keep every neighbour whose distance is no larger than the
            largest of the row's first ``k`` distances (so ties are kept).
            Takes precedence over ``thresh``.
        thresh: Used only when ``k`` is None; keep every neighbour whose
            distance is ``<= thresh``.

    Returns:
        ``defaultdict(set)`` mapping left id -> set of selected right ids.

    Raises:
        ValueError: if neither ``k`` nor ``thresh`` is supplied.
    """
    if k is None and thresh is None:
        raise ValueError("knn_top_1_TEST requires either `k` or `thresh`")

    # Translate neighbour positions into right-side ids.
    neibs = right_indexing[neibs]

    if k is not None:
        # Per-row cut-off, broadcast over columns, so equidistant ties are kept.
        cutoff = np.max(dists[:, :k], axis=1)[:, None]
    else:
        cutoff = thresh
    rows, cols = np.where(dists <= cutoff)

    top_index = defaultdict(set)
    for i, j in zip(rows, cols):
        # Key by the left item's id (not the raw row position) so the mapping
        # lines up with ids used in supervision tables.
        top_index[left_indexing[i]].add(neibs[i, j])
    return top_index

In [None]:
def knn_IMDB_wiki_recall(dists: np.array,
                         neibs: np.array,
                         supervision: pd.DataFrame,
                         left_indexing: np.array,
                         right_indexing: np.array,
                         k: int = None,
                         thresh: float = None):
    """Recall and mean reciprocal rank of the true QID among retrieved neighbours.

    Args:
        dists: 2-D array of neighbour distances, one row per left item. Only
            consulted when filtering by ``thresh``.
        neibs: 2-D integer array of positions into ``right_indexing``, one row
            per left item.
        supervision: DataFrame with an ``IMDB_ID`` column (left ids) and a
            ``QID`` column (the true right id for that left item).
        left_indexing: Maps a row position of ``neibs`` to its ``IMDB_ID``.
        right_indexing: Maps a neighbour position to the right item's id.
        k: If given, only the first ``k`` neighbours of each row are
            candidates. Takes precedence over ``thresh``.
        thresh: Used only when ``k`` is None; candidates are the neighbours
            whose distance is ``<= thresh``. Ranks are counted among the
            retained neighbours, which matches the original rank when each
            row of ``dists`` is sorted ascending -- assumed, confirm against
            the kNN backend.

        With neither ``k`` nor ``thresh``, every neighbour is a candidate.

    Returns:
        Tuple ``(recall, n_hits, mean_reciprocal_rank, results, MRR_results)``
        where ``results`` holds a 0/1 hit flag per left row and ``MRR_results``
        the reciprocal rank of the hit (0.0 when the true id was not found).
    """
    mode = "QID"
    truth = supervision.set_index('IMDB_ID')[mode]

    # `[:, :None]` keeps every column, so neighbour positions are always mapped
    # to right-side ids -- including when k is None.
    candidates = right_indexing[neibs[:, :k]]
    within = dists <= thresh if (k is None and thresh is not None) else None

    results = []
    MRR_results = []
    for idx, row in enumerate(candidates):
        if within is not None:
            row = row[within[idx]]

        true_match = truth.loc[left_indexing[idx]]
        match = 0
        reciprocal_rank = 0.  # stays 0 for a miss or an empty candidate row
        for rank, entry in enumerate(row, start=1):
            if entry == true_match:
                match = 1
                reciprocal_rank = 1. / rank
                break
        results.append(match)
        MRR_results.append(reciprocal_rank)
    return np.mean(results), np.sum(results), np.mean(MRR_results), results, MRR_results 

In [8]:
# Fine-tune one TripletSingleBERTModel per DeepMatcher dataset (ids 3..12),
# using every training row, and print the top-1 kNN precision / recall / F1
# measured against the test split.

# --- Loop-invariant configuration, set once instead of on every iteration ---
data_root = '/lfs/1/sahaana/enrichment/data/deepmatcher'
pretrained_root = '/lfs/1/sahaana/enrichment/ember/pretraining/models'

epochs = 1
batch_size = 8
final_size = 200
lr = .00001
tl_margin = 1.0
tl_p = 2
pool_type = "CLS"
column = "merged_all"
shuffle = True
tokenizer_max_length = 512
compute_val = False
# The tokenizer does not depend on the dataset, so it is loaded a single time.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

for d in range(3,13):
    dataset_name = datasets[d]
    print(dataset_name)
    data_dir = f'{data_root}/{dataset_name}'

    # Processed left/right tables and the labelled pair splits.
    left = pd.read_pickle(f'{data_dir}/tableA_processed.pkl')
    right = pd.read_pickle(f'{data_dir}/tableB_processed.pkl')

    train_df = pd.read_csv(f'{data_dir}/train_updated.csv')
    val_df = pd.read_csv(f'{data_dir}/val_updated.csv')
    test_df = pd.read_csv(f'{data_dir}/test_updated.csv')

    # All training rows in this run.
    train_size = len(train_df)

    # Dataset-specific pretrained checkpoint. The path is passed directly to
    # TripletSingleBERTModel; the separate DistilBertModel that used to be
    # loaded here was never referenced and has been dropped.
    bert_path = f'{pretrained_root}/{dataset_name}-uncased-masked-ALL-BM25'

    model_name = f'{dataset_name}-uncased-masked-ALL-BM25-{train_size}'

    train_data = DataLoader(DeepMatcherDataset(left, right, train_size, column, train_df),
                            batch_size=batch_size,
                            shuffle=shuffle)

    val_data = DataLoader(DeepMatcherDataset(left, right, len(val_df), column, val_df),
                          batch_size=batch_size,
                          shuffle=False)

    triplet_loss = nn.TripletMarginLoss(margin=tl_margin, p=tl_p)
    losses = []
    val_losses = []
    model = TripletSingleBERTModel(final_size, pool_type, bert_path)
    optimizer = optim.AdamW(model.parameters(), lr=lr)  # previously tried: optim.SGD(model.parameters(), lr=lr)

    save_dir = param_header(batch_size, final_size, lr, pool_type, epochs, train_size)
    save_dir = f'models/{model_name}/{save_dir}/'

    # One wandb project per dataset / train-size combination.
    wandb.init(project=model_name)

    train_model(model,
                tokenizer,
                tokenize_batch,
                train_data,
                val_data,
                triplet_loss,
                optimizer,
                epochs,
                losses,
                val_losses,
                save_dir,
                compute_val,
                tokenizer_max_length=tokenizer_max_length)

    # Embed both tables with the fine-tuned model (order preserved: shuffle=False).
    left_data = DataLoader(EmberEvalDataset(left, column),
                           batch_size=batch_size,
                           shuffle=False)
    right_data = DataLoader(EmberEvalDataset(right, column),
                            batch_size=batch_size,
                            shuffle=False)

    # Use the same max length as training rather than a second hard-coded 512.
    left_embeddings = eval_model(model, tokenizer, left_data, tokenizer_max_length=tokenizer_max_length)
    right_embeddings = eval_model(model, tokenizer, right_data, tokenizer_max_length=tokenizer_max_length)

    # Index the right-table embeddings and query them with the left-table ones.
    knn = FaissKNeighbors(k=30)
    knn.fit(right_embeddings)
    neib = knn.kneighbors(left_embeddings)
    print(dataset_name)
    print(f"precision, recall, F1, support: {knn_top_1_PRFS(neib[0], neib[1], test_df)}")
    print()
    print()
    print()

company_exp_data


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


company_exp_data
precision, recall, F1, support: (0.9941223617419183, 0.6597517730496454, 0.7931365234999467, None)



dblp_acm_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,8450.0
_runtime,4337.0
_timestamp,1612997681.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train batch loss,▂█▂▃▁▂▃▁▁▄▄▁▁▁▂▁▄▁▂▂▁▃▃▁▁▂▁▁▁▃▁▁▂▁▁▁▁▁▁▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


dblp_acm_exp_data
precision, recall, F1, support: (0.9608695652173913, 0.9954954954954955, 0.9778761061946903, None)



dblp_scholar_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,928.0
_runtime,111.0
_timestamp,1612998260.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train batch loss,█▁▁▁▁▁▃▁▁▁▁▁▂▄▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


dblp_scholar_exp_data
precision, recall, F1, support: (0.9459459459459459, 0.4252336448598131, 0.5867182462927143, None)



dirty_dblp_acm_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,2153.0
_runtime,246.0
_timestamp,1612998525.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train batch loss,█▅█▂▁▁▅▁▃▂▁▁▁▁▁▁▁▁▇▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▃▅▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


dirty_dblp_acm_exp_data
precision, recall, F1, support: (0.9648351648351648, 0.9887387387387387, 0.9766407119021134, None)



dirty_dblp_scholar_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,928.0
_runtime,115.0
_timestamp,1612998786.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train batch loss,█▃▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


dirty_dblp_scholar_exp_data
precision, recall, F1, support: (0.9481327800829875, 0.42710280373831777, 0.5889175257731959, None)



dirty_itunes_amazon_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,2153.0
_runtime,252.0
_timestamp,1612999058.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train batch loss,▇▃█▃▁█▅▂▂▂▂▃▅▁▁▂▁▁▁▁▁▁▁▂▁▁▂▁▁▁▆▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


dirty_itunes_amazon_exp_data
precision, recall, F1, support: (1.0, 0.2222222222222222, 0.3636363636363636, None)



dirty_walmart_amazon_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,41.0
_runtime,13.0
_timestamp,1612999223.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
_runtime,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇███
train batch loss,█▇▆▇▅▇▁▁▁▂▁▁▂▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▂▁▁▁▃▁▁▁▁▁▁▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


dirty_walmart_amazon_exp_data
precision, recall, F1, support: (0.6, 0.7461139896373057, 0.6651270207852193, None)



fodors_zagat_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,768.0
_runtime,95.0
_timestamp,1612999465.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train batch loss,█▂▃▃▁▅▄▃▅▃▆▃▁▁▃▂▁▁▄▃▁▁▁▁▁▁▂▂▁▁▁▂▂▁▁▁▁▂▁▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


fodors_zagat_exp_data
precision, recall, F1, support: (0.7777777777777778, 0.9545454545454546, 0.8571428571428572, None)



itunes_amazon_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,71.0
_runtime,15.0
_timestamp,1612999540.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_timestamp,▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train batch loss,█▇▆▃▅▄▃▄▄▅▃▂▂▆▁▅▃▂▃▃▂▃▁▁▁▃▂▁▁▂▁▁▁▂▁▁▁▁▁▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


itunes_amazon_exp_data
precision, recall, F1, support: (0.8888888888888888, 0.5925925925925926, 0.711111111111111, None)



walmart_amazon_exp_data


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Epoch,0.0
_step,41.0
_runtime,12.0
_timestamp,1612999562.0
train batch loss,0.0


0,1
Epoch,▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
_runtime,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇▇███
train batch loss,█▁▃▃▂▁▃▁▁▁▂▃▁▁▁▁▁▂▁▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


walmart_amazon_exp_data
precision, recall, F1, support: (0.7089201877934272, 0.7823834196891192, 0.7438423645320198, None)



