# Sentence Pair Classification

In [1]:
import os
import sys
import logging
import torch
import numpy as np
import argparse
from math import exp

from torch import nn
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim

from random import random as rand


from transformers import BertTokenizer, BertForNextSentencePrediction
from transformers import AutoTokenizer, AutoModelForMaskedLM,AutoModel
#import sentence_pair_utils
#import sentence_pair_evaluation
import transformers
import pandas as pd

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [2]:
transformers.logging.set_verbosity_error()

LOG_FORMAT = '%(asctime)s : %(filename)s : %(funcName)s : %(levelname)s : %(message)s'
logging.basicConfig(filename='SentencePairCLS.log', level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger("SentencePairCLS")

In [5]:
import pickle
def pkl_vars(varname, filename):
    """Pickle ``varname`` and write the resulting bytes to ``filename``.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        sink.write(pickle.dumps(varname))
        
def reload_vars(filename):
    """Open ``filename`` in binary mode and return the object unpickled from it.

    NOTE(review): unpickling can execute arbitrary code, so only load files
    that were written by a trusted source (e.g. ``pkl_vars`` in this notebook).
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)

## Dataset preparation

In [33]:
df_flattened_dd = pd.read_csv("parsed_disambiguation_list_without===.csv")

In [132]:
df_flattened_dd.tail()

Unnamed: 0,dd_page_title,term,definition,def_page_title,categories,plain_text_definition
1976,Definition:Zero,Zero,Let $\mathbb A$ be one of the standard number ...,Definition:Zero Mapping,['Definitions/Mapping Theory'],"Let 𝔸 be one of the standard number systems ℕ,..."
1977,Definition:Zero,Zero,Let $f: R \to R$ be a mapping on a ring $R$.\n...,Definition:Root of Mapping,"['Definitions/Roots of Mappings', 'Definitions...",Let f: R → R be a mapping on a ring R.\n\nLet ...
1978,Definition:Zero,Zero,"Let $\left( R, +_R, \times_R \right)$ be a rin...",Definition:Zero Vector,"['Definitions/Module Theory', 'Definitions/Vec...","Let ( R, +_R, ×_R ) be a ring.\n\nLet ( G, +_G..."
1979,Definition:Zero Locus,Zero Locus,Let $k$ be a field.\n\nLet $n\geq1$ be a natur...,Definition:Zero Locus of Set of Polynomials,['Definitions/Algebraic Geometry'],Let k be a field.\n\nLet n≥1 be a natural numb...
1980,Definition:Zero Locus,Zero Locus,Let $A$ be a commutative ring with unity.\n\nL...,Definition:Vanishing Set of Subset of Ring,['Definitions/Zariski Topology'],Let A be a commutative ring with unity.\n\nLet...


In [135]:
def_set = set(df_flattened_dd["plain_text_definition"])
pkl_vars(def_set, "data/vars/def_set.pkl")

title_set = set(df_flattened_dd["def_page_title"])
pkl_vars(title_set, "data/vars/title_set.pkl")

In [51]:
parsed_disamb_list = []
for key, group in df_flattened_dd.groupby("term"):
    items = group[["plain_text_definition", "def_page_title"]].to_dict(orient="records")
    parsed_disamb_list.append({"term": key, "def_item_list": items})

In [53]:
pkl_vars(parsed_disamb_list,"data/vars/parsed_disamb_list_without===.pkl")

In [60]:
parsed_disamb_list = reload_vars("data/vars/parsed_disamb_list_without===.pkl")

In [61]:
len(parsed_disamb_list)

344

In [62]:
import random

SPLIT_SEED = 42      # fixed so that Restart & Run All reproduces the same split
TEST_FRACTION = 0.2  # share of terms held out for testing

# Shuffle a copy with a dedicated, seeded RNG: the split becomes reproducible,
# `parsed_disamb_list` keeps its original order, and the global `random` state
# used elsewhere in the notebook is left untouched.
shuffled_disamb_list = list(parsed_disamb_list)
random.Random(SPLIT_SEED).shuffle(shuffled_disamb_list)

# Derive the split point from the data instead of hard-coding it
# (int(0.2 * 344) == 68 for the current dataset).
n_test_terms = int(TEST_FRACTION * len(shuffled_disamb_list))
test_disamb_list = shuffled_disamb_list[:n_test_terms]
train_disamb_list = shuffled_disamb_list[n_test_terms:]
len(train_disamb_list)

276

In [68]:
pkl_vars(train_disamb_list, 'data/vars/train_disamb_list.pkl')
pkl_vars(test_disamb_list, 'data/vars/test_disamb_list.pkl')

In [63]:
test_disamb_list[0]

{'term': 'Negation',
 'def_item_list': [{'plain_text_definition': 'The logical not or (logical) negation operator is a unary connective whose action is to reverse the truth value of the statement on which it operates.\n\n:p is defined as:\n:p is not true\n:It is not the case that p is true\n:It is false that p\n:p is false.\n\n\nThus the statement p is called the negation of p.\n\n\np is voiced not p.\n\n\n=== Truth Function ===\nThe logical not connective defines the truth function f^ as follows:\n\n \n \n \n \n\n=== Truth Table ===\nThe characteristic truth table of the negation operator p is as follows:\n\n:[ p p; F T; T F;   ]\n\n=== Boolean Interpretation ===\nLet ùêÄ be a propositional formula.\n\nLet  denote the negation operator.\n\n\nThe truth value of ùêÄ under a boolean interpretation v is given by:\n\n:v (   )ùêÄ = T    : v (   )ùêÄ = F\nF    : v (   )ùêÄ = T',
   'def_page_title': 'Definition:Logical Not'},
  {'plain_text_definition': 'The negation function is the fun

### Build the disambiguation knowledge-base dict

In [65]:
# The KB is a dictionary mapping each term (notion) to the list of its definition page titles; each title contains the term and its domain.
def make_def_kb(def_disamb_list):
    """Build a ``term -> [definition page titles]`` dictionary.

    Each element of ``def_disamb_list`` is read through its ``'term'`` key and
    its ``'def_item_list'`` key; the ``'def_page_title'`` of every item in that
    list is collected, in order. If a term occurs more than once, the last
    occurrence wins.
    """
    return {
        entry['term']: [item['def_page_title'] for item in entry['def_item_list']]
        for entry in def_disamb_list
    }

def_disamb_kb = make_def_kb(parsed_disamb_list)
len(def_disamb_kb)

344

In [66]:
pkl_vars(def_disamb_kb, 'data/vars/def_disamb_kb.pkl')

In [73]:
def flatten_def_disamb_list(disamb_list):
    """Flatten the per-term disambiguation list into one record per definition.

    Every item of every entry's ``'def_item_list'`` becomes a dict with the
    keys ``'title'`` (the item's ``'def_page_title'``), ``'term'`` (the
    entry's ``'term'``) and ``'definition'`` (the item's
    ``'plain_text_definition'``). Records keep the input order.
    """
    return [
        {
            'title': def_item['def_page_title'],
            'term': entry['term'],
            'definition': def_item['plain_text_definition'],
        }
        for entry in disamb_list
        for def_item in entry['def_item_list']
    ]

In [74]:
flattened_test_disam_list = flatten_def_disamb_list(test_disamb_list)
len(flattened_test_disam_list)

376

In [75]:
flattened_train_disamb_list = flatten_def_disamb_list(train_disamb_list)
len(flattened_train_disamb_list)

1605

In [76]:
pkl_vars(flattened_test_disam_list, 'data/vars/flattened_test_disam_list.pkl'), pkl_vars(flattened_train_disamb_list, 'data/vars/flattened_train_disamb_list.pkl')

(None, None)

In [77]:
flattened_test_disam_list = reload_vars('data/vars/flattened_test_disam_list.pkl')
df_flattened_test_disam_list = pd.DataFrame(flattened_test_disam_list)
df_flattened_test_disam_list.to_csv("data/SP_CLS/df_flattened_test_disam_list.csv", index=False)

In [193]:
flattened_train_disamb_list = reload_vars('data/vars/flattened_train_disamb_list.pkl')
df_flattened_train_disam_list = pd.DataFrame(flattened_train_disamb_list)
df_flattened_train_disam_list.to_csv("data/SP_CLS/df_flattened_train_disam_list.csv", index=False)

### GLADIS data samples

# defining DefDisambiguationBERT

## model for next sentence prediction 

In [118]:
#Copied from https://github.com/tigerchen52/GLADIS/blob/master/source/acrobert.py
def_disamb_kb = reload_vars('data/vars/def_disamb_kb.pkl')

class DefDisambiguationBERT(nn.Module):
    """Wrapper around ``BertForNextSentencePrediction`` and its tokenizer.

    In training mode ``forward`` returns a triplet loss computed from positive
    and negative text samples; in evaluation mode it returns, for each input
    text, the softmax probability of logit index 0.

    Args:
        model_name: name or path passed to ``from_pretrained``.
        from_tf: forwarded to ``BertForNextSentencePrediction.from_pretrained``.
        device: device the tokenized inputs are moved to in ``forward``.
            The constructor does not move the model itself; callers do that
            (e.g. ``model.to(device)``).
    """

    def __init__(self, model_name="bert-base-uncased", from_tf=False, device='cpu'):
        super().__init__()
        self.model_name = model_name
        self.device = device
        self.model = BertForNextSentencePrediction.from_pretrained(model_name, from_tf=from_tf)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def _encode(self, texts):
        """Tokenize ``texts`` (padded, truncated) and move every tensor to ``self.device``."""
        encoding = self.tokenizer(texts, padding=True, return_tensors='pt', truncation=True)
        # Keep the whole encoding (not just input_ids): without the attention
        # mask the model would attend to the padding tokens, so a sample's
        # score would depend on what else happens to be in its batch.
        return {name: tensor.to(self.device) for name, tensor in encoding.items()}

    def forward(self, pos_x, masked_pos_x=None, neg_x=None, train=True, margin=None):
        """Compute the training loss or the evaluation scores.

        Args:
            pos_x: list of positive text samples (or the samples to score when
                ``train`` is False).
            masked_pos_x: accepted for interface compatibility; not used.
            neg_x: list of negative text samples; required when ``train`` is True.
            train: if True return the triplet loss, otherwise return scores.
            margin: triplet-loss margin. When None, the ``margin`` attribute of
                a module-level ``args`` object is used if one exists, else 0.2.

        Returns:
            A scalar loss tensor when ``train`` is True, otherwise a 1-D tensor
            holding the softmax probability of logit index 0 for each sample.

        Raises:
            ValueError: if ``train`` is True and ``neg_x`` is None.
        """
        if not train:
            logits = self.model(**self._encode(pos_x)).logits
            return nn.Softmax(dim=1)(logits)[:, 0]

        if neg_x is None:
            raise ValueError("neg_x is required when train=True")
        if margin is None:
            # Legacy behaviour read `args.margin` from the enclosing module;
            # fall back to 0.2 when no such object is defined.
            margin = getattr(globals().get('args'), 'margin', 0.2)

        pos_logits = self.model(**self._encode(pos_x)).logits
        neg_logits = self.model(**self._encode(neg_x)).logits
        # Scores are "1 - P(index 0)": lower means a better match, so the
        # triplet loss pushes positives below negatives by at least `margin`.
        pos_scores = 1 - nn.Softmax(dim=1)(pos_logits)[:, 0]
        neg_scores = 1 - nn.Softmax(dim=1)(neg_logits)[:, 0]
        return triplet_loss(pos_scores, neg_scores, margin)
    
        
def triplet_loss(pos_score, neg_score, margin=0.2):
    """Mean hinge loss ``relu(pos_score - neg_score + margin)`` over all elements."""
    violation = pos_score - neg_score + margin
    hinge = torch.nn.functional.relu(violation)
    return torch.mean(hinge)


def softmax(elements):
    """Return the softmax probability of the FIRST element of ``elements``.

    Args:
        elements: non-empty indexable sequence of numbers (or 0-d tensors
            convertible with ``float``), e.g. one row of logits.

    Returns:
        ``exp(elements[0]) / sum(exp(e) for e in elements)`` as a Python float.

    Raises:
        IndexError: if ``elements`` is empty.
    """
    values = [float(e) for e in elements]
    head = values[0]  # raises IndexError on empty input, as before
    # Shift by the maximum before exponentiating: mathematically a no-op, but
    # it keeps exp() from overflowing on large logits.
    peak = max(values)
    total = sum(exp(v - peak) for v in values)
    return exp(head - peak) / total

def cal_nsp_score(model, tokenizer, titles, definition, batch_size):
    """Score every candidate title against one definition with an NSP model.

    Titles are paired with ``definition`` and tokenized in batches of
    ``batch_size`` (padded, truncated to 400 tokens).

    Args:
        model: callable returning an object with a ``logits`` attribute of
            shape (batch, classes) when called with the tokenizer output.
        tokenizer: callable taking two parallel lists of texts.
        titles: list of candidate titles.
        definition: the definition text paired with every title.
        batch_size: number of title/definition pairs per forward pass.

    Returns:
        A list with one Python float per title: the softmax probability of
        logit index 0. Empty when ``titles`` is empty.
    """
    scores = []
    # Inputs must live on the same device as the model's weights.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None

    # Inference only: no_grad avoids building an autograd graph per batch.
    with torch.no_grad():
        for start in range(0, len(titles), batch_size):
            batch_titles = titles[start:start + batch_size]
            batch_defs = [definition] * len(batch_titles)
            encoding = tokenizer(batch_titles, batch_defs, return_tensors="pt",
                                 padding=True, truncation=True, max_length=400)
            if first_param is not None:
                encoding = {name: tensor.to(first_param.device) for name, tensor in encoding.items()}
            logits = model(**encoding).logits
            # Numerically stable softmax for the whole batch at once; computed
            # in float64 to match the precision of the previous per-row version.
            probabilities = torch.softmax(logits.double(), dim=1)[:, 0]
            scores.extend(probabilities.tolist())
    return scores

## model for embedding similarity

In [187]:
class SentenceEncoderModel(nn.Module):
    """Encoder (``AutoModel`` + ``AutoTokenizer``) with cached text embeddings.

    On construction two dictionaries are made available:

    * ``def_emb_dict``: definition text -> embedding, built from the set
      pickled at ``data/vars/def_set.pkl``;
    * ``title_emb_dict``: title text -> embedding, built from the set pickled
      at ``data/vars/title_set.pkl``.

    Each dictionary is loaded from its own pickle under ``data/vars/`` when
    that file exists; otherwise it is computed with ``get_embeddings`` and
    pickled there for the next run.
    """

    def __init__(self, model_name="bert-base-uncased", from_tf=False, device='cpu'):
        super().__init__()
        self.model_name = model_name
        self.device = device
        self.model = AutoModel.from_pretrained(model_name, from_tf=from_tf)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.def_emb_dict = self._load_or_build_emb_dict(
            f"data/vars/{model_name.split('/')[-1]}_def_emb_dict.pkl",
            "data/vars/def_set.pkl",
        )
        self.title_emb_dict = self._load_or_build_emb_dict(
            f"data/vars/{model_name.split('/')[-1]}_title_emb_dict_path.pkl",
            "data/vars/title_set.pkl",
        )

    def _load_or_build_emb_dict(self, cache_path, text_set_path):
        """Return the text -> embedding dict cached at ``cache_path``, creating it if absent."""
        if os.path.isfile(cache_path):
            return reload_vars(cache_path)

        texts = list(reload_vars(text_set_path))
        vectors = get_embeddings(self.model, self.tokenizer, texts)
        emb_dict = {text: vectors[position] for position, text in enumerate(texts)}
        pkl_vars(emb_dict, cache_path)
        return emb_dict
        
        
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: indexable whose first element holds the token embeddings.
        attention_mask: mask with one entry per token; positions with 0 do not
            contribute to the average.

    Returns:
        One pooled vector per sequence (sum over the token axis divided by the
        number of unmasked tokens, clamped to at least 1e-9).
    """
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def get_embeddings(model, tokenizer, sentences, batch_size=32):
    """Embed ``sentences`` with mean pooling over the model's token embeddings.

    Args:
        model: callable returning an indexable whose first element holds the
            token embeddings when called with the tokenizer output.
        tokenizer: callable turning a list of texts into a mapping that
            includes ``"attention_mask"``.
        sentences: iterable of texts to embed.
        batch_size: number of sentences per forward pass. Encoding everything
            in a single batch, as was done before, makes memory grow with the
            size of the whole corpus.

    Returns:
        A list with one embedding (list of floats) per sentence, in input
        order; plain lists are used to keep the pickled caches small. Empty
        when ``sentences`` is empty.
    """
    sentences = list(sentences)
    embeddings = []
    # Inputs must live on the same device as the model's weights.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None

    for start in range(0, len(sentences), batch_size):
        chunk = sentences[start:start + batch_size]
        encoding = tokenizer(
            chunk,
            max_length=400,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        inputs = dict(encoding.items())
        if first_param is not None:
            inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}
        with torch.no_grad():
            # Only the last-layer token embeddings (element 0) are pooled, so
            # the per-layer hidden states are not requested.
            batch_output = model(**inputs)
        embeddings.extend(mean_pooling(batch_output, inputs["attention_mask"]).tolist())
    return embeddings
        

def cal_sim_score(sem_model, titles, definition):
    """Cosine similarity between ``definition`` and each candidate title.

    Embeddings are taken from ``sem_model.def_emb_dict`` and
    ``sem_model.title_emb_dict`` when the text is cached there; otherwise they
    are computed on the fly with ``get_embeddings``.

    Returns:
        A list with one 0-d tensor per title, in the order of ``titles``.
    """
    def _embedding_for(text, cache):
        # Cached vector if available, freshly computed one otherwise.
        if text not in cache:
            return get_embeddings(sem_model.model, sem_model.tokenizer, [text])[0]
        return cache[text]

    definition_vector = _embedding_for(definition, sem_model.def_emb_dict)
    title_vectors = [_embedding_for(title, sem_model.title_emb_dict) for title in titles]

    similarities = []
    for title_vector in title_vectors:
        similarity = F.cosine_similarity(torch.Tensor(definition_vector), torch.Tensor(title_vector), dim=-1)
        similarities.append(similarity)
    return similarities

In [171]:
def predict(task_model, term, definition, batch_size, dd_kb=def_disamb_kb, cal="nsp"):
    """Return the candidate title of ``term`` that scores highest for ``definition``.

    Candidates come from ``get_candidate(dd_kb, term)``. With ``cal == "nsp"``
    they are scored by ``cal_nsp_score`` (using ``task_model.model`` and
    ``task_model.tokenizer``); with any other value they are scored by
    ``cal_sim_score``.
    """
    candidates = get_candidate(dd_kb, term)
    if cal == "nsp":
        scores = cal_nsp_score(task_model.model, task_model.tokenizer, candidates, definition, batch_size)
    else:
        scores = cal_sim_score(task_model, candidates, definition)
    best_position = np.argmax(scores)
    return candidates[best_position]

# evaluate prediction based on nsp and kb
def eval_pred(task_model, flattened_dd_list_path, batch_size, dd_kb=def_disamb_kb, cal="nsp",train=True):
    """Predict a title for every sample, save the predictions and score them.

    Args:
        task_model: model wrapper passed to ``predict``; its ``model_name`` is
            used to name the result file.
        flattened_dd_list_path: pickle holding a list of samples, each with the
            keys ``'term'``, ``'title'`` and ``'definition'``.
        batch_size: forwarded to ``predict``.
        dd_kb: term -> candidate titles dictionary used for the predictions.
        cal: scoring mode forwarded to ``predict`` (``"nsp"`` or similarity).
        train: when True, evaluation stops once the sample index exceeds 200.

    Returns:
        ``(macro_f1_score, accuracy)`` over the evaluated samples.

    Side effects:
        Writes the evaluated samples with a ``pred`` column to
        ``data/res/<model short name>_<cal>.csv``.
    """
    data = reload_vars(flattened_dd_list_path)
    true_labels, pred_labels = list(), list()
    for index, sample in enumerate(data):
        if index % 100 == 0:
            logger.info('processing {a} lines '.format(a=index))
        if train and index > 200:
            break
        # Use the KB given by the caller; previously the module-level
        # dictionary was always used and the `dd_kb` argument was ignored.
        pred = predict(task_model, sample['term'], sample['definition'], batch_size, dd_kb=dd_kb, cal=cal)
        true_labels.append(sample['title'])
        pred_labels.append(pred)

    # Keep only the rows that were actually evaluated: after the early break
    # above there are fewer predictions than samples, and assigning the
    # shorter list as a column would raise a length-mismatch error.
    df_res = pd.DataFrame(data[:len(pred_labels)])
    df_res['pred'] = pred_labels
    os.makedirs("data/res", exist_ok=True)  # to_csv does not create missing directories
    df_res.to_csv(f"data/res/{task_model.model_name.split('/')[-1]}_{cal}.csv")

    macro_f1_score = macro_f1(true_labels, pred_labels)
    acc = accuracy(true_labels, pred_labels)
    return macro_f1_score, acc


# TODO: to be redone ("à refaire")
def train(device):
    """Train a ``DefDisambiguationBERT`` with the triplet loss and save checkpoints.

    NOTE(review): this function reads its hyper-parameters from a module-level
    ``args`` object (``batch_size``, ``shuffle``, ``hard_neg_numbers``,
    ``pre_train_path``, ``learning_rate``, ``epoch``, ``loss_check_step``,
    ``check_step``, ``lr_decay``, ``model_path``), which is not defined in the
    visible part of this notebook and must exist before calling it.

    Args:
        device: device the model is moved to and on which inputs are placed.

    Returns:
        ``(max_f1, max_epoch)``. No validation is run inside the loop, so these
        are always returned with their initial values ``(0.0, 0)``.
    """
    model = DefDisambiguationBERT(device=device)
    model.to(device)
    # AcroBERTLoader is defined in this notebook; no `utils` module is imported.
    loader = AcroBERTLoader(batch_size=args.batch_size, tokenizer=model.tokenizer, kb=def_disamb_kb, shuffle=args.shuffle, hard_num=args.hard_neg_numbers)

    train_loader = loader(data_path=args.pre_train_path)
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(trainable_num)
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

    max_f1, max_epoch = 0.0, 0
    for e in range(args.epoch):
        epoch_loss = 0
        batch_num = 0

        for pos_samples, masked_pos_samples, neg_samples in train_loader:
            model.train()
            optimizer.zero_grad()

            if batch_num % args.loss_check_step == 0 and batch_num != 0:
                logger.info('sample = {b}, loss = {a}'.format(a=epoch_loss / batch_num, b=batch_num * args.batch_size))

            # Every `check_step` batches: decay the learning rate, then save an
            # intermediate checkpoint named after the epoch and batch number.
            if batch_num % args.check_step == 0 and batch_num != 0:
                for g in optimizer.param_groups:
                    g['lr'] *= args.lr_decay
                temp_path = args.model_path.format(a=str(e + 1) + '_' + str(batch_num))
                torch.save(model.state_dict(), temp_path)

            loss = model(pos_samples, masked_pos_samples, neg_samples)

            # backward
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            batch_num += 1

        # End of epoch: step the exponential schedule and save a checkpoint.
        scheduler.step()
        temp_path = args.model_path.format(a=e + 1)
        logger.info('the pre-training finished, saving model, path = {a}'.format(a=temp_path))
        torch.save(model.state_dict(), temp_path)
    return max_f1, max_epoch

## utils

In [79]:
# adapted from https://github.com/tigerchen52/GLADIS/blob/master/source/utils.py

#def_disamb_kb = reload_vars('data/vars/def_disamb_kb.pkl')
def get_candidate(def_disamb_kb, term):
    """Look up ``term`` in ``def_disamb_kb`` and return the stored value.

    Raises ``KeyError`` when the term is not in the dictionary.
    """
    candidates = def_disamb_kb[term]
    return candidates

class TextData(Dataset):
    """Dataset over three parallel sequences stored under ``'term'``, ``'title'`` and ``'definition'``."""

    def __init__(self, data):
        # Keep references to the three parallel columns of `data`.
        self.all_term, self.all_title, self.all_definition = (
            data['term'],
            data['title'],
            data['definition'],
        )

    def __len__(self):
        """Number of samples, taken from the length of the term column."""
        return len(self.all_term)

    def __getitem__(self, idx):
        """Return the ``(term, title, definition)`` triple at position ``idx``."""
        term = self.all_term[idx]
        title = self.all_title[idx]
        definition = self.all_definition[idx]
        return term, title, definition
    
def load_pretrain(train_data_path=None, data_path=None):
    """Load ``term<TAB>title<TAB>definition`` rows from a UTF-8 text file.

    Lines that do not split into exactly three tab-separated fields (after
    stripping surrounding whitespace) are skipped.

    Args:
        train_data_path: path of the file to read.
        data_path: alias of ``train_data_path``, accepted because callers in
            this notebook pass the path with this keyword.

    Returns:
        A dict with the parallel lists ``'term'``, ``'title'`` and
        ``'definition'``.

    Raises:
        TypeError: if neither ``train_data_path`` nor ``data_path`` is given.
    """
    path = train_data_path if train_data_path is not None else data_path
    if path is None:
        raise TypeError("load_pretrain() requires a data path")

    all_term, all_title, all_definition = list(), list(), list()
    # `with` guarantees the file is closed even if a line fails to decode.
    with open(path, encoding='utf8') as handle:
        for line in handle:
            row = line.strip().split('\t')
            if len(row) != 3:
                continue
            term, title, definition = row
            all_term.append(term)
            all_title.append(title)
            all_definition.append(definition)

    return {'term': all_term, 'title': all_title, 'definition': all_definition}

class AcroBERTLoader():
    """Builds a ``DataLoader`` of positive / negative text pairs for triplet training.

    Args:
        batch_size: target number of text samples per training step.
        tokenizer: stored on the instance; not used by the methods below.
        kb: dict mapping a term to the list of its candidate titles.
        shuffle: forwarded to the ``DataLoader``.
        masked_prob: stored on the instance; not used by the methods below.
        hard_num: number of negatives drawn per positive sample.
    """

    def __init__(self, batch_size, tokenizer, kb, shuffle=True, masked_prob=0.15, hard_num=2):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.tokenizer = tokenizer
        self.masked_prob = masked_prob
        self.hard_num = hard_num
        self.kb = kb
        # Every title in the KB: the fallback pool for negative sampling.
        self.all_long_terms = list()
        for vs in self.kb.values():
            self.all_long_terms.extend(list(vs))

    def select_negative(self, target, positive=None):
        """Pick a random title to use as a negative for the term ``target``.

        Titles of the same term are preferred (hard negatives); when the term
        is unknown or has a single title, any title of the KB may be drawn.

        Args:
            target: the term whose sample needs a negative.
            positive: the correct title of the sample; it is never returned as
                long as another title exists. Comparing candidates with
                ``target`` alone cannot exclude it, because ``target`` is a
                term while the candidates are titles.

        Returns:
            A title different from ``target`` and ``positive`` when possible.
        """
        excluded = (target, positive)
        if target in self.kb and len(self.kb[target]) > 1:
            pool = self.kb[target]
        else:
            pool = self.all_long_terms

        candidates = [title for title in pool if title not in excluded]
        if not candidates:
            candidates = [title for title in self.all_long_terms if title not in excluded]
        if not candidates:
            # Nothing else exists in the KB: fall back to any title.
            candidates = self.all_long_terms
        return random.choice(candidates)

    def collate_fn(self, batch_data):
        """Turn a batch of ``(term, title, definition)`` triples into text samples.

        Each sample is ``"<title> [SEP] <definition>"``. The batch is repeated
        ``hard_num`` times, each time with freshly drawn negative titles.

        Returns:
            ``(pos_samples, masked_pos_samples, neg_samples)``; no masking is
            applied here, so the masked samples equal the positive ones.
        """
        batch_short_term, batch_long_term, batch_context = list(zip(*batch_data))
        positives = [title + ' [SEP] ' + context for title, context in zip(batch_long_term, batch_context)]

        pos_samples, neg_samples, masked_pos_samples = list(), list(), list()
        for _ in range(self.hard_num):
            neg_long_terms = [
                self.select_negative(term, positive=title)
                for term, title in zip(batch_short_term, batch_long_term)
            ]
            negatives = [neg_title + ' [SEP] ' + context for neg_title, context in zip(neg_long_terms, batch_context)]

            pos_samples.extend(positives)
            neg_samples.extend(negatives)
            masked_pos_samples.extend(positives)
        return pos_samples,  masked_pos_samples,  neg_samples

    def __call__(self, data_path):
        """Load the training file at ``data_path`` and return a ``DataLoader`` over it."""
        dataset = load_pretrain(data_path)
        # The loaded dict is keyed by 'term' / 'title' / 'definition'.
        logger.info('loaded dataset, sample = {a}'.format(a=len(dataset['term'])))
        dataset = TextData(dataset)
        # Each raw sample expands to 2 * hard_num texts (positives + negatives);
        # never go below one raw sample per step, which DataLoader rejects.
        samples_per_step = max(1, self.batch_size // (2 * self.hard_num))
        train_iterator = DataLoader(dataset=dataset, batch_size=samples_per_step, shuffle=self.shuffle,
                                    collate_fn=self.collate_fn)
        return train_iterator

## evaluation

In [108]:
# adapted from https://github.com/tigerchen52/GLADIS/blob/master/source/evaluation.py
from sklearn.metrics import f1_score
from collections import OrderedDict


def transform_to_index(trues, preds):
    """Encode true and predicted labels as integers with a shared mapping.

    Labels occurring in ``trues`` are numbered first, in order of first
    appearance; labels that occur only in ``preds`` get the following numbers.

    Returns:
        ``(true_index, pred_index, tag_cnt)`` where ``tag_cnt`` is the number
        of distinct labels in ``trues``.
    """
    label_to_id = {}
    for label in trues:
        label_to_id.setdefault(label, len(label_to_id))
    tag_cnt = len(label_to_id)

    true_index = [label_to_id[label] for label in trues]
    # setdefault assigns the next free id to a label seen only in `preds`.
    pred_index = [label_to_id.setdefault(label, len(label_to_id)) for label in preds]
    return true_index, pred_index, tag_cnt


def macro_f1(trues, preds):
    """Macro-averaged F1 over the labels that occur in ``trues``.

    Per-label F1 scores come from ``f1_score(..., average=None)``; only the
    first ``tag_cnt`` entries (the labels present in ``trues``) are averaged,
    so labels that appear only among the predictions are left out.
    """
    y_true, y_pred, n_true_labels = transform_to_index(trues, preds)
    per_label_f1 = f1_score(y_true, y_pred, average=None)
    kept_f1 = per_label_f1[:n_true_labels]
    return sum(kept_f1) / len(kept_f1)


def accuracy(trues, preds):
    """Fraction of positions at which ``preds`` equals ``trues``.

    The prediction is read at the same index as each true label, and the
    count of matches is divided by ``len(trues)``.
    """
    hits = sum(1 for position, expected in enumerate(trues) if preds[position] == expected)
    return hits * 1.0 / len(trues)

# Evaluate NSP with OOB LMs
Note (TODO): cache the pairwise NSP calculations.

In [115]:
%%time
cc_model = DefDisambiguationBERT("InriaValda/cc_math_bert_ep10", from_tf=True, device= device)
macro_f1_score, acc = eval_pred(cc_model, "data/vars/flattened_test_disam_list.pkl",
                                batch_size = 32, dd_kb=def_disamb_kb, train=False) 
macro_f1_score, acc 



CPU times: user 2h 58min 16s, sys: 8min 32s, total: 3h 6min 49s
Wall time: 4min 7s


(0.08303624480095069, 0.13031914893617022)

In [110]:
%%time 
BERT_OOB = DefDisambiguationBERT(device= device)
macro_f1_score, acc = eval_pred(BERT_OOB, "data/vars/flattened_test_disam_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False) 
macro_f1_score, acc



CPU times: user 3h 8min, sys: 10min 44s, total: 3h 18min 44s
Wall time: 4min 21s


(0.7405525846702316, 0.7872340425531915)

# Evaluate embedding similarity with OOB LMs
Note: without the pickled sentence-embedding dictionaries, each 768-dimensional sentence-BERT model takes about 11 minutes to compute all the embeddings once.

In [183]:
%%time
cc_sem_model = SentenceEncoderModel("InriaValda/cc_math_bert_ep10", from_tf=True, device= device)



CPU times: user 3h 53min 50s, sys: 1h 46min 47s, total: 5h 40min 37s
Wall time: 10min 35s


In [184]:
%%time
macro_f1_score, acc = eval_pred(cc_sem_model, "data/vars/flattened_test_disam_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc

CPU times: user 104 ms, sys: 20.2 ms, total: 124 ms
Wall time: 123 ms


(0.3265739803243325, 0.42819148936170215)

In [196]:
%%time
macro_f1_score, acc = eval_pred(cc_sem_model, "data/vars/flattened_train_disamb_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc

CPU times: user 677 ms, sys: 49 ms, total: 726 ms
Wall time: 725 ms


(0.31244211633218444, 0.4093457943925234)

In [191]:
%%time
mini_sem_model = SentenceEncoderModel("sentence-transformers/all-MiniLM-L6-v2", device= device)

macro_f1_score, acc = eval_pred(mini_sem_model, "data/vars/flattened_test_disam_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc

CPU times: user 35min 38s, sys: 42min 59s, total: 1h 18min 38s
Wall time: 2min 32s


(0.8358798064680415, 0.8696808510638298)

In [194]:
macro_f1_score, acc = eval_pred(mini_sem_model, "data/vars/flattened_train_disamb_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc

(0.8222014051522248, 0.8616822429906542)

In [192]:
%%time
bert_sem_model = SentenceEncoderModel("bert-base-uncased", device= device)

macro_f1_score, acc = eval_pred(bert_sem_model, "data/vars/flattened_test_disam_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc



CPU times: user 4h 12min 53s, sys: 1h 47min 41s, total: 6h 35s
Wall time: 10min 25s


(0.2493523113843971, 0.3484042553191489)

In [198]:
%%time
macro_f1_score, acc = eval_pred(bert_sem_model, "data/vars/flattened_train_disamb_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc

CPU times: user 3.65 s, sys: 78.1 ms, total: 3.73 s
Wall time: 3.73 s


(0.2755867156522898, 0.3719626168224299)

In [199]:
%%time
MLM_arXiv_sem_model = SentenceEncoderModel("math-similarity/Bert-MLM_arXiv", device= device)

macro_f1_score, acc = eval_pred(MLM_arXiv_sem_model, "data/vars/flattened_test_disam_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc



CPU times: user 4h 10min 15s, sys: 1h 48min 7s, total: 5h 58min 23s
Wall time: 10min 32s


(0.3274004367587258, 0.4308510638297872)

In [201]:
macro_f1_score, acc = eval_pred(MLM_arXiv_sem_model, "data/vars/flattened_train_disamb_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc

(0.28560215301681124, 0.3850467289719626)

In [200]:
%%time
zbmath_sem_model = SentenceEncoderModel("math-similarity/Bert-MLM_arXiv-MP-class_zbMath", device= device)

macro_f1_score, acc = eval_pred(zbmath_sem_model, "data/vars/flattened_test_disam_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc



config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

CPU times: user 4h 21min 22s, sys: 1h 44min 14s, total: 6h 5min 36s
Wall time: 10min 51s


(0.5223028605381546, 0.6063829787234043)

In [202]:
macro_f1_score, acc = eval_pred(zbmath_sem_model, "data/vars/flattened_train_disamb_list.pkl",
                                batch_size = 4, dd_kb=def_disamb_kb, train=False,
                               cal="sim") 
macro_f1_score, acc

(0.4723080926359607, 0.5526479750778817)

In [166]:
len(cc_sem_model.title_emb_dict['Definition:Field Zero'])

768

In [188]:
cal_sim_score(cc_sem_model, [title0, 'blabla'], def0)

[tensor(0.6233), tensor(0.5268)]

In [178]:
title02 = 'Definition:Field Zero2'
def02 = 'A2 dynamical system is a non-linear system in which a function describes the time dependence of a point in a geometrical space.\n\n\n=== Flow ===\nIn a dynamical system, a set of time-dependent equations is known as flow.\n\n '

In [189]:
embs = get_embeddings(cc_sem_model.model, cc_sem_model.tokenizer, [title0, 'blabla'])