In [1]:
# if '/home/ryparmar/.local/bin' not in sys.path:
#     sys.path.append('/home/ryparmar/.local/bin')

In [2]:
# !python -m pip install -U -q transformers==3.1.0 --user

In [3]:
# transformers.__version__

In [1]:
from tqdm import tqdm
from datetime import datetime
import sys, os
import logging

import torch 
from torch.utils.data import DataLoader, TensorDataset
import transformers

In [2]:
import faiss

In [3]:
# Make the project's `src` directory importable so the local modules imported
# below resolve. The membership test and the append must use the very same
# string: previously the test used a trailing slash and the append did not, so
# the guard never matched and each re-run of the cell appended a duplicate.
ICT_SRC_DIR = '/home/ryparmar/experimental-martin/inverse-cloze-task/src'
if ICT_SRC_DIR not in sys.path:
    sys.path.append(ICT_SRC_DIR)

import util, eval, io_util
from model import Encoder as Model

In [4]:
def optimizer_to(optim, device):
    """Move every tensor stored in the optimizer's state to ``device``, in place.

    Walks ``optim.state.values()``. Entries that are tensors are moved
    directly; entries that are dicts have each tensor value moved. Anything
    else is left untouched. A tensor's gradient, when present, is moved too.

    Args:
        optim: optimizer whose ``state`` mapping is updated.
        device: target device, passed straight to ``Tensor.to``.
    """
    def _relocate(tensor):
        # Rebind .data rather than replacing the tensor object, so references
        # held by the optimizer stay valid.
        tensor.data = tensor.data.to(device)
        if tensor._grad is not None:
            tensor._grad.data = tensor._grad.data.to(device)

    for entry in optim.state.values():
        if isinstance(entry, torch.Tensor):
            # Not sure there are any global tensors in the state dict
            _relocate(entry)
        elif isinstance(entry, dict):
            for value in entry.values():
                if isinstance(value, torch.Tensor):
                    _relocate(value)


def instantiate_model(config, tokenizer):
    """Create the encoder and its AdamW optimizer, optionally restoring a checkpoint.

    Args:
        config: run configuration. ``learning_rate``, ``continue_training`` and
            ``use_cuda`` are read; ``configure_devices`` fills in ``devices``
            and ``device`` first.
        tokenizer: accepted for interface compatibility; not used here.

    Returns:
        ``(model, optimizer, last_epoch, epoch_avg_loss)``. ``last_epoch`` comes
        from the checkpoint's ``'epoch'`` entry when one is loaded, else 0.
        ``epoch_avg_loss`` is always 0 (the stored loss is not restored).
    """
    configure_devices(config)

    model = Model(config)
    optimizer = transformers.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0)
    last_epoch, epoch_avg_loss = 0, 0

    checkpoint_path = config.continue_training
    if checkpoint_path:
        # Load onto the CPU first; the move to the GPU happens below.
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        # Optimizer state is optional in the checkpoint.
        if 'optimizer_state_dict' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        last_epoch = checkpoint['epoch']

    if not config.use_cuda:
        return model, optimizer, last_epoch, epoch_avg_loss

    model = model.cuda()
    # The optimizer state was loaded on the CPU, so move it alongside the model.
    optimizer_to(optimizer, config.device)
    model = torch.nn.DataParallel(model, device_ids=config.devices)
    return model, optimizer, last_epoch, epoch_avg_loss


def configure_devices(config):
    """Record the available CUDA device indices on ``config`` and pick the main device.

    Sets ``config.devices`` to ``[0, ..., torch.cuda.device_count() - 1]`` and
    ``config.device`` to the first of those indices when ``config.use_cuda`` is
    truthy, otherwise to the string ``"cpu"``.
    """
    config.devices = list(range(torch.cuda.device_count()))
    if config.use_cuda:
        config.device = config.devices[0]
    else:
        config.device = "cpu"


def get_loader(data, batch_size):
    """Wrap ``data`` in a ``TensorDataset`` and return a shuffling ``DataLoader``.

    Args:
        data: tensor passed as the single tensor of the ``TensorDataset``.
        batch_size: number of samples per batch.

    Returns:
        A ``DataLoader`` that shuffles and drops the last incomplete batch.
    """
    dataset = TensorDataset(data)
    loader_options = dict(batch_size=batch_size, shuffle=True, sampler=None, drop_last=True)
    return DataLoader(dataset, **loader_options)

In [6]:
class Config:
    """Fixed settings for this fine-tuning / evaluation run.

    Everything is set in ``__init__``; ``add`` attaches the two token ids that
    are only known after a tokenizer has been loaded.
    """

    def __init__(self):
        # Alternative set-up kept for reference:
        #   self.mode = 'finetuning'
        #   self.claims_path = "/mnt/data/factcheck/CTK/par4/ctk-data"
        #   self.articles_path = "/mnt/data/factcheck/CTK/par4/interim/ctk_filtered.db"
        #   self.articles_chunks_path = "/mnt/data/factcheck/ict_chunked_data/ids-chunks-288-finetuning-ctk_filtered.pkl"
        #   self.continue_training = "/home/ryparmar/trained_models/finetuned_mbert_10epochs_lr_1e-6_ict_1.4.w"
        #   (earlier checkpoint name fragment: ctk_pre_3ep_1e-5_288.w)

        # Run mode and data locations.
        self.mode = 'finetuning'
        self.claims_path = "/mnt/data/factcheck/fever/data-cs/fever-data"
        self.articles_path = "/mnt/data/factcheck/fever/data-cs/fever/fever.db"
        self.articles_chunks_path = "/home/ryparmar/trained_models/ids-chunks-288-finetuning-feverwiki.pkl"

        # Model and checkpoint to start from.
        self.bert_model = "bert-base-multilingual-cased"
        self.continue_training = "/home/ryparmar/trained_models/finetuned_mbert_10epochs_lr_1e-6_ict_1.4.w"

        # Hyper-parameters.
        self.learning_rate = 1e-6
        self.max_seq = 288
        self.epoch = 1
        self.batch_size = 32
        self.test_batch_size = 64
        self.remove_percent = 0.9

        # Runtime.
        self.use_cuda = True
        self.logger = logging.getLogger(__name__)

    def add(self, name, val):
        """Set ``name`` to ``val`` if it is ``'cls_token_id'`` or ``'pad_token_id'``.

        Any other name is ignored.
        """
        if name in ('cls_token_id', 'pad_token_id'):
            setattr(self, name, val)

In [7]:
# Build the run configuration and load the tokenizer named by config.bert_model.
config = Config()
tokenizer = transformers.BertTokenizerFast.from_pretrained(config.bert_model)
# Store the ids of the tokenizer's CLS and PAD tokens on the config: each token
# is encoded on its own (without adding further special tokens) and the first
# id of the result is kept.
config.add('cls_token_id', tokenizer.encode(tokenizer.cls_token, add_special_tokens=False)[0])
config.add('pad_token_id', tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0])


# instantiate_model returns (model, optimizer, last_epoch, epoch_avg_loss);
# the last two values are discarded here.
model, optimizer, _, _ = instantiate_model(config, tokenizer)
# NOTE(review): loss_fn is not referenced by any later cell visible in this
# notebook, and metrics only by commented-out code - confirm they are needed.
loss_fn = torch.nn.CrossEntropyLoss()
metrics = eval.Metrics()

In [8]:
# Build the article chunks from config.articles_path together with the id of
# the article each chunk belongs to (presumably one id per chunk - verify in
# util.make_chunks); save_chunks=False is passed, so presumably nothing is
# written to disk.
chunks, articles_ids = util.make_chunks(config.articles_path, tokenizer, config, save_chunks=False)
# Replace the raw chunks with their processed form and obtain the matching mask.
chunks, chunks_mask = util.process_chunks(chunks, config)
# Aliases used by the evaluation cells; they refer to the same objects, no copy is made.
dev_chunks, dev_masks, dev_articles_ids = chunks, chunks_mask, articles_ids

In [9]:
claims_dev, evidence_dev, labels_dev = util.load_claims('dev', config)

Loaded 9999 claims from dev split.


In [10]:
claims_dev, claims_dev_mask = util.process_claims(claims_dev, tokenizer, config, _pad_max=True)
print(f"{len(claims_dev)} dev claims prepared for finetuning.")

9999 dev claims prepared for finetuning.


In [11]:
claims_dev[0].shape

torch.Size([288])

In [12]:
tokenizer.convert_ids_to_tokens(claims_dev[0])

['[CLS]',
 'Soc',
 '##iologie',
 'je',
 'studium',
 'v',
 '##y',
 '##́',
 '##voje',
 'politik',
 '##y',
 '.',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',


## Evaluate dev set

In [13]:
## Load the cached document embeddings, or compute and cache them
# The cache file is named after the checkpoint file only (last path component
# of config.continue_training).
# NOTE(review): the name does not depend on config.articles_path, so evaluating
# the same checkpoint on a different corpus would load embeddings of the old
# corpus - confirm and consider adding the corpus to the file name.
doc_emb_path = f"/home/ryparmar/trained_models/doc-emb-{config.continue_training.split('/')[-1]}.npy"
if os.path.exists(doc_emb_path):
    # Cached branch: only the claims are encoded; document embeddings come from disk.
    model.eval()
    eval_claim_embeddings = eval.encode_chunks(claims_dev, claims_dev_mask, model, batch_size=config.test_batch_size)
    eval_document_embeddings = io_util.load_np_embeddings(doc_emb_path)
else:
    # Uncached branch: encode both claims and document chunks.
    eval_claim_embeddings, \
    eval_document_embeddings = eval.evaluation_preprocessing(claims_dev, claims_dev_mask, chunks, chunks_mask, model, config)
    # Save the freshly computed document embeddings so later runs take the cached branch.
    io_util.save_np_embeddings(eval_document_embeddings, doc_emb_path)
    # NOTE(review): the model is moved to the CPU only in this branch; the
    # cached branch leaves it where it was - confirm this asymmetry is intended.
    model.to('cpu')

Embedding given chunks...: 100%|██████████| 7057/7057 [37:53<00:00,  3.10it/s]
Embedding given chunks...: 100%|██████████| 157/157 [00:50<00:00,  3.13it/s]


In [14]:
# NOTE(review): numpy is imported here, mid-notebook; it belongs with the other imports.
import numpy as np
# Short aliases for the inputs of the evaluation below (same objects, no copies).
query_embeddings = eval_claim_embeddings
corpus_embeddings = eval_document_embeddings
# k is the number of results retrieved per query. `config` is assigned to
# itself here, which has no effect.
evidence, labels, corpus_ids, config, k = evidence_dev, labels_dev, dev_articles_ids, config, 10

# Running sums and counters filled in by the metric loop in the next cell.
macro_precision, macro_precision_hits = 0, 0
macro_recall, macro_recall_hits = 0, 0
macro_mrr, macro_mrr_hits = 0, 0

# D and I are presumably the scores and corpus indices of the top-k results per
# query (I is later used to index corpus_ids) - verify in eval.search_top_k.
# The third argument is the embedding dimensionality (size of the last axis).
D, I = eval.search_top_k(corpus_embeddings, np.asarray(query_embeddings), corpus_embeddings.shape[-1], k, config)

In [None]:
# Accumulate precision, recall and MRR over all queries.
# tqdm wraps I directly (rather than the enumerate object) so it can show a
# total and an ETA when I has a length.
for i, top_k_idxs in enumerate(tqdm(I, desc='Calculating evaluation metrics')):
    # Map the retrieved corpus positions to their article ids.
    predicted_evidence = np.take(corpus_ids, top_k_idxs)

    # Compute all three metrics before touching the accumulators, so that a
    # failure part-way through cannot leave precision counted while recall and
    # MRR are not.
    try:
        macro_prec = eval.evidence_macro_precision(evidence[i], labels[i], predicted_evidence)
        macro_rec = eval.evidence_macro_recall(evidence[i], labels[i], predicted_evidence)
        macro_rr = eval.evidence_macro_mrr(evidence[i], labels[i], predicted_evidence)
    except Exception as err:
        # `Exception` instead of a bare `except:` keeps Ctrl-C / kernel
        # interrupt working during this long loop; the cause is reported
        # instead of being discarded.
        print("ERROR on ", i, repr(err))
        continue

    macro_precision += macro_prec[0]
    macro_precision_hits += macro_prec[1]

    macro_recall += macro_rec[0]
    macro_recall_hits += macro_rec[1]

    macro_mrr += macro_rr[0]
    macro_mrr_hits += macro_rr[1]

Calculating evaluation metrics: 6849it [14:32,  7.87it/s]

In [None]:
# Turn the accumulated sums into averages over the respective *_hits counters.
# With a zero counter precision defaults to 1.0 and recall / MRR to 0.0.
pr = (macro_precision / macro_precision_hits) if macro_precision_hits > 0 else 1.0
rec = (macro_recall / macro_recall_hits) if macro_recall_hits > 0 else 0.0
mrr = (macro_mrr / macro_mrr_hits) if macro_mrr_hits > 0 else 0.0
# The small epsilon keeps the division defined when precision and recall are both 0.
f1 = 2.0 * pr * rec / (pr + rec + 1e-6)
# Label every metric with the k actually used for the search; precision and
# MRR were previously hard-coded as "@10" regardless of k.
print(f"F1: {f1*100}\nRecall@{k}: {rec*100}\nPrecision@{k}: {pr*100}\nMRR@{k}: {mrr*100}")

In [40]:
# # Evaluation
# for k in [10, 20]:
#     precision, recall, f1 = eval.retriever_score(eval_claim_embeddings, eval_document_embeddings, 
#                                                 evidence_dev, labels_dev, dev_articles_ids, config, k=k)
#     # config.logger.info
#     print(f"F1: {f1}\tRecall@{k}: {recall}\tPrecision@10: {precision}")

## FAISS

In [21]:
import numpy as np
import math

In [22]:
# SEARCH TOP K -- Prepare index
index = faiss.index_factory(eval_document_embeddings.shape[-1], "Flat", faiss.METRIC_INNER_PRODUCT)
faiss.normalize_L2(eval_document_embeddings)  # need to normalize query and corpus vectors for cosine distance
faiss.normalize_L2(eval_claim_embeddings)
if config.device != 'cpu':
    res = faiss.StandardGpuResources()
    if len(config.devices) > 1:
        dev_index = faiss.index_cpu_to_all_gpus(index)  # use gpu
    else:
        dev_index = faiss.index_cpu_to_gpu(res, 0, index)
else:
    dev_index = index
dev_index.add(eval_document_embeddings)

In [23]:
# Execute the search
D, I = dev_index.search(eval_claim_embeddings, 1000)

In [24]:
print(D.shape, I.shape)

(83158, 1000) (83158, 1000)


In [25]:
print(f"D: {D[:10, 0]}\nI: {I[:10, 0]}")

D: [0.79076725 0.831462   0.6842412  0.73045176 0.8863043  0.8287026
 0.77836084 0.7367048  0.79012954 0.6705815 ]
I: [292347 309477  49690 312889 211132 222134 422483 162622 322631 308474]


In [31]:
evidence_dev[1]

[[[245753, 246454, 'Sammy Cahn', 0, 'Sammy Cahn']]]

In [32]:
labels_dev[1]

'SUPPORTS'

In [36]:
I.shape

(83158, 1000)

In [37]:
len(articles_ids)

451629

In [68]:
# RETRIEVER SCORER
macro_precision, macro_precision_hits = 0, 0
macro_recall, macro_recall_hits = 0, 0
for i, top_k_idxs in enumerate(I):
    predicted_evidence = np.take(articles_ids, I[i])

    macro_prec = eval.evidence_macro_precision(evidence_train[i], labels_train[i], predicted_evidence, 
                                               max_evidence=None, page_only=True)
    
    macro_precision += macro_prec[0]
    macro_precision_hits += macro_prec[1]

    macro_rec = eval.evidence_macro_recall(evidence_train[i], labels_train[i], predicted_evidence)
    macro_recall += macro_rec[0]
    macro_recall_hits += macro_rec[1]

pr = (macro_precision / macro_precision_hits) if macro_precision_hits > 0 else 1.0
rec = (macro_recall / macro_recall_hits) if macro_recall_hits > 0 else 0.0
f1 = 2.0 * pr * rec / (pr + rec + 1e-6)

IndexError: string index out of range

In [69]:
labels_train[0]

'REFUTES'

In [70]:
evidence_train[0]

'Nelson Mandela'

In [71]:
predicted_evidence

array(['Tituly a vyznamenání Nelsona Mandely', '46664',
       'HIV/AIDS v Africe', 'Nelson Mandela', 'Veřejné zdraví',
       'Zdravotní pojištění', 'Umkhonto we Sizwe',
       'Zdravotnický záchranář', 'Populační fond OSN',
       'Diskografie Harveyho Mandela', 'Zdraví', 'Bill Nelson',
       'Oddlužení', 'Přístup k pitné vodě v rozvojovém světě',
       'Zdravá škola',
       'Zaměstnávání osob se zdravotním postižením',
       'Dětská úmrtnost',
       'Domov pro osoby se zdravotním postižením',
       'Ublížení na zdraví z nedbalosti', 'Zdravotnictví',
       'Shangrenade', 'Zdravotnické zařízení',
       'Převzetí člověka do zdravotnického zařízení bez jeho souhlasu',
       'Cena za lidskost', 'HIV/AIDS ve Svazijsku', 'Život dětem',
       'Zdravotní průkaz pracovníka v potravinářství',
       'Rodičovský příspěvek', 'Příspěvek na péči',
       'Dříve vyslovené přání',
       'Všeobecná zdravotni

In [72]:
evidence_train[0] in predicted_evidence

True

In [74]:
type(evidence_train[0])

str

In [73]:
macro_prec = eval.evidence_macro_precision(evidence_train[0], labels_train[0], predicted_evidence, 
                                                   max_evidence=None, page_only=True)

IndexError: string index out of range

In [75]:
def evidence_macro_precision(evidence, label, predicted_evid, max_evidence=None, page_only=True):
    """Per-claim precision of the predicted evidence against the gold evidence.

    Args:
        evidence: gold evidence. Either a list of evidence groups, each a list
            of entries from which ``e[2]`` (the page) and ``e[3]`` are read -
            entries whose ``e[3]`` is ``None`` are dropped - or a single page
            given as a ``str``.
        label: claim label; claims labelled "NOT ENOUGH INFO" (case-insensitive)
            are not scored.
        predicted_evid: sequence of predictions, compared by membership against
            the gold pages (``page_only=True``) or gold ``[e[2], e[3]]`` pairs
            (``page_only=False``).
        max_evidence: when not ``None``, only the first ``max_evidence``
            predictions are scored (``predicted_evid`` must then support slicing).
        page_only: compare pages only instead of ``[e[2], e[3]]`` pairs; has no
            effect when ``evidence`` is a ``str``.

    Returns:
        ``(precision, 1.0)`` for a scored claim, where precision is the fraction
        of scored predictions found in the gold evidence (1.0 when there are no
        predictions), or ``(0.0, 0.0)`` for a "NOT ENOUGH INFO" claim.

    Raises:
        TypeError: if ``evidence`` is neither a ``list`` nor a ``str``.
    """
    if label.upper() == "NOT ENOUGH INFO":
        return 0.0, 0.0

    if isinstance(evidence, list):
        if page_only:
            all_evi = [e[2] for eg in evidence for e in eg if e[3] is not None]
        else:
            all_evi = [[e[2], e[3]] for eg in evidence for e in eg if e[3] is not None]
    elif isinstance(evidence, str):
        all_evi = [evidence]
    else:
        # Previously this only printed a message and then failed later on the
        # unbound `all_evi`; fail here with the actual cause instead.
        raise TypeError(
            "Unexpected evidence type in evidence_macro_precision: "
            f"{type(evidence).__name__} (expected list or str)"
        )

    # Honour the cut-off; it used to be accepted but silently ignored.
    if max_evidence is not None:
        predicted_evid = predicted_evid[:max_evidence]

    this_precision = 0.0
    this_precision_hits = 0.0
    for prediction in predicted_evid:
        if prediction in all_evi:
            this_precision += 1.0
        this_precision_hits += 1.0

    precision = (this_precision / this_precision_hits) if this_precision_hits > 0 else 1.0
    return precision, 1.0

In [76]:
macro_prec = evidence_macro_precision(evidence_train[0], labels_train[0], predicted_evidence, 
                                                   max_evidence=None, page_only=True)

In [77]:
macro_prec

(0.001, 1.0)

In [41]:
for ev in evidence_dev[1]:
    print([e[2] for e in ev])

['Sammy Cahn']


In [35]:
print(f"precision: {pr}\trecall: {rec}\tf1: {f1}")

precision: 0.0	recall: 0.0	f1: 0.0


In [None]:
# update_metrics_scores_at_k(f1, k, metrics, precision, recall)
# if recall > metrics.max_rec and k == 10:  # we want to maximize recall@10
#     metrics.max_rec = recall
#     metrics.max_prec = precision
#     metrics.max_f1 = f1
#     config.logger.info(f"F1: {f1}\tRecall@{k}: {recall}\tPrecision@{k}: {precision}")
#     if save_model:
#         io_util.save_model(model, metrics.per_epoch['epoch'][-1], loss=metrics.per_epoch['loss'][-1], 
#                         rec=metrics.max_rec, optimizer=optimizer, config=config)

In [None]:
# eval.evaluate_model_update_metrics_and_save_if_best_recall(eval_claim_embeddings, eval_document_embeddings, 
#                                                 dev_articles_ids, evidence_dev, labels_dev, 
#                                                 metrics, model, optimizer, config, save_model=False)