In [1]:
import torch
from tqdm import tqdm

# Pick the compute device once; later cells pass `device` to util.pad_sequence,
# the model config and load_model.
if torch.cuda.is_available():

    # Prefer the GPU when PyTorch can see one.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    # CPU fallback. The model is NOT built here: `Encoder` and `config` are only
    # defined further down, so constructing it in this cell raised a NameError.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
!python -m pip install transformers
!python -m pip install wget

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-32GB
[33mYou are using pip version 19.0.3, however version 20.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 20.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
ls /mnt/data/factcheck/fever/data-cs/fever

fever.db


In [3]:
import sqlite3
import pandas as pd


# Filesystem paths (not URLs) of the wiki SQLite database and the dev split.
# `devset_path` is not read by any of the cells shown in this notebook.
dataset_path = '/mnt/data/factcheck/fever/data-cs/fever/fever.db'
devset_path = '/mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl'

# Open the database, pull the whole `documents` table into a DataFrame, and
# always release the connection afterwards, even if the query fails.
connection = sqlite3.connect(dataset_path)
try:
    wiki = pd.read_sql_query("SELECT * FROM documents", connection)
finally:
    connection.close()

In [4]:
import json

def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of JSON-serialisable objects to a JSON lines file.

    Args:
        data: any iterable of objects (list, generator, ...); each one becomes
            one line of the output file.
        output_path: path of the file to write.
        append: when True, add to the end of an existing file instead of
            overwriting it.

    Non-ASCII characters are written as-is (UTF-8), not as \\u escapes.
    """
    mode = 'a' if append else 'w'
    # Count while writing so that one-shot iterables (which have no len()) work too.
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Args:
        input_path: path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with one decoded object per non-empty line, in file order.

    Raises:
        json.JSONDecodeError: if a non-empty line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file) carry
            # no record; skip them instead of failing in json.loads('').
            if not line:
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

# Load the claim records and keep only the columns of interest; a column that
# is missing from a record is filled with NaN.
webpage_data = load_jsonl('train.jsonl')
db_cols = ['id', 'verifiable', 'label', 'claim', 'evidence', 'claim_en']  # edit this to suit the jsonl file
db_data = [
    [record.get(column, float('nan')) for column in db_cols]
    for record in webpage_data
]

devset = pd.DataFrame(db_data, columns=db_cols)

# Plain numpy views of the columns used for retrieval and scoring.
claims = devset['claim'].values
evidence = devset['evidence'].values
labels = devset['label'].values

# Article texts and their ids from the wiki table loaded earlier.
corpus_full = wiki['text'].values
corpus_full_id = wiki['id'].values

Loaded 107330 records from train.jsonl


In [5]:
import util
import nltk

In [6]:
# Split every article into sentences; sent_split[i] is the sentence list of corpus_full[i].
sent_split = []
for article in tqdm(corpus_full):
    sent_split.append(nltk.sent_tokenize(article))

100%|██████████| 451629/451629 [01:00<00:00, 7469.14it/s]


In [7]:
import transformers
# Use the class exported from the package root: the `transformers.tokenization_bert`
# submodule path is an internal location that newer transformers releases no
# longer provide, while `transformers.BertTokenizer` works across versions.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [8]:
"""Takes in a list of lists of sentences and returns a list of lists of tokenized sentences"""
# tokenized_articles[a][s] is the list of vocabulary ids of sentence s of article a.
tokenized_articles = []
for article in tqdm(sent_split):
    article_ids = []
    for sent in article:
        pieces = tokenizer.tokenize(sent)
        article_ids.append([tokenizer.vocab[piece] for piece in pieces])
    tokenized_articles.append(article_ids)

100%|██████████| 451629/451629 [07:01<00:00, 1070.99it/s]


In [9]:
def nested_list_len(lst):
    """Return the combined length of all sub-sequences contained in `lst`."""
    return sum(map(len, lst))

In [10]:
# Change this depending on your model: the chunking cell reserves one position of
# this budget (max_seq_len - 1) and the padding cells pass it as `max_seq`.
max_seq_len = 288
# True: split each article into several chunks; False: keep only the leading
# sentences of each article that fit into one chunk.
split_article_into_chunks = True

In [11]:
def _split_article_into_chunks(article, max_len):
    """Greedily pack consecutive sentences of one article into chunks.

    Args:
        article: list of sentences, each a list of token ids.
        max_len: token budget per chunk.

    Returns:
        A list of chunks (each a list of sentences). A chunk is closed as soon as
        the next sentence would push it over `max_len`. A single sentence longer
        than `max_len` becomes a chunk of its own; no empty chunk is produced.
    """
    article_chunks = []
    chunk, chunk_len = [], 0
    for sentence in article:
        # Only close a chunk that actually holds something; otherwise an
        # over-long first sentence would emit an empty chunk.
        if chunk and chunk_len + len(sentence) > max_len:
            article_chunks.append(chunk)
            chunk, chunk_len = [], 0
        chunk.append(sentence)
        chunk_len += len(sentence)  # running total instead of re-summing the chunk
    if chunk:
        article_chunks.append(chunk)
    return article_chunks


def _leading_chunk(article, max_len):
    """Return the longest prefix of `article`'s sentences that fits into `max_len` tokens."""
    chunk, chunk_len = [], 0
    for sentence in article:
        if chunk_len + len(sentence) > max_len:
            break
        chunk.append(sentence)
        chunk_len += len(sentence)
    return chunk


# One position is kept free because a [CLS] id is prepended to every chunk
# later on; max_seq_len has to be the same as the max sequence length your
# model was trained on.
max_len = max_seq_len - 1

chunks = []
# wiki_ids_experimental[j] is the wiki id of the article that chunks[j] came from.
wiki_ids_experimental = []

if split_article_into_chunks:
    # tqdm wraps the list (not the enumerate object) so the bar knows the total.
    for i, article in enumerate(tqdm(tokenized_articles)):
        article_chunks = _split_article_into_chunks(article, max_len)
        chunks.extend(article_chunks)
        wiki_ids_experimental.extend([corpus_full_id[i]] * len(article_chunks))
else:
    # Exactly one chunk per article, so the id list mirrors corpus_full_id.
    for i, article in enumerate(tokenized_articles):
        chunks.append(_leading_chunk(article, max_len))
        wiki_ids_experimental.append(corpus_full_id[i])


451629it [00:03, 140094.50it/s]


In [12]:
# chunks = []
# for article in tokenized_articles:
#     max_len = max_seq_len - 1 # has to be the same as the max sequence length your model was trained on
#     chunk = []
#     for sentence in article:
#         if nested_list_len(chunk) + len(sentence) > max_len:
#             break
#         else:
#             chunk.append(sentence)
#     chunks.append(chunk)

In [13]:
# Sanity check: number of sentences packed into the first chunk.
len(chunks[0])

9

In [14]:
# Collapse each chunk's list of sentences into one flat list of token ids.
flat_chunks = [
    [token for sentence in chunk for token in sentence]
    for chunk in chunks
]

In [15]:
# Compare the number of chunks with the number of articles.
print(len(flat_chunks))
print(len(corpus_full))
# In the recorded run, chunking yields roughly 60,000 more chunks to search than there are articles.

510199
451629


In [16]:
# Drop the large intermediates; only flat_chunks is used from here on.
del chunks, tokenized_articles, sent_split


In [17]:
# Every chunk gets a leading [CLS] id. No [SEP] is appended (per the original
# note, the source code this follows does not use "[SEP]" tokens).
vocab = dict(tokenizer.vocab)
start_token = vocab["[CLS]"]

# Build new lists instead of inserting into flat_chunks in place: re-running
# this cell then cannot prepend a second [CLS], and no throwaway list of None
# is created by a side-effect comprehension. The token id objects are shared
# with flat_chunks, so only the outer lists are duplicated.
chunks_with_cls = [[start_token] + flat_chunk for flat_chunk in flat_chunks]

# Pad to max_seq_len. util.pad_sequence is not shown in this notebook; it is
# assumed to return the padded ids and the matching attention mask - TODO confirm.
chunk, chunk_mask = util.pad_sequence(chunks_with_cls, max_seq=max_seq_len, device=device)

In [18]:
from torch import nn
import transformers

class Encoder(nn.Module):
    """Wrapper around a Hugging Face BERT model with a linear projection on top.

    Args:
        config: dict of settings. `config['bert_model']` names the pretrained
            checkpoint to load. The optional `config['projection_dim']`
            (default 512) is the size of the projected embedding.
    """
    def __init__(self, config):
        super(Encoder, self).__init__()
        # The package-root class is used because the `transformers.modeling_bert`
        # submodule path is not available in newer transformers releases.
        self.encoder = transformers.BertModel.from_pretrained(config['bert_model'])
        # Take the input width from the loaded checkpoint instead of assuming 768.
        hidden_size = self.encoder.config.hidden_size
        self.linear = torch.nn.Linear(hidden_size, config.get('projection_dim', 512)) # from Two-tower paper
        self.config = config

    def forward(self, x, x_mask):
        """Encode a batch of token ids.

        Args:
            x: token ids, passed to the encoder as `input_ids`.
            x_mask: attention mask, passed to the encoder as `attention_mask`.

        Returns:
            (hidden, pooled): `hidden` is the linear projection of the encoder
            state at position 0 (the [CLS] position); `pooled` is the encoder's
            second output, returned unchanged.
        """
        outputs = self.encoder(input_ids=x, attention_mask=x_mask)
        # Index by position rather than tuple-unpacking: the encoder may return a
        # plain tuple or a dict-like output object, and iterating the latter
        # yields its keys instead of the tensors.
        sequence_output, pooled = outputs[0], outputs[1]
        hidden = self.linear(sequence_output[:, 0]) # selects cls token
        return hidden, pooled

In [19]:
# Settings for the encoder and the weight loader. In the cells shown here only
# 'bert_model' (read by Encoder) and 'model_weight' (read by load_model) are used.
config = {
    'bert_model':'bert-base-multilingual-cased',
    'devices':device,
    'do_lower':False,
    'max_seq' :max_seq_len,
    'model_weight':'./trained_models/mbert_ict_288_cls_20epoch_lr=1e-5_512layer.w', # path to your model
    # Derived from the selected device so the flag stays truthful on the CPU fallback.
    'use_cuda':device.type == 'cuda',
}

model = Encoder(config)

In [20]:
def load_model(model, device, weight_path=None):
    """Load trained weights into `model` and return the underlying module.

    Args:
        model: the module to fill; if it has a `.module` attribute (as wrapper
            modules do), the wrapped module is filled and returned instead.
        device: torch.device the stored tensors are mapped onto while loading.
        weight_path: checkpoint file; defaults to the notebook-level
            `config["model_weight"]`.

    Returns:
        The module whose state dict was loaded.

    The checkpoint is expected to be a dict whose 'model' entry holds the state dict.
    """
    if weight_path is None:
        weight_path = config["model_weight"]
    model_to_load = model.module if hasattr(model, 'module') else model
    # map_location=device works for both CPU and GPU; the previous
    # `storage.cuda(device)` callback could only ever target a GPU.
    load_dict = torch.load(weight_path, map_location=device)
    model_to_load.load_state_dict(load_dict['model'])
    return model_to_load

In [21]:
# Load the trained weights. (The original note suggests this line gets commented
# out when measuring the performance of the untrained baseline.)
model = load_model(model, device)
# Move to whichever device was selected instead of unconditionally calling
# .cuda(), which fails on a machine without a GPU.
model.to(device)
model.eval() #turn into evaluation mode

Encoder(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [22]:
import numpy as np
from tqdm import tqdm

In [23]:
# Encode all padded chunks in mini-batches and collect the projected [CLS]
# vectors on the CPU as one numpy matrix.
batch_size = 32 # depends on your gpu vram; ideally a power of 2
chunk_encode = []
with torch.no_grad(): # inference only, no gradients needed
    for start in tqdm(range(0, len(chunk), batch_size)):
        window = slice(start, start + batch_size)
        cls_projection, pooled_unused = model(x=chunk[window], x_mask=chunk_mask[window])
        chunk_encode.append(cls_projection.detach().cpu().numpy())
document_embeddings = np.concatenate(chunk_encode, axis=0)
del chunk_encode

100%|██████████| 15944/15944 [45:01<00:00,  5.90it/s]


In [24]:
# corpus_full_id[:10]

In [25]:
# debug_claims = ["Vietnamská jídla odráží elementy", "Chrám Trandruk se nachází 5km od Cethangu"]
# evidence = ["Vietnamská kuchyně", "Cethang"]


In [26]:
# tokenized_claims[i] is the list of vocabulary ids of claims[i].
tokenized_claims = []
for claim in claims:
    pieces = tokenizer.tokenize(claim)
    tokenized_claims.append([tokenizer.vocab[piece] for piece in pieces])

In [27]:
# Prepend only the [CLS] id, as that's what ICT was trained with. New lists are
# built instead of inserting into tokenized_claims in place, so re-running this
# cell cannot add a second [CLS].
claims_with_cls = [[start_token] + claim_ids for claim_ids in tokenized_claims]
# Pad to max_seq_len. util.pad_sequence is not shown in this notebook; it is
# assumed to return the padded ids and the matching attention mask - TODO confirm.
claims_padded, claims_mask = util.pad_sequence(claims_with_cls, max_seq=max_seq_len, device=device)

In [28]:
# Release cached GPU memory left over from encoding the documents.
torch.cuda.empty_cache()
claims_encode = []
batch_size = 35

with torch.no_grad(): # don't want to accumulate gradients when doing inference
    # Iterate over the padded tensor that is actually sliced below (the same
    # pattern as the document-encoding loop) rather than the raw `claims` array.
    for i in tqdm(range(0, len(claims_padded), batch_size)):
        claim_hidden, claim_encode = model(x=claims_padded[i:i+batch_size], x_mask=claims_mask[i:i+batch_size])
        # claim_hidden is the projected [CLS] vector returned by Encoder.forward
        claims_encode.append(claim_hidden.detach().cpu().numpy())

claim_embeddings = np.concatenate(claims_encode, axis=0)
del claims_encode

100%|██████████| 3067/3067 [05:57<00:00,  8.59it/s]


In [29]:
!pip install faiss-gpu --no-cache --user



In [30]:
import faiss
import numpy as np

def search_top_k(corp_emb, query_emb, embedding_dim=512, k=10, metric=faiss.METRIC_INNER_PRODUCT):
    '''
    Exhaustively search the corpus embeddings for the top k matches of every query.
    Needs GPU (the index is moved to GPU 0).

    Args:
        corp_emb: 2-D array of corpus embeddings, one row per item.
        query_emb: 2-D array of query embeddings, one row per query.
        embedding_dim: number of columns of both arrays.
        k: number of matches to return per query.
        metric: faiss metric, e.g. faiss.METRIC_INNER_PRODUCT or faiss.METRIC_L2;
            more here https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances

    Returns:
        (scores, indices), both with one row per query and k columns.
        With METRIC_INNER_PRODUCT both inputs are L2-normalised first, so the
        scores are cosine similarities (higher = closer). For other metrics
        the scores are whatever that faiss metric defines.
        indices[q][j] is the row of `corp_emb` of the j-th match of query q.

    The caller's arrays are never modified: normalisation is done on copies.
    '''
    if metric == faiss.METRIC_INNER_PRODUCT:
        # np.array(...) always copies, so normalize_L2 (which works in place)
        # cannot silently rewrite the embeddings owned by the caller.
        corp_emb = np.array(corp_emb, dtype=np.float32, order='C')
        query_emb = np.array(query_emb, dtype=np.float32, order='C')
        faiss.normalize_L2(corp_emb) # need to normalize query and corpus vectors for cosine similarity
        faiss.normalize_L2(query_emb)
    else:
        # faiss works on contiguous float32 matrices; this is a no-op when the
        # input already has that layout.
        corp_emb = np.ascontiguousarray(corp_emb, dtype=np.float32)
        query_emb = np.ascontiguousarray(query_emb, dtype=np.float32)

    index = faiss.index_factory(embedding_dim, "Flat", metric) # Flat = exhaustive search
    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index) # use gpu
    gpu_index.add(corp_emb)
    return gpu_index.search(query_emb, k)

def evidence_macro_precision(evidence, label, predicted_evid, max_evidence=None, page_only=True):
    """
    Precision of the predicted evidence for a single claim.

    Args:
        evidence: gold evidence, a list of evidence groups; each group is a list
            of entries where entry[2] is the page and entry[3] the line.
            Entries whose line is None are ignored.
        label: the claim's label; "NOT ENOUGH INFO" claims (any letter case)
            are not scored.
        predicted_evid: predicted pages (page_only=True) or [page, line] pairs.
        max_evidence: if given, only the first `max_evidence` predictions are scored.
        page_only: compare pages only (True) or [page, line] pairs (False).

    Returns:
        (precision, hit): `hit` is 1.0 when the claim is scored and 0.0 when it
        is skipped; `precision` is the fraction of scored predictions found in
        the gold evidence (1.0 when there are no predictions, 0.0 when skipped).
    """
    if label.upper() == "NOT ENOUGH INFO":
        return 0.0, 0.0

    if max_evidence is not None:
        predicted_evid = predicted_evid[:max_evidence]

    if page_only:
        all_evi = [e[2] for eg in evidence for e in eg if e[3] is not None]
    else:
        all_evi = [[e[2], e[3]] for eg in evidence for e in eg if e[3] is not None]

    correct = 0.0
    scored = 0.0
    for prediction in predicted_evid:
        if prediction in all_evi:
            correct += 1.0
        scored += 1.0

    precision = (correct / scored) if scored > 0 else 1.0
    return precision, 1.0

def evidence_macro_recall(evidence, label, predicted_evidence, max_evidence=None, page_only=True):
    """
    Recall of the predicted evidence for a single claim.

    Args:
        evidence: gold evidence, a list of evidence groups; each group is a list
            of entries where entry[2] is the page and entry[3] the line.
        label: the claim's label; "NOT ENOUGH INFO" claims (any letter case)
            are not scored.
        predicted_evidence: predicted pages (page_only=True) or [page, line] pairs.
        max_evidence: if given, only the first `max_evidence` predictions are considered.
        page_only: compare pages only (True) or [page, line] pairs (False).

    Returns:
        (recall, hit): `hit` is 1.0 when the claim is scored and 0.0 when it is
        skipped; `recall` is 1.0 when at least one gold evidence group is fully
        contained in the predictions (or there is no gold evidence), else 0.0.
    """
    # Only claims that are NOT labelled "NOT ENOUGH INFO" are scored.
    if label.upper() == "NOT ENOUGH INFO":
        return 0.0, 0.0

    # If there's no evidence to predict, return 1
    if len(evidence) == 0:
        return 1.0, 1.0

    if max_evidence is not None:
        predicted_evidence = predicted_evidence[:max_evidence]

    for evidence_group in evidence:
        # A separate name is used so the `evidence` argument is not rebound mid-loop.
        if page_only:
            required = [e[2] for e in evidence_group]
        else:
            required = [[e[2], e[3]] for e in evidence_group]
        # We only want to score complete groups of evidence. Incomplete groups are worthless.
        if all(item in predicted_evidence for item in required):
            return 1.0, 1.0

    return 0.0, 1.0

def retriever_score(corpus_embeddings, query_embeddings, evidence, labels,
                    corpus_ids, k=10, metric=faiss.METRIC_INNER_PRODUCT):
    """
    Retrieve the top k corpus items for every query and score them against the gold evidence.

    Args:
        corpus_embeddings: 2-D array, one row per corpus item.
        query_embeddings: 2-D array, one row per query (claim).
        evidence: gold evidence per claim; evidence[i] belongs to query i.
        labels: label per claim; labels[i] belongs to query i.
        corpus_ids: id of every corpus item; corpus_ids[j] belongs to row j of
            `corpus_embeddings`.
        k: number of items retrieved per query.
        metric: faiss metric forwarded to search_top_k.

    Returns:
        (precision, recall, f1), macro-averaged over the scored claims
        (evidence_macro_precision / evidence_macro_recall decide which claims count).
    """
    macro_precision = 0
    macro_precision_hits = 0

    macro_recall = 0
    macro_recall_hits = 0

    corpus_embeddings = np.asarray(corpus_embeddings)
    _, I = search_top_k(corpus_embeddings, np.asarray(query_embeddings),
                        embedding_dim=corpus_embeddings.shape[1], metric=metric, k=k)

    # Convert once: np.take on a Python list would rebuild the array for every query.
    corpus_ids = np.asarray(corpus_ids)

    for i, top_k_idxs in enumerate(I):
        if i%100 == 0 and i != 0:
            print("claim ", i)
        # faiss fills missing results with -1 (e.g. k larger than the corpus);
        # drop them so they are not read as "last element" by numpy.
        top_k_idxs = top_k_idxs[top_k_idxs >= 0]
        predicted_evidence = corpus_ids[top_k_idxs]

        macro_prec = evidence_macro_precision(evidence[i], labels[i], predicted_evidence, max_evidence=None, page_only=True)
        macro_precision += macro_prec[0]
        macro_precision_hits += macro_prec[1]

        macro_rec = evidence_macro_recall(evidence[i], labels[i], predicted_evidence)
        macro_recall += macro_rec[0]
        macro_recall_hits += macro_rec[1]

    pr = (macro_precision / macro_precision_hits) if macro_precision_hits > 0 else 1.0
    rec = (macro_recall / macro_recall_hits) if macro_recall_hits > 0 else 0.0
    # Guard the harmonic mean: precision and recall can both be 0.
    f1 = (2.0 * pr * rec / (pr + rec)) if (pr + rec) > 0 else 0.0

    return pr, rec, f1


In [35]:
# Print the sizes of the arrays that have to line up for scoring.
for sized in (document_embeddings, claim_embeddings, evidence, labels, corpus_full_id, wiki_ids_experimental):
    print(len(sized))

510199
107330
107330
107330
451629
510199


In [None]:
# Score the first 2000 claims using the top 500 retrieved chunks per claim.
# evidence and labels are passed unsliced: retriever_score indexes them by the
# query position, so entries 0..1999 line up with the sliced claim embeddings.
retriever_score(document_embeddings, claim_embeddings[:2000], evidence, labels, wiki_ids_experimental, k=500)

In [None]:
# Retrieve the 2 best-matching chunks for every claim; the scores are discarded
# and I[q] holds the row indices into document_embeddings for claim q.
_, I = search_top_k(document_embeddings, claim_embeddings, k=2)

In [None]:
# I[0] holds row indices into document_embeddings, which has one row per chunk.
# They must be mapped through the per-chunk id list (wiki_ids_experimental), not
# the per-article corpus_full_id: that array is shorter, so the lookup would
# return the wrong article or run out of bounds.
predicted_evidence = np.take(wiki_ids_experimental, I[0])
print(predicted_evidence)