# **Model Testing** 

In [0]:
!pip install fever-scorer

Collecting fever-scorer
  Downloading https://files.pythonhosted.org/packages/61/d1/95f1133ded0d74a9d24fe5e15c43f2b3c31f018d0227fa34376f93cf0f08/fever-scorer-2.0.39.tar.gz
Building wheels for collected packages: fever-scorer
  Building wheel for fever-scorer (setup.py) ... [?25l[?25hdone
  Created wheel for fever-scorer: filename=fever_scorer-2.0.39-cp36-none-any.whl size=3585 sha256=74e9645e63f24b30c710c6426c026e0baa892a0a3fb1262c92af7707b94e8799
  Stored in directory: /root/.cache/pip/wheels/9d/f1/2f/bdeac68eff673e4c1cfaab09d14438cd4e4c8a585aeba7ff40
Successfully built fever-scorer
Installing collected packages: fever-scorer
Successfully installed fever-scorer-2.0.39


In [0]:
# Mount Google Drive at /content/gdrive (interactive authorization in Colab).
# The data, vocabulary and checkpoint paths used in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import torch
import spacy
from tqdm import tqdm
from torchtext import data
from torchtext import datasets
import torch.nn.functional as tnf
from fever.scorer import fever_score
import pandas as pd
import pdb
import dill

In [0]:
# Pre-processed test candidates (CSV) that are loaded into the torchtext dataset.
test_path = "/content/gdrive/My Drive/NLPWikiData/processed_test_data3.csv"
# Original test claims (JSON Lines); get_score_test reads its 'id' and 'claim' columns.
org_test_path = "/content/gdrive/My Drive/NLPWikiData/test.jsonl"
# Saved state dict that is loaded into the LSTM model.
model_path = "/content/gdrive/My Drive/sent_selec_E4_0.425.pt"
# Output file (JSON Lines) that receives the selected sentences per claim.
sen_pred_test_path = "/content/gdrive/My Drive/NLPWikiData/sen_pred_test3.jsonl"

In [0]:
# TEXT: spaCy-tokenized text field. include_lengths=True makes each batch attribute a
# (token tensor, lengths) pair, which is how LSTM.forward unpacks claims and sentences.
TEXT = data.Field(include_lengths = True, tokenize='spacy')
# LABEL: label field; the test-set field mapping in this notebook does not attach it to any column.
LABEL = data.LabelField()
# OTHER: raw field used for columns that are carried through untokenized
# (the original sentence string and the doc/claim/sentence key).
OTHER = data.RawField()
# NOTE(review): presumably set by hand because RawField in the installed torchtext
# does not define `is_target` itself — confirm against the torchtext version in use.
OTHER.is_target = False

In [0]:
# Maps each CSV column name to (attribute name on the example, field used to process it).
# sentence/claim are tokenized via TEXT; org_sentence and docid_claimid_sentno pass through OTHER.
testset_fields = {"sentence":("sentence",TEXT), "claim":("claim", TEXT), 
                 "org_sentence":("org_sentence",OTHER), "docid_claimid_sentno":("docid_claimid_sentno",OTHER)}

In [0]:
# Restore the dill-serialized TEXT and LABEL objects; their `.vocab` attributes are
# copied onto this notebook's fields further down.
# NOTE(review): dill.load can execute arbitrary code while unpickling — only load
# files that come from a trusted source.
with open("/content/gdrive/My Drive/TEXT_VOCAB_5EPOCH", "rb") as f:
    TEST_TEXT = dill.load(f)
    print("Text Load Successfull")
with open("/content/gdrive/My Drive/LABEL_VOCAB_5EPOCH", "rb") as f:
    TEST_LABEL = dill.load(f)
    print("Label Load Successfull")

Text Load Successfull
Label Load Successfull


In [0]:
# Build the test dataset from the CSV using the column -> field mapping above.
# NOTE(review): the field spec is a dict keyed by column name, so torchtext needs the
# header row to locate the columns; skip_header=False is therefore required here —
# confirm against the installed torchtext version.
testset = data.TabularDataset(test_path, format="CSV", fields=testset_fields, skip_header=False)

In [0]:
# Sanity check: number of candidate rows and the attributes of the first parsed example.
print(len(testset))
print(vars(testset.examples[0]))

980291
{'sentence': ['Henry', 'Spencer', 'is', 'a', 'Canadian', 'computer', 'programmer', 'and', 'space', 'enthusiast', '.'], 'claim': ['Henry', 'Spencer', 'is', 'played', 'by', 'a', 'Greek', 'actor', '.'], 'org_sentence': 'Henry Spencer is a Canadian computer programmer and space enthusiast .', 'docid_claimid_sentno': 'Henry_Spencer_-LRB-disambiguation-RRB-{#--#}89296{#--#}0'}


In [0]:
# NOTE(review): the vocabulary built here is replaced a few cells later by the one loaded
# from TEST_TEXT, so this pass over the test set looks redundant — confirm before removing.
TEXT.build_vocab(testset)

In [0]:
# NOTE(review): LABEL is not one of the fields in testset_fields, and LABEL.vocab is
# replaced by TEST_LABEL.vocab a few cells later — this call looks redundant; confirm before removing.
LABEL.build_vocab(testset)

In [0]:
# Use the vocabulary restored from disk so token indices line up with the loaded model.
# Rebinding the Vocab object is sufficient: itos/stoi are attributes of that same object,
# so re-assigning them onto TEXT.vocab afterwards would be a no-op.
TEXT.vocab = TEST_TEXT.vocab

In [0]:
# Use the label vocabulary restored from disk. Rebinding the Vocab object is sufficient:
# itos/stoi are attributes of that same object, so re-assigning them would be a no-op.
LABEL.vocab = TEST_LABEL.vocab

In [0]:
# Vocabulary sizes after swapping in the vocabularies restored from disk
# (the training-data vocabulary; the same one is used for dev and test).
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 90622
Unique tokens in LABEL vocabulary: 2


In [0]:
# Inspect the vocabularies in use: the 20 most frequent tokens, the first ten
# index-to-token entries, and the full attribute dict of the LABEL vocabulary.
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10])
print(vars(LABEL.vocab))

[('.', 6679593), (',', 5993860), ('the', 5457627), ('in', 3084025), ('and', 3053898), ('of', 2935330), ('a', 2786757), ('is', 2062525), ('was', 1467657), ('to', 1264993), ('The', 1254807), ('-LRB-', 1104248), ('-RRB-', 1104213), ('-', 1028424), ('for', 960542), ('as', 863462), ("'s", 799846), ('by', 771416), ('`', 761885), ('an', 748843)]
['<unk>', '<pad>', '.', ',', 'the', 'in', 'and', 'of', 'a', 'is']
{'freqs': Counter({'False': 3082707, 'True': 264198}), 'itos': ['False', 'True'], 'stoi': defaultdict(<function _default_unk_index at 0x7fbaf8759620>, {'False': 0, 'True': 1}), 'vectors': None}


In [0]:
# Inference batch size and compute device (CUDA when available, otherwise CPU).
BATCH_SIZE = 128
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("We are working with ", device)

We are working with  cuda


In [0]:
# Iterator over the test set for inference.
# train=False marks this as an evaluation iterator, which turns off the shuffling that the
# default (train=True) applies, so batches come out in a reproducible order between runs.
# sort=False keeps the pooled bucketing instead of globally sorting the whole dataset.
# sort_within_batch orders each batch by claim length (the sort_key).
test_iterator = data.BucketIterator(
    testset,
    batch_size = BATCH_SIZE,
    train = False,
    sort = False,
    sort_within_batch = True,
    sort_key = lambda x: (len(x.claim)),
    device = device)

In [0]:
import torch.nn as nn

class LSTM(nn.Module):
    """Claim/sentence pair classifier built on one shared embedding + LSTM encoder.

    Both the claim and the candidate sentence are encoded with the same embedding
    layer and the same LSTM; their final hidden states are concatenated and fed to
    a linear layer that produces `output_dim` logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size (per direction).
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        pad_idx: accepted for interface compatibility; not used by this module.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, pad_idx):
        super().__init__()

        self.bidirectional = bool(bidirectional)
        # One final hidden state per direction is kept for each encoded sequence.
        num_directions = 2 if self.bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                            num_layers=n_layers, bidirectional=self.bidirectional)
        # Claim and sentence encodings are concatenated, hence the extra factor of 2.
        # For bidirectional=True this equals the previous hard-coded hidden_dim*2*2.
        self.fc = nn.Linear(hidden_dim * num_directions * 2, output_dim)

    def forward_again(self, text, text_lengths):
        """Encode one batch of token sequences into a fixed-size vector.

        Args:
            text: token-index tensor laid out as [sent_len, batch_size].
            text_lengths: true (unpadded) length of every sequence in the batch.

        Returns:
            Tensor of shape [batch_size, hidden_dim * num_directions] holding the
            last layer's final hidden state (both directions concatenated when
            the LSTM is bidirectional).
        """
        embedded = self.embedding(text)

        # pack_padded_sequence expects the lengths on the CPU; recent PyTorch releases
        # raise if they are handed a CUDA tensor, so move them explicitly.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        # enforce_sorted=False: batches do not have to be ordered by this field's length.
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)

        _, (hidden_state, _) = self.lstm(packed)

        if self.bidirectional:
            # Last layer: index -2 is the forward direction, -1 the backward direction.
            return torch.cat((hidden_state[-2, :, :], hidden_state[-1, :, :]), 1)
        return hidden_state[-1, :, :]

    def forward(self, claims, sentences):
        """Score claim/sentence pairs.

        Args:
            claims: (token tensor, lengths) pair for the claims.
            sentences: (token tensor, lengths) pair for the candidate sentences.

        Returns:
            Logits of shape [batch_size, output_dim].
        """
        claim_text, claim_text_length = claims[0], claims[1]
        sentence_text, sentence_text_length = sentences[0], sentences[1]

        claim_hidden = self.forward_again(claim_text, claim_text_length)
        sentence_hidden = self.forward_again(sentence_text, sentence_text_length)

        return self.fc(torch.cat((claim_hidden, sentence_hidden), 1))

In [0]:
# Model hyper-parameters. The state dict loaded in the next cell must fit the layer
# shapes these values produce.
INPUT_DIM = len(TEXT.vocab)  # embedding rows = size of the vocabulary in use
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 2
N_LAYERS = 1
BIDIRECTIONAL = True
# DROPOUT = 0.5
# Index of the padding token in the vocabulary; passed to LSTM, which does not use it.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = LSTM(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            PAD_IDX)

In [0]:
# The loss object is created and moved to the device, but evaluate() in this notebook
# has its loss computation commented out, so it is not used for the test run.
criterion = nn.CrossEntropyLoss()
# map_location remaps the stored tensors onto the selected device while loading.
model.load_state_dict(torch.load(model_path, map_location=device)) 
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def evaluate(model, iterator, file):
    """Run the model over every batch and assemble per-claim sentence predictions.

    Args:
        model: callable as ``model(claims, sentences)`` returning logits with at
            least two columns; it is switched to eval mode and left there.
        iterator: yields batches exposing ``claim``, ``sentence``,
            ``docid_claimid_sentno`` and ``org_sentence``.
        file: unused; kept so existing calls keep working.

    Returns:
        The ``(file_data, fever_data)`` pair produced by ``get_score_test``.

    Note:
        Reads the module-level ``org_test_path`` to locate the original claims file.
    """
    probabilities = []
    docid_claimid_sentno = []
    org_sentences = []

    # Switching to inference mode does not depend on the batch, so do it once up front.
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.claim, batch.sentence)
            # Keep only the softmax probability of class index 1 for each candidate.
            class_one_prob = tnf.softmax(logits, 1)[:, 1]

            probabilities.extend(class_one_prob.tolist())
            docid_claimid_sentno.extend(batch.docid_claimid_sentno)
            org_sentences.extend(batch.org_sentence)

    file_data, fever_data = get_score_test(probabilities, docid_claimid_sentno, org_sentences, org_test_path)
    print('-----------------------------')

    return file_data, fever_data

In [0]:
def get_score_test(probabilities, docid_claimid_sentno, org_sentence, org_test_path, top_k=5):
    """Group scored candidate sentences by claim and keep the best ones per claim.

    Args:
        probabilities: one score per candidate sentence.
        docid_claimid_sentno: one key per candidate, formatted as
            ``<doc_id>{#--#}<claim_id>{#--#}<sentence_no>``.
        org_sentence: one original sentence string per candidate.
        org_test_path: path to a JSON Lines file with ``id`` and ``claim`` columns.
        top_k: how many of the highest-scoring candidates to keep per claim
            (default 5; ``None`` keeps all of them).

    Returns:
        ``(file_data, fever_data)``. ``file_data`` has one dict per claim in the
        claims file, in file order, with keys ``id``, ``claim``, ``sentences``,
        ``page_ids`` and ``indices`` (empty lists when the claim has no candidates;
        ``indices`` holds the sentence numbers as strings, exactly as found in the key).
        ``fever_data`` is always an empty list — no FEVER scoring is done here.

    Raises:
        ValueError: if the three per-candidate sequences differ in length, or a
            key does not contain the two separators.
    """
    if not (len(probabilities) == len(docid_claimid_sentno) == len(org_sentence)):
        raise ValueError("probabilities, docid_claimid_sentno and org_sentence must have the same length")

    org_test_data = pd.read_json(org_test_path, lines=True)

    # claim id -> list of candidate records for that claim
    claim_dict = dict()
    for key, probability, sentence in zip(docid_claimid_sentno, probabilities, org_sentence):
        # Split from the right so a document id that itself contains the separator
        # stays intact; the last two fields are always claim id and sentence number.
        doc_id, claim_id, sentno = key.rsplit("{#--#}", 2)
        claim_dict.setdefault(int(claim_id), []).append(
            {"probability": probability, "doc_id": doc_id,
             "sentno": sentno, "org_sentence": sentence})

    file_data = []
    fever_data = []
    prob_count = 0  # number of candidates that belong to a claim present in the claims file

    for org_test_claim_id, org_test_claim in zip(org_test_data['id'], org_test_data['claim']):
        org_test_claim_id = int(org_test_claim_id)

        # A claim without any scored candidate simply gets empty lists.
        candidates = claim_dict.get(org_test_claim_id, [])
        prob_count += len(candidates)

        # sorted() is stable, so candidates with equal scores keep their input order.
        best = sorted(candidates, key=lambda c: c["probability"], reverse=True)[:top_k]

        # record for the RTE .jsonl file
        file_data.append({
            'id': org_test_claim_id,
            'claim': org_test_claim,
            'sentences': [c["org_sentence"] for c in best],
            'page_ids': [c["doc_id"] for c in best],
            'indices': [c["sentno"] for c in best],
        })

    print('prob_count', prob_count)
    return file_data, fever_data

In [0]:
# Score every test candidate, then write the per-claim selected sentences as JSON Lines
# (one record per claim) to sen_pred_test_path.
test_file_data, test_fever_data = evaluate(model, test_iterator, test_path)
pd.DataFrame(test_file_data).to_json(sen_pred_test_path, orient='records', lines=True)
# FEVER scoring is left disabled: get_score_test returns an empty fever_data list,
# and test_fever_score is not defined anywhere in this notebook.
# test_fever_val, test_accuracy, test_precision, test_recall, f1score = test_fever_score(fever_data)
# print(f'Fever Score: {fever_val} | Accuracy: {accuracy}')
# print(f'Precision: {precision} | Recall: {recall} | F1Score: {f1score}')

prob_count 980291
-----------------------------
