# Evaluate DistilBERT Model

In [20]:
import datetime
import json
import pickle
import sys

import pandas as pd
import sklearn.metrics
import torch
import tqdm
import transformers as hft

sys.path.insert(0, "/home/jupyter")
import util.log
import util.data

In [2]:
# --- Evaluation configuration ---
# Identifier handed to from_pretrained to build the base architecture.
MODEL_NAME = "distilbert-base-uncased"
# Fine-tuned checkpoint file that is read with torch.load.
MODEL_FILE = "saved_model_epoch10_20201128_2225.tar"
# Path handed to util.data.load_dataset_from_path for the evaluation data.
VAL_ENCODINGS = "val_encodings_23Nov_gcpf_distilbert.pickle"
# Batch size used by the evaluation DataLoader.
EVAL_BATCH_SIZE = 16

# Free-text summary stored in the metrics record.
DESCRIPTION = "Distilbert trained on politifact and gossipcop training data."

In [3]:
# Load the dataset stored at VAL_ENCODINGS and report how many examples it has.
eval_dataset = util.data.load_dataset_from_path(VAL_ENCODINGS)
num_eval_examples = len(eval_dataset)
print(num_eval_examples)

3663


In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
# NOTE(review): torch.load unpickles the file, so MODEL_FILE should only ever
# point at a checkpoint from a trusted source.
# map_location places the restored tensors on the selected device.
checkpoint = torch.load(MODEL_FILE, map_location=device)

In [6]:
# Build the sequence-classification architecture from the base model name.
# The fine-tuned weights are loaded from the checkpoint in a later cell.
model = hft.DistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [7]:
# Overwrite the model's parameters with those saved under the checkpoint's
# "model" key; the returned key-matching report is shown as the cell output.
finetuned_state = checkpoint["model"]
model.load_state_dict(finetuned_state)

<All keys matched successfully>

In [8]:
# Move the model to the selected device and switch it to evaluation mode.
# The trailing semicolon suppresses the cell's repr output.
model.to(device).eval();

In [9]:
# Batch the evaluation dataset in its stored order (no shuffling), so row i of
# the collected results corresponds to example i of the dataset.
eval_loader = torch.utils.data.DataLoader(
    eval_dataset,
    batch_size=EVAL_BATCH_SIZE,
    shuffle=False,
)

In [10]:
# Run the model over every evaluation batch and collect, per article, the
# true label, the predicted label and the pair of raw logits.
labels = []
preds = []
logits = []
with torch.no_grad():  # evaluation only: skip gradient tracking
    # Wrap the loader itself rather than an enumerate object: tqdm can then
    # read len(eval_loader) and show a progress bar with a total and an ETA
    # instead of a bare iteration counter. The batch index was never used.
    for batch in tqdm.tqdm(eval_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        output = model(input_ids,
                       attention_mask=attention_mask,
                       output_hidden_states=False,
                       output_attentions=False)
        # First element of the model output: one row of two logits per article.
        batch_logits = output[0].detach().cpu().numpy()
        # Predict 0 only when the class-0 logit is strictly larger; ties go to 1.
        batch_preds = [0 if row[0] > row[1] else 1 for row in batch_logits]
        batch_labels = batch["labels"].numpy()
        labels.extend(batch_labels)
        preds.extend(batch_preds)
        logits.extend(batch_logits)

229it [00:36,  6.35it/s]


In [11]:
# Class convention: label 0 means real, label 1 means fake.
# Split the per-article logit pairs into one column per class.
logit_real = [pair[0] for pair in logits]
logit_fake = [pair[1] for pair in logits]
eval_results = pd.DataFrame({
    "Article": list(range(len(labels))),
    "Predictions": preds,
    "Labels": labels,
    "Logit_real(0)": logit_real,
    "Logit_fake(1)": logit_fake,
})

In [12]:
# Display the per-article results table.
eval_results

Unnamed: 0,Article,Predictions,Labels,Logit_real(0),Logit_fake(1)
0,0,0,0,2.850009,-3.273690
1,1,0,0,0.377962,-0.371999
2,2,0,0,2.692442,-3.026399
3,3,0,0,2.985261,-3.340193
4,4,1,1,-3.186574,3.877971
...,...,...,...,...,...
3658,3658,1,1,-3.505335,4.226383
3659,3659,0,0,1.093364,-1.092360
3660,3660,1,1,-3.459407,4.121661
3661,3661,1,1,-2.985819,3.614302


In [13]:
# Show only the articles the model predicted as class 1 (fake).
predicted_fake = eval_results["Predictions"] == 1
eval_results[predicted_fake]

Unnamed: 0,Article,Predictions,Labels,Logit_real(0),Logit_fake(1)
4,4,1,1,-3.186574,3.877971
5,5,1,1,-0.311678,0.443620
24,24,1,1,-0.552113,0.699427
28,28,1,0,-1.277365,1.526949
32,32,1,1,-3.702299,4.498945
...,...,...,...,...,...
3648,3648,1,1,-2.688560,3.180310
3649,3649,1,1,-2.844619,3.345943
3658,3658,1,1,-3.505335,4.226383
3660,3660,1,1,-3.459407,4.121661


In [14]:
# Print the (real, fake) logit pair for the first 25 articles.
real_col, fake_col = "Logit_real(0)", "Logit_fake(1)"
for row in range(25):
    print(eval_results.at[row, real_col], eval_results.at[row, fake_col])

2.850008726119995 -3.2736902236938477
0.37796199321746826 -0.3719988465309143
2.6924424171447754 -3.0263993740081787
2.9852609634399414 -3.3401927947998047
-3.1865742206573486 3.8779709339141846
-0.31167763471603394 0.44361987709999084
0.3588881194591522 -0.2540079653263092
3.916956901550293 -4.597268581390381
1.934731364250183 -2.0337040424346924
2.6825239658355713 -2.98710036277771
3.9099483489990234 -4.566473484039307
3.1525959968566895 -3.4811224937438965
3.0261616706848145 -3.4413564205169678
3.5827066898345947 -4.13591194152832
3.302001714706421 -3.752885103225708
0.9585110545158386 -1.0261831283569336
3.839869499206543 -4.504690170288086
2.9708428382873535 -3.3438961505889893
3.9053101539611816 -4.5931596755981445
3.6977710723876953 -4.349445819854736
3.741704225540161 -4.385188579559326
3.497767686843872 -4.073032379150391
3.405294179916382 -3.961172580718994
1.3517165184020996 -1.5384490489959717
-0.5521133542060852 0.6994267106056213


In [15]:
# average="binary" scores a single positive class; with sklearn's default
# pos_label that is label 1, i.e. "fake" in this notebook's convention.
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    y_true=labels,
    y_pred=preds,
    average="binary",
)
accuracy = sklearn.metrics.accuracy_score(y_true=labels, y_pred=preds)

In [16]:
# Summary record of this evaluation run; saved to JSON further down.
metrics = {
    "description": DESCRIPTION,
    "model": MODEL_FILE,
    "epochs": checkpoint["epoch"],
    # NOTE(review): 32 differs from EVAL_BATCH_SIZE; presumably this is the
    # training batch size. Confirm against the training run.
    "batch_size": 32,
    # Stamp the actual run date. The previous hard-coded "23Nov2020" predates
    # the checkpoint being evaluated (its file name carries 20201128), so it
    # was a stale value. The format is kept as day, month abbreviation, year.
    "Date_evaluated": datetime.date.today().strftime("%d%b%Y"),
    # NOTE(review): hard-coded training-data name; verify it matches the
    # data this checkpoint was actually trained on.
    "train_data": "train_encodings_23Nov_gcpf_distilbert_1000.pickle",
    "eval_data": VAL_ENCODINGS,
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1,
}

In [22]:
# Display the metrics record that is about to be saved.
metrics

{'description': 'Distilbert trained on politifact and gossipcop training data.',
 'model': 'saved_model_epoch10_20201128_2225.tar',
 'epochs': 10,
 'batch_size': 32,
 'Date_evaluated': '23Nov2020',
 'train_data': 'train_encodings_23Nov_gcpf_distilbert_1000.pickle',
 'eval_data': 'val_encodings_23Nov_gcpf_distilbert.pickle',
 'accuracy': 0.8542178542178542,
 'precision': 0.7379134860050891,
 'recall': 0.6387665198237885,
 'f1': 0.6847697756788665}

In [24]:
# Persist the metrics record as JSON in the current working directory.
with open("evaluation_metrics_10_epochs.json", "wt") as metrics_file:
    json.dump(metrics, metrics_file)

In [25]:
# Save the per-article results. index=True is the pandas default, written out
# here because the written index column repeats the values in "Article".
eval_results.to_csv("evaluation_results_10_epochs.csv", index=True)

In [19]:
# Sum of the true labels; assuming labels are only 0/1 (1 = fake), this is
# the number of fake articles in the evaluation set.
sum(labels)

908