# Evaulate Distilbert Model with Test Data

In [1]:
import json
import pickle
import sys

import pandas as pd
import sklearn.metrics
import tqdm
import torch
import transformers as hft

sys.path.insert(0, "/home/jupyter")
import util.log
import util.data

In [2]:
MODEL_NAME = "distilbert-base-uncased"
MODEL_FILE ="saved_model_epoch10_20201128_2225.tar"
TEST_DATASET = "test_dataset_all_29Nov_distilbert.pickle"
EVAL_BATCH_SIZE = 32

DESCRIPTION = "Distilbert trained on politifact and gossipcop test data."

In [3]:
with open(TEST_DATASET, "rb") as pfile:
    test_dataset = pickle.load(pfile)

In [7]:
checkpoint = torch.load(MODEL_FILE)

In [8]:
model = (hft.DistilBertForSequenceClassification
         .from_pretrained(MODEL_NAME))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [9]:
model.load_state_dict(checkpoint["model"])

<All keys matched successfully>

In [10]:
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
device

device(type='cuda')

In [11]:
model.to(device)
model.eval();

In [12]:
eval_loader = torch.utils.data.DataLoader(test_dataset,
                                batch_size=EVAL_BATCH_SIZE,
                                shuffle=False)

In [14]:
labels = []
preds = []
logits = []
probs = []
sources = []
article_tokens = []
file_names = []

softmax = torch.nn.Softmax(dim=1)

with torch.no_grad():
    for art_num, article in tqdm.tqdm(enumerate(eval_loader)):
        input_ids = article["input_ids"].to(device)
        attent_mask = article["attention_mask"].to(device)
        output = model(input_ids,
                       attention_mask=attent_mask,
                       output_hidden_states=False,
                       output_attentions=False)
        prob = softmax(output[0].detach().cpu()).numpy()
        logit = output[0].detach().cpu().numpy()
        pred = [0 if lgt[0] > lgt[1] else 1 for lgt in logit]
        label = article["labels"].numpy()
        labels.extend(label)
        preds.extend(pred)
        logits.extend(logit)
        probs.extend(prob)
        article_tokens.extend(article["article_tokens"].numpy())
        sources.extend(article["sources"])
        file_names.extend(article["file_names"])

115it [00:31,  3.66it/s]


In [15]:
# Label of 0 is real, 1 is fake
eval_results = pd.DataFrame({"Article": list(range(len(labels))),
                             "Source": sources,
                             "Token_Length": article_tokens,
                             "Predictions": preds,
                             "Labels": labels,
                             "Logit_real(0)": [x[0] for x in logits],
                             "Logit_fake(1)": [x[1] for x in logits],
                             "Prob_real(0)": [x[0] for x in probs],
                             "Prob_real(1)": [x[1] for x in probs],
                             "File_Name": file_names})

eval_results

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,gossipcop,135,0,1,3.591313,-4.220492,0.999595,0.000405,gossipcop-4970023834
1,1,gossipcop,151,0,0,3.811080,-4.437657,0.999739,0.000262,gossipcop-928976
2,2,gossipcop,428,0,0,3.787916,-4.433542,0.999731,0.000269,gossipcop-895265
3,3,gossipcop,488,0,0,3.554034,-4.068045,0.999511,0.000489,gossipcop-869160
4,4,gossipcop,1306,0,0,3.700177,-4.342400,0.999679,0.000321,gossipcop-913152
...,...,...,...,...,...,...,...,...,...,...
3655,3655,gossipcop,472,0,0,3.797359,-4.427724,0.999732,0.000268,gossipcop-919460
3656,3656,gossipcop,638,0,0,3.385705,-3.908506,0.999321,0.000679,gossipcop-924850
3657,3657,gossipcop,289,0,0,1.372837,-1.613860,0.951970,0.048030,gossipcop-850273
3658,3658,gossipcop,345,0,0,3.876119,-4.598004,0.999791,0.000209,gossipcop-897736


In [19]:
# Save evaluation data to csv file
eval_results.to_csv("test_data_" + MODEL_FILE[12:-4] + ".csv")

In [16]:
eval_results.query("Predictions == 1")

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
8,8,gossipcop,813,1,1,-1.946828,2.410327,0.012653,0.987347,gossipcop-2332731598
11,11,gossipcop,1502,1,1,-1.775744,2.059896,0.021131,0.978869,gossipcop-9938144731
12,12,gossipcop,300,1,1,-3.371477,4.078383,0.000581,0.999419,gossipcop-1026488145
20,20,gossipcop,231,1,1,-2.725329,3.237154,0.002567,0.997433,gossipcop-3578554429
22,22,gossipcop,495,1,0,-0.715122,0.706294,0.194440,0.805560,gossipcop-942913
...,...,...,...,...,...,...,...,...,...,...
3639,3639,gossipcop,120,1,0,-0.902984,0.997764,0.130024,0.869976,gossipcop-902421
3640,3640,gossipcop,553,1,0,-0.492071,0.574124,0.256127,0.743873,gossipcop-906911
3642,3642,gossipcop,244,1,1,-0.171820,0.138512,0.423034,0.576966,gossipcop-8975289821
3644,3644,gossipcop,296,1,1,-3.580489,4.345992,0.000361,0.999639,gossipcop-5875692446


In [17]:
for x in range(0, 10):
    print(eval_results["Logit_real(0)"][x], eval_results["Logit_fake(1)"][x])

3.591313362121582 -4.220492362976074
3.811079502105713 -4.437656879425049
3.7879157066345215 -4.433541774749756
3.5540339946746826 -4.068045139312744
3.700176954269409 -4.342399597167969
2.970214366912842 -3.4135096073150635
2.9841504096984863 -3.3099899291992188
3.1035165786743164 -3.4988579750061035
-1.9468281269073486 2.410327434539795
3.6806998252868652 -4.260814189910889


In [28]:
# Overall Metrics
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    eval_results.Labels, eval_results.Predictions, average="binary")
accuracy = sklearn.metrics.accuracy_score(eval_results.Labels, eval_results.Predictions)

metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating both gossipcop and poltifact data",
           "model": MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_source_distilbert.pickle",
           "val_data": "eval_data_" + MODEL_FILE[12:-4] + ".csv",
           "eval_data": TEST_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
metrics

{'description': 'Distilbert trained on politifact and gossipcop test data.',
 'eval_notes': 'Evaluating both gossipcop and poltifact data',
 'model': 'saved_model_epoch10_20201128_2225.tar',
 'epochs': 10,
 'batch_size': 32,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_source_distilbert.pickle',
 'val_data': 'eval_data_epoch10_20201128_2225.csv',
 'eval_data': 'test_dataset_all_29Nov_distilbert.pickle',
 'accuracy': 0.842896174863388,
 'precision': 0.7210460772104608,
 'recall': 0.6225806451612903,
 'f1': 0.668205424120023}

In [22]:
eval_results

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,gossipcop,135,0,1,3.591313,-4.220492,0.999595,0.000405,gossipcop-4970023834
1,1,gossipcop,151,0,0,3.811080,-4.437657,0.999739,0.000262,gossipcop-928976
2,2,gossipcop,428,0,0,3.787916,-4.433542,0.999731,0.000269,gossipcop-895265
3,3,gossipcop,488,0,0,3.554034,-4.068045,0.999511,0.000489,gossipcop-869160
4,4,gossipcop,1306,0,0,3.700177,-4.342400,0.999679,0.000321,gossipcop-913152
...,...,...,...,...,...,...,...,...,...,...
3655,3655,gossipcop,472,0,0,3.797359,-4.427724,0.999732,0.000268,gossipcop-919460
3656,3656,gossipcop,638,0,0,3.385705,-3.908506,0.999321,0.000679,gossipcop-924850
3657,3657,gossipcop,289,0,0,1.372837,-1.613860,0.951970,0.048030,gossipcop-850273
3658,3658,gossipcop,345,0,0,3.876119,-4.598004,0.999791,0.000209,gossipcop-897736


In [32]:
# Save metrics to JSON text file.
with open("eval_metrics_" + MODEL_FILE[12:-4] + ".json", "wt") as jfile:
    json.dump(metrics, jfile)

In [23]:
sum(labels)

930

In [33]:
cm = pd.DataFrame(sklearn.metrics.confusion_matrix(eval_results.Labels, eval_results.Predictions))
label_titles = {0: "real", 1: "fake"}
cm.rename(index=label_titles, columns=label_titles, inplace=True)
cm

Unnamed: 0,real,fake
real,2506,224
fake,351,579
