# Evaulate Distilbert Model with Test Data
### Includes Titles

In [2]:
import json
import pickle
import sys

import pandas as pd
import sklearn.metrics
import tqdm
import torch
import transformers as hft

sys.path.insert(0, "/home/jupyter")
import util.log
import util.data

In [4]:
MODEL_NAME = "distilbert-base-uncased"
MODEL_FILE ="saved_model_epoch10_20201129_0335.tar"
TEST_DATASET = "test_dataset_all_titles_29Nov_distilbert.pickle"
EVAL_BATCH_SIZE = 64

DESCRIPTION = "Distilbert trained on politifact and gossipcop test data, with encoded titles and weighting."

In [5]:
with open(TEST_DATASET, "rb") as pfile:
    test_dataset = pickle.load(pfile)

In [6]:
checkpoint = torch.load(MODEL_FILE)
model = (hft.DistilBertForSequenceClassification
         .from_pretrained(MODEL_NAME))
model.load_state_dict(checkpoint["model"])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

<All keys matched successfully>

In [7]:
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
model.to(device)
model.eval();

In [8]:
eval_loader = torch.utils.data.DataLoader(test_dataset,
                                batch_size=EVAL_BATCH_SIZE,
                                shuffle=False)

In [9]:
labels = []
preds = []
logits = []
probs = []
sources = []
article_tokens = []
file_names = []

softmax = torch.nn.Softmax(dim=1)

with torch.no_grad():
    for art_num, article in tqdm.tqdm(enumerate(eval_loader)):
        input_ids = article["input_ids"].to(device)
        attent_mask = article["attention_mask"].to(device)
        output = model(input_ids,
                       attention_mask=attent_mask,
                       output_hidden_states=False,
                       output_attentions=False)
        prob = softmax(output[0].detach().cpu()).numpy()
        logit = output[0].detach().cpu().numpy()
        pred = [0 if lgt[0] > lgt[1] else 1 for lgt in logit]
        label = article["labels"].numpy()
        labels.extend(label)
        preds.extend(pred)
        logits.extend(logit)
        probs.extend(prob)
        article_tokens.extend(article["article_tokens"].numpy())
        sources.extend(article["sources"])
        file_names.extend(article["file_names"])

58it [00:31,  1.86it/s]


In [10]:
# Label of 0 is real, 1 is fake
eval_results = pd.DataFrame({"Article": list(range(len(labels))),
                             "Source": sources,
                             "Token_Length": article_tokens,
                             "Predictions": preds,
                             "Labels": labels,
                             "Logit_real(0)": [x[0] for x in logits],
                             "Logit_fake(1)": [x[1] for x in logits],
                             "Prob_real(0)": [x[0] for x in probs],
                             "Prob_real(1)": [x[1] for x in probs],
                             "File_Name": file_names})

eval_results

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,gossipcop,389,0,1,4.118822,-3.870095,0.999661,0.000339,gossipcop-8686267368
1,1,gossipcop,645,0,0,4.164303,-4.039660,0.999726,0.000273,gossipcop-934994
2,2,gossipcop,738,0,0,3.778802,-3.594267,0.999372,0.000628,gossipcop-843576
3,3,gossipcop,616,0,0,3.699242,-3.514220,0.999264,0.000736,gossipcop-930194
4,4,gossipcop,480,1,1,-3.499019,3.575635,0.000846,0.999154,gossipcop-4054055901
...,...,...,...,...,...,...,...,...,...,...
3655,3655,gossipcop,592,1,1,-3.287464,3.315183,0.001355,0.998645,gossipcop-394386553
3656,3656,gossipcop,408,0,0,3.608117,-3.526240,0.999203,0.000797,gossipcop-931949
3657,3657,gossipcop,233,0,0,2.297114,-2.078357,0.987574,0.012426,gossipcop-872770
3658,3658,gossipcop,407,0,0,3.754860,-3.639502,0.999386,0.000614,gossipcop-870644


In [11]:
# Save evaluation data to csv file
eval_results.to_csv("test_data_" + MODEL_FILE[12:-4] + ".csv")

In [12]:
eval_results.query("Predictions == 1")

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
4,4,gossipcop,480,1,1,-3.499019,3.575635,0.000846,0.999154,gossipcop-4054055901
9,9,gossipcop,427,1,1,-2.234159,2.113158,0.012776,0.987224,gossipcop-8953282482
10,10,gossipcop,433,1,1,-3.721581,3.693871,0.000602,0.999398,gossipcop-1170441384
16,16,gossipcop,655,1,1,-2.855064,2.879922,0.003221,0.996780,gossipcop-3998725211
19,19,gossipcop,358,1,1,-3.504628,3.571532,0.000844,0.999156,gossipcop-638030695
...,...,...,...,...,...,...,...,...,...,...
3632,3632,gossipcop,715,1,1,-3.483557,3.394110,0.001029,0.998971,gossipcop-2511047593
3636,3636,politifact,176,1,1,-2.525460,2.501106,0.006519,0.993482,politifact14596
3651,3651,gossipcop,659,1,1,-3.673880,3.829642,0.000551,0.999449,gossipcop-1603768996
3652,3652,gossipcop,404,1,1,-3.587660,3.671566,0.000703,0.999297,gossipcop-2908447234


In [13]:
for x in range(0, 10):
    print(eval_results["Logit_real(0)"][x], eval_results["Logit_fake(1)"][x])

4.11882209777832 -3.8700945377349854
4.164303302764893 -4.0396599769592285
3.7788021564483643 -3.594266891479492
3.699241876602173 -3.5142202377319336
-3.499019145965576 3.5756349563598633
4.174830913543701 -4.098593235015869
3.966257095336914 -3.833629608154297
4.232416152954102 -4.095065593719482
4.329822540283203 -4.283166885375977
-2.234158515930176 2.1131577491760254


In [14]:
# Overall Metrics
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    eval_results.Labels, eval_results.Predictions, average="binary")
accuracy = sklearn.metrics.accuracy_score(eval_results.Labels, eval_results.Predictions)

metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating both gossipcop and poltifact data",
           "model": MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_source_distilbert.pickle",
           "val_data": "eval_data_" + MODEL_FILE[12:-4] + ".csv",
           "eval_data": TEST_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
metrics

{'description': 'Distilbert trained on politifact and gossipcop test data, with encoded titles and weighting.',
 'eval_notes': 'Evaluating both gossipcop and poltifact data',
 'model': 'saved_model_epoch10_20201129_0335.tar',
 'epochs': 10,
 'batch_size': 64,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_source_distilbert.pickle',
 'val_data': 'eval_data_epoch10_20201129_0335.csv',
 'eval_data': 'test_dataset_all_titles_29Nov_distilbert.pickle',
 'accuracy': 0.830327868852459,
 'precision': 0.6507317073170732,
 'recall': 0.7172043010752688,
 'f1': 0.6823529411764706}

In [15]:
# Save metrics to JSON text file.
with open("eval_metrics_" + MODEL_FILE[12:-4] + ".json", "wt") as jfile:
    json.dump(metrics, jfile)

In [18]:
sum(preds)

1025

In [19]:
cm = pd.DataFrame(sklearn.metrics.confusion_matrix(eval_results.Labels, eval_results.Predictions))
label_titles = {0: "real", 1: "fake"}
cm.rename(index=label_titles, columns=label_titles, inplace=True)
cm

Unnamed: 0,real,fake
real,2372,358
fake,263,667
