# Evaulate Distilbert Model
### 25 Nov 2020

## I. Evaluation Parameters

In [1]:
# Set Evaulation Parameters
MODEL_NAME = "distilbert-base-uncased"
MODEL_FILE ="saved_model_epoch10_20201126_1415.tar"
VAL_DATASET = "val_dataset_25Nov_gcpf_distilbert.pickle"
EVAL_BATCH_SIZE = 32
DESCRIPTION = "Distilbert trained on politifact and gossipcop training data."

## II. Setup and Load Model

In [2]:
import pickle
import sys

import json
import pandas as pd
import pickle
import sklearn.metrics
import tqdm
import torch
import transformers as hft

sys.path.insert(0, "/home/jupyter")
import util.log
import util.data

In [3]:
# Load evaluation data
with open(VAL_DATASET, "rb") as vfile:
    val_dataset = pickle.load(vfile)

In [4]:
# Load saved model
checkpoint = torch.load(MODEL_FILE)
model = (hft.DistilBertForSequenceClassification
         .from_pretrained(MODEL_NAME))
model.load_state_dict(checkpoint["model"])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

<All keys matched successfully>

In [5]:
# Move model to GPU if available
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
model.to(device)
model.eval();
device

device(type='cuda')

## III. Evaluation Loop and Results Dataframe

In [6]:
# Evaulate model
eval_loader = torch.utils.data.DataLoader(val_dataset,
                                batch_size=EVAL_BATCH_SIZE,
                                shuffle=False)
labels = []
preds = []
logits = []
probs = []
sources = []
article_tokens = []
file_names = []

softmax = torch.nn.Softmax(dim=1)

with torch.no_grad():
    for art_num, article in tqdm.tqdm(enumerate(eval_loader)):
        input_ids = article["input_ids"].to(device)
        attent_mask = article["attention_mask"].to(device)
        output = model(input_ids,
                       attention_mask=attent_mask,
                       output_hidden_states=False,
                       output_attentions=False)
        prob = softmax(output[0].detach().cpu()).numpy()
        logit = output[0].detach().cpu().numpy()
        pred = [0 if lgt[0] > lgt[1] else 1 for lgt in logit]
        label = article["labels"].numpy()
        labels.extend(label)
        preds.extend(pred)
        logits.extend(logit)
        probs.extend(prob)
        article_tokens.extend(article["article_tokens"].numpy())
        sources.extend(article["sources"])
        file_names.extend(article["file_names"])

115it [00:31,  3.68it/s]


In [7]:
# Label of 0 is real, 1 is fake
eval_results = pd.DataFrame({"Article": list(range(len(labels))),
                             "Source": sources,
                             "Token_Length": article_tokens,
                             "Predictions": preds,
                             "Labels": labels,
                             "Logit_real(0)": [x[0] for x in logits],
                             "Logit_fake(1)": [x[1] for x in logits],
                             "Prob_real(0)": [x[0] for x in probs],
                             "Prob_real(1)": [x[1] for x in probs],
                             "File_Name": file_names})

eval_results

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,gossipcop,1366,0,0,4.722003,-4.976188,0.999939,0.000061,gossipcop-848731
1,1,gossipcop,509,0,0,3.600271,-3.713171,0.999334,0.000666,gossipcop-944840
2,2,gossipcop,298,0,1,3.897395,-4.097625,0.999663,0.000337,gossipcop-1267637959
3,3,gossipcop,1493,1,0,-0.645965,0.517232,0.238087,0.761913,gossipcop-909119
4,4,gossipcop,151,0,0,4.753866,-5.004924,0.999942,0.000058,gossipcop-866623
...,...,...,...,...,...,...,...,...,...,...
3658,3658,gossipcop,670,1,1,-0.558948,0.280135,0.301728,0.698272,gossipcop-6231788357
3659,3659,gossipcop,218,0,0,4.796472,-5.073880,0.999948,0.000052,gossipcop-896655
3660,3660,gossipcop,10233,1,0,-0.745799,0.738307,0.184808,0.815192,gossipcop-856760
3661,3661,gossipcop,459,0,0,4.764915,-5.085905,0.999947,0.000053,gossipcop-854426


In [9]:
# Save evaluation data to csv file
eval_results.to_csv("eval_data_" + MODEL_FILE[12:-4] + ".csv")

## IV. Overall Metrics

In [8]:
# Overall metrics
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    labels, preds, average="binary")
accuracy = sklearn.metrics.accuracy_score(labels, preds)

metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating both gossipcop and poltifact data",
           "model": MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_gcpf_distilbert.pickle",
           "val_data": "eval_data_" + MODEL_FILE[12:-4] + ".csv",
           "eval_data": VAL_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
metrics

{'description': 'Distilbert trained on politifact and gossipcop training data.',
 'eval_notes': 'Evaluating both gossipcop and poltifact data',
 'model': 'saved_model_epoch10_20201126_1415.tar',
 'epochs': 10,
 'batch_size': 32,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_gcpf_distilbert.pickle',
 'val_data': 'eval_data_epoch10_20201126_1415.csv',
 'eval_data': 'val_dataset_25Nov_gcpf_distilbert.pickle',
 'accuracy': 0.8555828555828556,
 'precision': 0.7588555858310627,
 'recall': 0.6127612761276128,
 'f1': 0.6780279975654291}

In [11]:
# Save metrics to JSON text file.
with open("eval_metrics_" + MODEL_FILE[12:-4] + ".json", "wt") as jfile:
    json.dump(metrics, jfile)

## V. Evaluate on Gossipcop Only

In [9]:
# Gossipcop evaluation
gc_results = eval_results.query("Source == 'gossipcop'")
filtered_labels = gc_results.Labels
filtered_preds = gc_results.Predictions
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    filtered_labels, filtered_preds, average="binary")
accuracy = sklearn.metrics.accuracy_score(filtered_labels, filtered_preds)

gc_metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating on gossipcop data only (model trained on both).",
           "model": MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_gcpf_distilbert.pickle",
           "val_data": "eval_data_" + MODEL_FILE[12:-4] + ".csv",
           "eval_data": VAL_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
gc_metrics

{'description': 'Distilbert trained on politifact and gossipcop training data.',
 'eval_notes': 'Evaluating on gossipcop data only (model trained on both).',
 'model': 'saved_model_epoch10_20201126_1415.tar',
 'epochs': 10,
 'batch_size': 32,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_gcpf_distilbert.pickle',
 'val_data': 'eval_data_epoch10_20201126_1415.csv',
 'eval_data': 'val_dataset_25Nov_gcpf_distilbert.pickle',
 'accuracy': 0.8587634713556438,
 'precision': 0.7489239598278336,
 'recall': 0.6177514792899408,
 'f1': 0.6770428015564202}

In [13]:
# Save metrics to JSON text file.
with open("eval_metrics_gc_" + MODEL_FILE[12:-4] + ".json", "wt") as jfile:
    json.dump(gc_metrics, jfile)

## VI. Evaluate on Politifact Only

In [10]:
# Politifact Metrics
pf_results = eval_results.query("Source == 'politifact'")
filtered_labels = pf_results.Labels
filtered_preds = pf_results.Predictions
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    filtered_labels, filtered_preds, average="binary")
accuracy = sklearn.metrics.accuracy_score(filtered_labels, filtered_preds)

pf_metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating on politifact data only (model trained on both).",
           "model": MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_gcpf_distilbert.pickle",
           "val_data": "eval_data_" + MODEL_FILE[12:-4] + ".csv",
           "eval_data": VAL_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
pf_metrics

{'description': 'Distilbert trained on politifact and gossipcop training data.',
 'eval_notes': 'Evaluating on politifact data only (model trained on both).',
 'model': 'saved_model_epoch10_20201126_1415.tar',
 'epochs': 10,
 'batch_size': 32,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_gcpf_distilbert.pickle',
 'val_data': 'eval_data_epoch10_20201126_1415.csv',
 'eval_data': 'val_dataset_25Nov_gcpf_distilbert.pickle',
 'accuracy': 0.7737226277372263,
 'precision': 0.9459459459459459,
 'recall': 0.546875,
 'f1': 0.6930693069306929}

In [15]:
# Save metrics to JSON text file.
with open("eval_metrics_pf_" + MODEL_FILE[12:-4] + ".json", "wt") as jfile:
    json.dump(pf_metrics, jfile)

## VII. Articles <= 512 Tokens

In [11]:
# Short Articles
short_results = eval_results.query("Token_Length <= 512")
filtered_labels = short_results.Labels
filtered_preds = short_results.Predictions
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    filtered_labels, filtered_preds, average="binary")
accuracy = sklearn.metrics.accuracy_score(filtered_labels, filtered_preds)

short_metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating on short articles only.",
           "model": MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_gcpf_distilbert.pickle",
           "val_data": "eval_data_" + MODEL_FILE[12:-4] + ".csv",
           "eval_data": VAL_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
short_metrics

{'description': 'Distilbert trained on politifact and gossipcop training data.',
 'eval_notes': 'Evaluating on short articles only.',
 'model': 'saved_model_epoch10_20201126_1415.tar',
 'epochs': 10,
 'batch_size': 32,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_gcpf_distilbert.pickle',
 'val_data': 'eval_data_epoch10_20201126_1415.csv',
 'eval_data': 'val_dataset_25Nov_gcpf_distilbert.pickle',
 'accuracy': 0.8677130044843049,
 'precision': 0.8092643051771117,
 'recall': 0.6414686825053996,
 'f1': 0.7156626506024095}

In [17]:
# Save metrics to JSON text file.
with open("eval_metrics_short_" + MODEL_FILE[12:-4] + ".json", "wt") as jfile:
    json.dump(short_metrics, jfile)

## VIII. Articles > 512 Tokens

In [12]:
# Long Articles
long_results = eval_results.query("Token_Length > 512")
filtered_labels = long_results.Labels
filtered_preds = long_results.Predictions
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    filtered_labels, filtered_preds, average="binary")
accuracy = sklearn.metrics.accuracy_score(filtered_labels, filtered_preds)

long_metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating on long articles only.",
           "model": MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_gcpf_distilbert.pickle",
           "val_data": "eval_data_" + MODEL_FILE[12:-4] + ".csv",
           "eval_data": VAL_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
long_metrics

{'description': 'Distilbert trained on politifact and gossipcop training data.',
 'eval_notes': 'Evaluating on long articles only.',
 'model': 'saved_model_epoch10_20201126_1415.tar',
 'epochs': 10,
 'batch_size': 32,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_gcpf_distilbert.pickle',
 'val_data': 'eval_data_epoch10_20201126_1415.csv',
 'eval_data': 'val_dataset_25Nov_gcpf_distilbert.pickle',
 'accuracy': 0.8440659925492283,
 'precision': 0.7084468664850136,
 'recall': 0.5829596412556054,
 'f1': 0.6396063960639606}

In [19]:
# Save metrics to JSON text file.
with open("eval_metrics_long_" + MODEL_FILE[12:-4] + ".json", "wt") as jfile:
    json.dump(long_metrics, jfile)