# Evaulate Roberta Model on Politifact Test Data
### 1 Dec 2020

## I. Evaluation Parameters

In [1]:
# Set Evaulation Parameters
MODEL_NAME = "roberta-base"
MODEL_FILE ="saved_model_epoch10_20201129_0424.tar"
TEST_DATASET = "test_data_1Dec_roberta_pf.pickle"
EVAL_BATCH_SIZE = 16
DESCRIPTION = "PolitiFact only Roberta model on test data"

## II. Setup and Load Model

In [2]:
import pickle
import sys

import json
import pandas as pd
import pickle
import sklearn.metrics
import tqdm
import torch
import transformers as hft

sys.path.insert(0, "/home/jupyter")
import util.log
import util.data

In [8]:
# Load evaluation data
with open(TEST_DATASET, "rb") as vfile:
    test_dataset = pickle.load(vfile)

In [5]:
# Load saved model
checkpoint = torch.load(MODEL_FILE)
model = (hft.RobertaForSequenceClassification
         .from_pretrained(MODEL_NAME))
model.load_state_dict(checkpoint["model"])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

<All keys matched successfully>

In [6]:
# Move model to GPU if available
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
model.to(device)
model.eval();
device

device(type='cuda')

## III. Evaluation Loop and Results Dataframe

In [9]:
# Evaulate model
eval_loader = torch.utils.data.DataLoader(test_dataset,
                                batch_size=EVAL_BATCH_SIZE,
                                shuffle=False)
labels = []
preds = []
logits = []
probs = []
sources = []
article_tokens = []
file_names = []

softmax = torch.nn.Softmax(dim=1)

with torch.no_grad():
    for art_num, article in tqdm.tqdm(enumerate(eval_loader)):
        input_ids = article["input_ids"].to(device)
        attent_mask = article["attention_mask"].to(device)
        output = model(input_ids,
                       attention_mask=attent_mask,
                       output_hidden_states=False,
                       output_attentions=False)
        prob = softmax(output[0].detach().cpu()).numpy()
        logit = output[0].detach().cpu().numpy()
        pred = [0 if lgt[0] > lgt[1] else 1 for lgt in logit]
        label = article["labels"].numpy()
        labels.extend(label)
        preds.extend(pred)
        logits.extend(logit)
        probs.extend(prob)
        article_tokens.extend(article["article_tokens"].numpy())
        sources.extend(article["sources"])
        file_names.extend(article["file_names"])

8it [00:05,  1.37it/s]


In [10]:
# Label of 0 is real, 1 is fake
eval_results = pd.DataFrame({"Article": list(range(len(labels))),
                             "Source": sources,
                             "Token_Length": article_tokens,
                             "Predictions": preds,
                             "Labels": labels,
                             "Logit_real(0)": [x[0] for x in logits],
                             "Logit_fake(1)": [x[1] for x in logits],
                             "Prob_real(0)": [x[0] for x in probs],
                             "Prob_real(1)": [x[1] for x in probs],
                             "File_Name": file_names})

eval_results

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,politifact,1163,0,0,3.827148,-3.899718,0.999559,0.000441,politifact35
1,1,politifact,1442,1,0,-3.737876,3.626107,0.000633,0.999367,politifact4028
2,2,politifact,505,0,0,4.127038,-4.195304,0.999757,0.000243,politifact567
3,3,politifact,260,1,1,-4.686815,4.649739,0.000088,0.999912,politifact14342
4,4,politifact,506,1,1,-4.678712,4.526700,0.000100,0.999900,politifact15427
...,...,...,...,...,...,...,...,...,...,...
119,119,politifact,145,1,1,-3.874329,3.699367,0.000514,0.999486,politifact15108
120,120,politifact,11333,0,0,4.112895,-4.151903,0.999743,0.000257,politifact2881
121,121,politifact,19813,0,0,4.220101,-4.236417,0.999788,0.000212,politifact809
122,122,politifact,587,0,0,3.772211,-4.076408,0.999610,0.000390,politifact13305


In [11]:
# Save evaluation data to csv file
eval_results.to_csv("test_data.csv")

## IV. Overall Metrics

In [12]:
# Overall metrics
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    eval_results.Labels, eval_results.Predictions, average="binary")
accuracy = sklearn.metrics.accuracy_score(eval_results.Labels, eval_results.Predictions)

metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating PolitiFact data",
           "model": MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "1Dec2020",
           "train_data": "train_data_28Nov_roberta_pf.pickle",
           "test_data": TEST_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
metrics

{'description': 'PolitiFact only Roberta model on test data',
 'eval_notes': 'Evaluating PolitiFact data',
 'model': 'saved_model_epoch10_20201129_0424.tar',
 'epochs': 10,
 'batch_size': 16,
 'Date_evaluated': '1Dec2020',
 'train_data': 'train_data_28Nov_roberta_pf.pickle',
 'test_data': 'test_data_1Dec_roberta_pf.pickle',
 'accuracy': 0.9435483870967742,
 'precision': 0.8870967741935484,
 'recall': 1.0,
 'f1': 0.9401709401709402}

In [13]:
# Save metrics to JSON text file.
with open("test_metrics.json", "wt") as jfile:
    json.dump(metrics, jfile)

In [14]:
cm = pd.DataFrame(sklearn.metrics.confusion_matrix(eval_results.Labels, eval_results.Predictions))
label_titles = {0: "gossip", 1: "political"}
cm.rename(index=label_titles, columns=label_titles, inplace=True)
cm

Unnamed: 0,gossip,political
gossip,62,7
political,0,55
