# Evaulate Roberta Models with Sorted Data
### Includes Titles

In [1]:
import json
import pickle
import sys

import pandas as pd
import sklearn.metrics
import tqdm
import torch
import transformers as hft

sys.path.insert(0, "/home/jupyter")
import util.log
import util.data

In [2]:
MODEL_NAME = "roberta-base"
GC_MODEL_FILE ="gc_saved_model_epoch10_20201127_2032.tar"
PF_MODEL_FILE = "pf_saved_model_epoch10_20201129_0424.tar"
GC_TEST_DATASET = "sorted_dataset_gc_roberta_29Nov.pickle"
PF_TEST_DATASET = "sorted_dataset_pf_roberta_29Nov.pickle"
EVAL_BATCH_SIZE = 16

DESCRIPTION = "Roberta trained on politifact or gossipcop test data, with encoded titles and weighting, data machine sorted."

In [3]:
with open(GC_TEST_DATASET, "rb") as pfile:
    gc_test_dataset = pickle.load(pfile)

In [4]:
checkpoint = torch.load(GC_MODEL_FILE)
model = (hft.RobertaForSequenceClassification
         .from_pretrained(MODEL_NAME))
model.load_state_dict(checkpoint["model"])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

<All keys matched successfully>

In [5]:
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
model.to(device)
model.eval();

In [6]:
eval_loader = torch.utils.data.DataLoader(gc_test_dataset,
                                batch_size=EVAL_BATCH_SIZE,
                                shuffle=False)

In [7]:
labels = []
preds = []
logits = []
probs = []
sources = []
article_tokens = []
file_names = []

softmax = torch.nn.Softmax(dim=1)

with torch.no_grad():
    for art_num, article in tqdm.tqdm(enumerate(eval_loader)):
        input_ids = article["input_ids"].to(device)
        attent_mask = article["attention_mask"].to(device)
        output = model(input_ids,
                       attention_mask=attent_mask,
                       output_hidden_states=False,
                       output_attentions=False)
        prob = softmax(output[0].detach().cpu()).numpy()
        logit = output[0].detach().cpu().numpy()
        pred = [0 if lgt[0] > lgt[1] else 1 for lgt in logit]
        label = article["labels"].numpy()
        labels.extend(label)
        preds.extend(pred)
        logits.extend(logit)
        probs.extend(prob)
        article_tokens.extend(article["article_tokens"].numpy())
        sources.extend(article["sources"])
        file_names.extend(article["file_names"])

221it [01:03,  3.48it/s]


In [8]:
# Label of 0 is real, 1 is fake
eval_results = pd.DataFrame({"Article": list(range(len(labels))),
                             "Source": sources,
                             "Token_Length": article_tokens,
                             "Predictions": preds,
                             "Labels": labels,
                             "Logit_real(0)": [x[0] for x in logits],
                             "Logit_fake(1)": [x[1] for x in logits],
                             "Prob_real(0)": [x[0] for x in probs],
                             "Prob_real(1)": [x[1] for x in probs],
                             "File_Name": file_names})

eval_results

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,gossipcop,584,1,1,-2.757137,2.776392,0.003936,0.996064,gossipcop-9843277966
1,1,gossipcop,641,1,1,-2.756062,2.775949,0.003942,0.996058,gossipcop-7642653086
2,2,gossipcop,330,0,0,2.424183,-2.559280,0.993196,0.006804,gossipcop-928427
3,3,gossipcop,1573,0,0,2.443434,-2.575524,0.993432,0.006568,gossipcop-953132
4,4,gossipcop,1227,1,1,-2.613986,2.674522,0.005024,0.994976,gossipcop-2045311114
...,...,...,...,...,...,...,...,...,...,...
3519,3519,gossipcop,332,0,0,2.445893,-2.586979,0.993522,0.006478,gossipcop-946711
3520,3520,gossipcop,258,0,0,2.442909,-2.570607,0.993396,0.006604,gossipcop-917098
3521,3521,gossipcop,705,0,0,2.439296,-2.586404,0.993476,0.006524,gossipcop-931388
3522,3522,gossipcop,553,1,1,-2.716794,2.750360,0.004205,0.995795,gossipcop-5713918011


In [9]:
# Save evaluation data to csv file
eval_results.to_csv("test_data_GC.csv")

In [10]:
eval_results.query("Predictions == 1")

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,gossipcop,584,1,1,-2.757137,2.776392,0.003936,0.996064,gossipcop-9843277966
1,1,gossipcop,641,1,1,-2.756062,2.775949,0.003942,0.996058,gossipcop-7642653086
4,4,gossipcop,1227,1,1,-2.613986,2.674522,0.005024,0.994976,gossipcop-2045311114
26,26,gossipcop,455,1,1,-2.757249,2.776614,0.003935,0.996065,gossipcop-3136276770
27,27,gossipcop,424,1,0,-2.550592,2.624370,0.005625,0.994375,gossipcop-858455
...,...,...,...,...,...,...,...,...,...,...
3508,3508,gossipcop,489,1,0,-0.170699,0.273740,0.390684,0.609316,gossipcop-927391
3510,3510,gossipcop,163,1,0,-0.085287,0.171870,0.436063,0.563937,gossipcop-876798
3512,3512,gossipcop,644,1,1,-2.743232,2.769320,0.004020,0.995980,gossipcop-2900315313
3514,3514,gossipcop,426,1,1,-2.740091,2.766339,0.004044,0.995956,gossipcop-4409997082


In [11]:
for x in range(0, 10):
    print(eval_results["Logit_real(0)"][x], eval_results["Logit_fake(1)"][x])

-2.7571372985839844 2.7763917446136475
-2.7560620307922363 2.7759487628936768
2.4241833686828613 -2.559279680252075
2.4434337615966797 -2.575523853302002
-2.6139862537384033 2.6745223999023438
2.431814432144165 -2.576071262359619
2.2687695026397705 -2.331120252609253
2.4352691173553467 -2.536878824234009
2.449446439743042 -2.575528144836426
1.6820950508117676 -1.6644458770751953


In [12]:
# Overall Metrics
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    eval_results.Labels, eval_results.Predictions, average="binary")
accuracy = sklearn.metrics.accuracy_score(eval_results.Labels, eval_results.Predictions)

metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating both gossipcop and poltifact data",
           "model": GC_MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_source_distilbert.pickle",
           "val_data": "eval_data_" + GC_MODEL_FILE[12:-4] + ".csv",
           "eval_data": GC_TEST_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
metrics

{'description': 'Roberta trained on politifact or gossipcop test data, with encoded titles and weighting, data machine sorted.',
 'eval_notes': 'Evaluating both gossipcop and poltifact data',
 'model': 'gc_saved_model_epoch10_20201127_2032.tar',
 'epochs': 10,
 'batch_size': 16,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_source_distilbert.pickle',
 'val_data': 'eval_data_el_epoch10_20201127_2032.csv',
 'eval_data': 'sorted_dataset_gc_roberta_29Nov.pickle',
 'accuracy': 0.854994324631101,
 'precision': 0.7272727272727273,
 'recall': 0.6666666666666666,
 'f1': 0.6956521739130435}

In [13]:
# Save metrics to JSON text file.
with open("eval_metrics_GC.json", "wt") as jfile:
    json.dump(metrics, jfile)

In [16]:
sum(preds)

803

In [14]:
cm = pd.DataFrame(sklearn.metrics.confusion_matrix(eval_results.Labels, eval_results.Predictions))
label_titles = {0: "real", 1: "fake"}
cm.rename(index=label_titles, columns=label_titles, inplace=True)
cm

Unnamed: 0,real,fake
real,2429,219
fake,292,584
