# Evaulate Roberta Models with Sorted Data - Political
### Includes Titles

In [19]:
import json
import pickle
import sys

import pandas as pd
import sklearn.metrics
import tqdm
import torch
import transformers as hft

sys.path.insert(0, "/home/jupyter")
import util.log
import util.data

In [20]:
MODEL_NAME = "roberta-base"
GC_MODEL_FILE ="gc_saved_model_epoch10_20201127_2032.tar"
PF_MODEL_FILE = "pf_saved_model_epoch10_20201129_0424.tar"
GC_TEST_DATASET = "sorted_dataset_gc_roberta_29Nov.pickle"
PF_TEST_DATASET = "sorted_dataset_pf_roberta_29Nov.pickle"
EVAL_BATCH_SIZE = 16

DESCRIPTION = "Roberta trained on politifact or gossipcop test data, with encoded titles and weighting, data machine sorted."

In [22]:
with open(PF_TEST_DATASET, "rb") as pfile:
    pf_test_dataset = pickle.load(pfile)

In [23]:
checkpoint = torch.load(PF_MODEL_FILE)
model = (hft.RobertaForSequenceClassification
         .from_pretrained(MODEL_NAME))
model.load_state_dict(checkpoint["model"])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

<All keys matched successfully>

In [24]:
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
model.to(device)
model.eval();

In [25]:
eval_loader = torch.utils.data.DataLoader(gc_test_dataset,
                                batch_size=EVAL_BATCH_SIZE,
                                shuffle=False)

In [26]:
labels = []
preds = []
logits = []
probs = []
sources = []
article_tokens = []
file_names = []

softmax = torch.nn.Softmax(dim=1)

with torch.no_grad():
    for art_num, article in tqdm.tqdm(enumerate(eval_loader)):
        input_ids = article["input_ids"].to(device)
        attent_mask = article["attention_mask"].to(device)
        output = model(input_ids,
                       attention_mask=attent_mask,
                       output_hidden_states=False,
                       output_attentions=False)
        prob = softmax(output[0].detach().cpu()).numpy()
        logit = output[0].detach().cpu().numpy()
        pred = [0 if lgt[0] > lgt[1] else 1 for lgt in logit]
        label = article["labels"].numpy()
        labels.extend(label)
        preds.extend(pred)
        logits.extend(logit)
        probs.extend(prob)
        article_tokens.extend(article["article_tokens"].numpy())
        sources.extend(article["sources"])
        file_names.extend(article["file_names"])

9it [00:02,  3.64it/s]


In [27]:
# Label of 0 is real, 1 is fake
eval_results = pd.DataFrame({"Article": list(range(len(labels))),
                             "Source": sources,
                             "Token_Length": article_tokens,
                             "Predictions": preds,
                             "Labels": labels,
                             "Logit_real(0)": [x[0] for x in logits],
                             "Logit_fake(1)": [x[1] for x in logits],
                             "Prob_real(0)": [x[0] for x in probs],
                             "Prob_real(1)": [x[1] for x in probs],
                             "File_Name": file_names})

eval_results

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,gossipcop,281,1,1,-3.197902,2.537999,0.003218,0.996782,gossipcop-9324384637
1,1,politifact,15506,0,0,4.058324,-4.163477,0.999731,0.000269,politifact8737
2,2,gossipcop,351,1,0,-1.782905,1.667640,0.030753,0.969247,gossipcop-889110
3,3,politifact,218,1,1,-4.291286,3.983364,0.000255,0.999745,politifact15205
4,4,politifact,145,1,1,-2.897077,2.571214,0.004201,0.995799,politifact14490
...,...,...,...,...,...,...,...,...,...,...
131,131,gossipcop,22648,0,0,4.291415,-4.294261,0.999813,0.000187,gossipcop-866971
132,132,gossipcop,220,0,0,3.243027,-3.369665,0.998659,0.001341,gossipcop-897119
133,133,politifact,410,1,1,-4.776204,4.671124,0.000079,0.999921,politifact14222
134,134,politifact,277,0,0,4.147800,-4.251074,0.999775,0.000225,politifact669


In [29]:
# Save evaluation data to csv file
eval_results.to_csv("test_data_PF.csv")

In [30]:
eval_results.query("Predictions == 1")

Unnamed: 0,Article,Source,Token_Length,Predictions,Labels,Logit_real(0),Logit_fake(1),Prob_real(0),Prob_real(1),File_Name
0,0,gossipcop,281,1,1,-3.197902,2.537999,0.003218,0.996782,gossipcop-9324384637
2,2,gossipcop,351,1,0,-1.782905,1.667640,0.030753,0.969247,gossipcop-889110
3,3,politifact,218,1,1,-4.291286,3.983364,0.000255,0.999745,politifact15205
4,4,politifact,145,1,1,-2.897077,2.571214,0.004201,0.995799,politifact14490
5,5,politifact,376,1,1,-4.651823,4.536079,0.000102,0.999898,politifact15270
...,...,...,...,...,...,...,...,...,...,...
123,123,gossipcop,319,1,1,-4.661384,4.604560,0.000095,0.999905,gossipcop-8998543910
125,125,gossipcop,399,1,1,-0.617848,0.207708,0.304586,0.695414,gossipcop-8606143021
127,127,politifact,260,1,1,-4.686815,4.649739,0.000088,0.999912,politifact14342
128,128,politifact,276,1,1,-4.543458,4.390044,0.000132,0.999868,politifact14876


In [31]:
for x in range(0, 10):
    print(eval_results["Logit_real(0)"][x], eval_results["Logit_fake(1)"][x])

-3.1979024410247803 2.537999153137207
4.058324337005615 -4.163477420806885
-1.782905101776123 1.667639970779419
-4.291285991668701 3.9833643436431885
-2.8970773220062256 2.571213960647583
-4.651822566986084 4.536078929901123
-4.691722393035889 4.678485870361328
0.20492489635944366 -0.5231598019599915
-4.68992280960083 4.668687343597412
3.4687769412994385 -3.818591594696045


In [33]:
# Overall Metrics
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    eval_results.Labels, eval_results.Predictions, average="binary")
accuracy = sklearn.metrics.accuracy_score(eval_results.Labels, eval_results.Predictions)

metrics = {"description": DESCRIPTION,
           "eval_notes": "Evaluating both gossipcop and poltifact data",
           "model": PF_MODEL_FILE,
           "epochs": checkpoint["epoch"],
           "batch_size": EVAL_BATCH_SIZE,
           "Date_evaluated": "25Nov2020",
           "train_data": "train_dataset_25Nov_source_distilbert.pickle",
           "val_data": "eval_data_" + PF_MODEL_FILE[12:-4] + ".csv",
           "eval_data": PF_TEST_DATASET,
           "accuracy": accuracy,
           "precision": precision,
           "recall": recall,
           "f1": f1}
metrics

{'description': 'Roberta trained on politifact or gossipcop test data, with encoded titles and weighting, data machine sorted.',
 'eval_notes': 'Evaluating both gossipcop and poltifact data',
 'model': 'pf_saved_model_epoch10_20201129_0424.tar',
 'epochs': 10,
 'batch_size': 16,
 'Date_evaluated': '25Nov2020',
 'train_data': 'train_dataset_25Nov_source_distilbert.pickle',
 'val_data': 'eval_data_el_epoch10_20201129_0424.csv',
 'eval_data': 'sorted_dataset_pf_roberta_29Nov.pickle',
 'accuracy': 0.8897058823529411,
 'precision': 0.819672131147541,
 'recall': 0.9259259259259259,
 'f1': 0.8695652173913043}

In [34]:
# Save metrics to JSON text file.
with open("eval_metrics_GC.json", "wt") as jfile:
    json.dump(metrics, jfile)

In [35]:
sum(preds)

61

In [36]:
cm = pd.DataFrame(sklearn.metrics.confusion_matrix(eval_results.Labels, eval_results.Predictions))
label_titles = {0: "real", 1: "fake"}
cm.rename(index=label_titles, columns=label_titles, inplace=True)
cm

Unnamed: 0,real,fake
real,71,11
fake,4,50
