# Split Dataset Using Topic Chooser
#### 29 Nov 2020

In [1]:
# Evaluation parameters.
# Hugging Face identifier passed to from_pretrained for the sorter model.
MODEL_NAME = "distilbert-base-uncased"
# Checkpoint file holding the saved sorter weights.
MODEL_FILE = "sorter_saved_model_epoch5_20201126_1904.tar"
# Pickled dataset that the sorter model classifies.
SORT_DATASET = "test_dataset_all_titles_29Nov_distilbert.pickle"
# Pickled dataset that is split according to the sorter's decisions.
TEST_DATASET = "test_dataset_all_titles_29Nov_roberta.pickle"

In [2]:
import copy
import itertools
import pickle
import sys

import json
import pandas as pd
import pickle
import tqdm
import torch
import torch.utils.data
import transformers as hft

sys.path.insert(0, "/home/jupyter")
import util.log
import util.data

In [3]:
# Read the pickled dataset that the sorter model will classify.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(SORT_DATASET, mode="rb") as sort_file:
    dataset = pickle.load(sort_file)

In [4]:
# Load the saved sorter checkpoint.
# map_location="cpu" lets a checkpoint that was written from a GPU be restored
# on a CPU-only machine; a later cell moves the model to the selected device.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
checkpoint = torch.load(MODEL_FILE, map_location="cpu")

# Build the architecture from the pretrained base model. from_pretrained warns
# that the classification head is newly initialised; those weights are
# replaced by the saved ones below.
model = hft.DistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

# Left as the last expression so the key-matching report is shown as output.
model.load_state_dict(checkpoint["model"])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

<All keys matched successfully>

In [5]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the model on that device and switch it to evaluation mode.
model.to(device)
model.eval()

# Show which device was selected.
device

device(type='cuda')

In [6]:
# Run the sorter over every article and record its file name under the topic
# the model prefers.
political_files = []
gossip_files = []

with torch.no_grad():
    # Iterate the dataset directly rather than through enumerate(): the index
    # was never used, and passing the length lets tqdm show a total and an ETA.
    for article in tqdm.tqdm(dataset, total=len(dataset)):
        input_ids = article["input_ids"].to(device)
        attent_mask = article["attention_mask"].to(device)
        # unsqueeze(0) adds a leading batch dimension of size 1.
        output = model(input_ids.unsqueeze(0),
                       attention_mask=attent_mask.unsqueeze(0),
                       output_hidden_states=False,
                       output_attentions=False)
        logit = output[0].squeeze(0).detach().cpu().numpy()
        # Index 0 is treated as the gossip score and index 1 as the political
        # score; a tie is counted as political.
        if logit[0] > logit[1]:
            gossip_files.append(article["file_names"])
        else:
            political_files.append(article["file_names"])

3660it [00:40, 91.06it/s]


In [7]:
# Read the pickled dataset that will be divided by topic.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(TEST_DATASET, mode="rb") as test_file:
    test_dataset = pickle.load(test_file)

In [8]:
# Build the political subset of the dataset being split, using the file names
# the sorter placed in political_files.
# NOTE(review): assumes subset_FNDataset selects entries by file name - confirm in util.data.
polit_data = util.data.subset_FNDataset(test_dataset, political_files)

In [9]:
# Build the gossip subset of the dataset being split, using the file names
# the sorter placed in gossip_files.
# NOTE(review): assumes subset_FNDataset selects entries by file name - confirm in util.data.
gossip_data = util.data.subset_FNDataset(test_dataset, gossip_files)

In [10]:
# Sanity check: the two subsets together must hold as many items as the
# dataset that was sorted. The message reports the sizes when the check fails.
assert len(polit_data) + len(gossip_data) == len(dataset), (
    f"subset sizes {len(polit_data)} + {len(gossip_data)} "
    f"do not add up to the {len(dataset)} sorted items"
)

In [11]:
# Number of items in the political subset.
len(polit_data)

136

In [12]:
# Number of items in the gossip subset.
len(gossip_data)

3524

In [13]:
# Preview the first 20 file names in the political subset. The recorded output
# contains a few "gossipcop-" names, so the sorter's choice does not always
# agree with the prefix of the file name.
polit_data.file_names[:20]

['gossipcop-9324384637',
 'politifact8737',
 'gossipcop-889110',
 'politifact15205',
 'politifact14490',
 'politifact15270',
 'politifact14621',
 'gossipcop-863027',
 'politifact13766',
 'politifact8611',
 'politifact1783',
 'gossipcop-7262259263',
 'gossipcop-856597',
 'politifact14071',
 'politifact546',
 'politifact13052',
 'politifact14211',
 'politifact2881',
 'politifact239',
 'politifact1135']

In [14]:
# Preview the first 20 file names in the gossip subset.
gossip_data.file_names[:20]

['gossipcop-9843277966',
 'gossipcop-7642653086',
 'gossipcop-928427',
 'gossipcop-953132',
 'gossipcop-2045311114',
 'gossipcop-881389',
 'gossipcop-2389938114',
 'gossipcop-869038',
 'gossipcop-915948',
 'gossipcop-919444',
 'gossipcop-890617',
 'gossipcop-882273',
 'gossipcop-854882',
 'gossipcop-849046',
 'gossipcop-940508',
 'gossipcop-854448',
 'gossipcop-892283',
 'gossipcop-931140',
 'gossipcop-952734',
 'gossipcop-943331']

In [15]:
# Write the political subset's file names to disk.
with open("political_filenames_test.pickle", mode="wb") as names_out:
    pickle.dump(polit_data.file_names, names_out)

In [16]:
# Write the gossip subset's file names to disk.
with open("gossip_filenames_test.pickle", mode="wb") as names_out:
    pickle.dump(gossip_data.file_names, names_out)

In [17]:
# Write the full gossip subset to disk.
with open("sorted_dataset_gc_roberta_29Nov.pickle", mode="wb") as subset_out:
    pickle.dump(gossip_data, subset_out)

In [18]:
# Write the full political subset to disk.
with open("sorted_dataset_pf_roberta_29Nov.pickle", mode="wb") as subset_out:
    pickle.dump(polit_data, subset_out)