In [1]:
import pandas as pd
from nltk import WhitespaceTokenizer
from nltk.corpus import stopwords, words, wordnet
from nltk.lm import Vocabulary
from collections import OrderedDict
import numpy as np
import torch
from torch import nn
import concurrent.futures
from torch.optim import Adam
from data_loading import process_text_df, NewsText, tensorize_sentences, collate_fn
from tqdm import tqdm as pbar
from models import NewsNet
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import tensor
import matplotlib.pyplot as plt
import tqdm
stopwords = stopwords.words()
words = words.words() 
wordnet = wordnet.words()

In [2]:
fake, true = pd.read_csv('Fake.csv'), pd.read_csv('True.csv')
fake['label'] = 'fake'
true['label'] = 'true'
news = pd.concat((fake, true))
news = news.sample(frac=1)
news.reset_index(inplace=True, drop=True)

In [3]:
p1 = process_text_df(news, ['title', 'text'])

In [4]:
%%time
p1.process_text_col()

CPU times: user 2.97 s, sys: 1.21 s, total: 4.18 s
Wall time: 4min 23s


In [5]:
p1.df

Unnamed: 0,title,text,subject,date,label
0,"[south, korea, call, bonenumb, sanction, north...","[seoul, reuter, , south, korea, warn, north, k...",politicsNews,"January 13, 2016",true
1,"[trump, tri, scare, us, isi, terrorist, group,...","[know, trump, paranoia, isi, make, realli, ann...",News,"February 11, 2017",fake
2,"[trump, vietnam, presid, underscor, free, open...","[reuter, , us, presid, donald, trump, vietname...",politicsNews,"November 12, 2017",true
3,"[obamacar, privaci, practic, question, critic,...","[rep, diann, black, healthcar, industri, decad...",Government News,"Jun 15, 2015",fake
4,"[clinton, trounc, trump, gop, district, doubl,...","[donald, trump, continu, sink, polit, parti, d...",News,"August 10, 2016",fake
...,...,...,...,...,...
44893,"[trump, america, first, speech, alarm, us, alli]","[london, reuter, , donald, trump, first, major...",politicsNews,"April 27, 2016",true
44894,"[saudi, arabia, forfeit, sovereign, right, enr...","[riyadh, reuter, , saudi, arabia, forfeit, sov...",worldnews,"December 21, 2017",true
44895,"[trump, new, pathet, antihillari, frame, bill,...","[anyon, point, finger, mistreat, women, probab...",News,"May 23, 2016",fake
44896,"[lebanon, pass, disput, tax, hike, fund, publi...","[beirut, reuter, , lebanon, parliament, approv...",worldnews,"October 9, 2017",true


In [6]:
p1.tokenize_sentences()

In [7]:
l, labs = tensorize_sentences(p1.df.text.apply(lambda sent: sent[:50]), p1.df.label)

In [None]:
net = NewsNet(p1.vocab, hidden_size=4, embedding_dim=8, num_layers=2)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = Adam(net.parameters(), lr=1e-4, weight_decay=5e-6)
idx = len(l) // 4

l_tr, labs_tr = l[:-2 * idx], labs[: -2 * idx]
l_val, labs_val = l[-2 * idx:-idx], labs[-2 * idx:-idx]
l_tst, labs_tst = l[-idx:], labs[-idx:]
l_tr, l_val, labs_tr, labs_val = l[:idx], l[idx:], labs[:idx], labs[idx:]
tr_set = NewsText(l_tr, labs_tr)
val_set = NewsText(l_val, labs_val)
_, val_set = enumerate(DataLoader(val_set, batch_size=len(val_set), collate_fn=collate_fn)).__next__()
val_features, val_labels = val_set
val_labels = val_labels.unsqueeze(-1).float()
loader = DataLoader(tr_set, batch_size=4, collate_fn=collate_fn)
loss_list = []
val_loss_list = []
accuracy_list = []
epochs = 60
for i in range(epochs):
    print(f'Epoch {i + 1}')
    for sents, labels in pbar(loader):
        net.train()
        labels = labels.float().unsqueeze(-1)
        out = net(sents)
        loss = loss_fn(out, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        net.eval()
    with torch.no_grad():
        val_out = net(val_features)
        val_loss = loss_fn(val_out, val_labels)
        val_loss_list.append(val_loss.item())
        val_guesses = torch.round(nn.Sigmoid()(val_out))
        accuracy = (val_guesses == val_labels).float().mean().item()
        
        accuracy_list.append(accuracy)
        
    loss_list.append(loss.item())

  0%|          | 5/2806 [00:00<00:56, 49.69it/s]

Epoch 1


100%|██████████| 2806/2806 [00:44<00:00, 62.94it/s]
  0%|          | 6/2806 [00:00<00:47, 58.72it/s]

Epoch 2


100%|██████████| 2806/2806 [00:44<00:00, 62.72it/s]
  0%|          | 6/2806 [00:00<00:47, 59.32it/s]

Epoch 3


100%|██████████| 2806/2806 [00:45<00:00, 62.35it/s]
  0%|          | 6/2806 [00:00<00:48, 58.28it/s]

Epoch 4


100%|██████████| 2806/2806 [00:43<00:00, 64.22it/s]
  0%|          | 7/2806 [00:00<00:45, 61.60it/s]

Epoch 5


100%|██████████| 2806/2806 [00:44<00:00, 63.48it/s]
  0%|          | 7/2806 [00:00<00:44, 62.34it/s]

Epoch 6


100%|██████████| 2806/2806 [00:43<00:00, 64.40it/s]
  0%|          | 7/2806 [00:00<00:43, 63.74it/s]

Epoch 7


100%|██████████| 2806/2806 [00:43<00:00, 65.02it/s]
  0%|          | 7/2806 [00:00<00:44, 62.49it/s]

Epoch 8


100%|██████████| 2806/2806 [00:42<00:00, 65.26it/s]
  0%|          | 7/2806 [00:00<00:44, 63.48it/s]

Epoch 9


100%|██████████| 2806/2806 [00:42<00:00, 65.27it/s]
  0%|          | 7/2806 [00:00<00:45, 61.68it/s]

Epoch 10


100%|██████████| 2806/2806 [00:43<00:00, 65.21it/s]
  0%|          | 7/2806 [00:00<00:44, 62.69it/s]

Epoch 11


100%|██████████| 2806/2806 [00:43<00:00, 65.04it/s]
  0%|          | 7/2806 [00:00<00:46, 60.81it/s]

Epoch 12


100%|██████████| 2806/2806 [00:43<00:00, 65.23it/s]
  0%|          | 7/2806 [00:00<00:44, 62.76it/s]

Epoch 13


100%|██████████| 2806/2806 [00:43<00:00, 65.11it/s]
  0%|          | 7/2806 [00:00<00:44, 62.70it/s]

Epoch 14


100%|██████████| 2806/2806 [00:43<00:00, 65.16it/s]
  0%|          | 7/2806 [00:00<00:45, 61.60it/s]

Epoch 15


100%|██████████| 2806/2806 [00:43<00:00, 64.93it/s]
  0%|          | 7/2806 [00:00<00:45, 61.85it/s]

Epoch 16


100%|██████████| 2806/2806 [00:43<00:00, 65.19it/s]
  0%|          | 7/2806 [00:00<00:44, 63.36it/s]

Epoch 17


100%|██████████| 2806/2806 [00:43<00:00, 65.01it/s]
  0%|          | 7/2806 [00:00<00:44, 63.08it/s]

Epoch 18


100%|██████████| 2806/2806 [00:43<00:00, 64.98it/s]
  0%|          | 7/2806 [00:00<00:44, 63.27it/s]

Epoch 19


100%|██████████| 2806/2806 [00:43<00:00, 65.23it/s]
  0%|          | 7/2806 [00:00<00:44, 63.53it/s]

Epoch 20


100%|██████████| 2806/2806 [00:42<00:00, 65.33it/s]
  0%|          | 7/2806 [00:00<00:44, 63.53it/s]

Epoch 21


100%|██████████| 2806/2806 [00:43<00:00, 65.23it/s]
  0%|          | 7/2806 [00:00<00:44, 63.35it/s]

Epoch 22


100%|██████████| 2806/2806 [00:42<00:00, 65.38it/s]
  0%|          | 7/2806 [00:00<00:44, 63.36it/s]

Epoch 23


100%|██████████| 2806/2806 [00:42<00:00, 65.28it/s]
  0%|          | 7/2806 [00:00<00:44, 62.66it/s]

Epoch 24


100%|██████████| 2806/2806 [00:42<00:00, 65.35it/s]
  0%|          | 7/2806 [00:00<00:44, 63.44it/s]

Epoch 25


100%|██████████| 2806/2806 [00:42<00:00, 65.38it/s]
  0%|          | 7/2806 [00:00<00:45, 61.81it/s]

Epoch 26


100%|██████████| 2806/2806 [00:42<00:00, 65.82it/s]
  0%|          | 7/2806 [00:00<00:43, 63.79it/s]

Epoch 27


100%|██████████| 2806/2806 [00:40<00:00, 68.99it/s]
  0%|          | 7/2806 [00:00<00:43, 64.74it/s]

Epoch 28


100%|██████████| 2806/2806 [00:40<00:00, 69.05it/s]
  0%|          | 7/2806 [00:00<00:43, 64.66it/s]

Epoch 29


100%|██████████| 2806/2806 [00:40<00:00, 68.82it/s]
  0%|          | 7/2806 [00:00<00:42, 65.60it/s]

Epoch 30


100%|██████████| 2806/2806 [00:40<00:00, 68.75it/s]
  0%|          | 7/2806 [00:00<00:43, 64.78it/s]

Epoch 31


100%|██████████| 2806/2806 [00:40<00:00, 68.96it/s]
  0%|          | 7/2806 [00:00<00:42, 65.55it/s]

Epoch 32


100%|██████████| 2806/2806 [00:40<00:00, 69.07it/s]
  0%|          | 7/2806 [00:00<00:42, 65.49it/s]

Epoch 33


100%|██████████| 2806/2806 [00:40<00:00, 69.00it/s]
  0%|          | 7/2806 [00:00<00:42, 65.41it/s]

Epoch 34


100%|██████████| 2806/2806 [00:40<00:00, 69.00it/s]
  0%|          | 7/2806 [00:00<00:42, 65.59it/s]

Epoch 35


100%|██████████| 2806/2806 [00:40<00:00, 69.01it/s]
  0%|          | 7/2806 [00:00<00:42, 65.13it/s]

Epoch 36


100%|██████████| 2806/2806 [00:40<00:00, 68.63it/s]
  0%|          | 7/2806 [00:00<00:42, 65.57it/s]

Epoch 37


100%|██████████| 2806/2806 [00:40<00:00, 69.02it/s]
  0%|          | 7/2806 [00:00<00:42, 66.23it/s]

Epoch 38


100%|██████████| 2806/2806 [00:40<00:00, 68.97it/s]
  0%|          | 7/2806 [00:00<00:42, 65.83it/s]

Epoch 39


100%|██████████| 2806/2806 [00:40<00:00, 68.67it/s]
  0%|          | 7/2806 [00:00<00:43, 64.81it/s]

Epoch 40


100%|██████████| 2806/2806 [00:40<00:00, 68.50it/s]
  0%|          | 7/2806 [00:00<00:43, 65.05it/s]

Epoch 41


100%|██████████| 2806/2806 [00:41<00:00, 68.37it/s]
  0%|          | 7/2806 [00:00<00:42, 66.07it/s]

Epoch 42


100%|██████████| 2806/2806 [00:40<00:00, 68.59it/s]
  0%|          | 7/2806 [00:00<00:42, 65.42it/s]

Epoch 43


100%|██████████| 2806/2806 [00:41<00:00, 68.40it/s]
  0%|          | 7/2806 [00:00<00:42, 65.46it/s]

Epoch 44


100%|██████████| 2806/2806 [00:42<00:00, 65.27it/s]
  0%|          | 7/2806 [00:00<00:43, 64.34it/s]

Epoch 45


100%|██████████| 2806/2806 [00:42<00:00, 65.54it/s]
  0%|          | 7/2806 [00:00<00:43, 64.27it/s]

Epoch 46


100%|██████████| 2806/2806 [00:42<00:00, 65.44it/s]
  0%|          | 7/2806 [00:00<00:44, 63.44it/s]

Epoch 47


100%|██████████| 2806/2806 [00:42<00:00, 66.72it/s]
  0%|          | 7/2806 [00:00<00:42, 65.23it/s]

Epoch 48


100%|██████████| 2806/2806 [00:40<00:00, 68.66it/s]
  0%|          | 7/2806 [00:00<00:42, 65.62it/s]

Epoch 49


100%|██████████| 2806/2806 [00:40<00:00, 68.75it/s]
  0%|          | 7/2806 [00:00<00:42, 65.50it/s]

Epoch 50


100%|██████████| 2806/2806 [00:40<00:00, 68.49it/s]
  0%|          | 7/2806 [00:00<00:42, 65.59it/s]

Epoch 51


100%|██████████| 2806/2806 [00:40<00:00, 68.81it/s]
  0%|          | 7/2806 [00:00<00:44, 62.32it/s]

Epoch 52


100%|██████████| 2806/2806 [00:41<00:00, 66.93it/s]
  0%|          | 7/2806 [00:00<00:46, 60.68it/s]

Epoch 53


100%|██████████| 2806/2806 [00:43<00:00, 64.56it/s]
  0%|          | 7/2806 [00:00<00:43, 64.33it/s]

Epoch 54


100%|██████████| 2806/2806 [00:41<00:00, 67.16it/s]
  0%|          | 7/2806 [00:00<00:44, 63.32it/s]

Epoch 55


100%|██████████| 2806/2806 [00:41<00:00, 67.02it/s]
  0%|          | 7/2806 [00:00<00:43, 64.84it/s]

Epoch 56


100%|██████████| 2806/2806 [00:41<00:00, 67.20it/s]
  0%|          | 7/2806 [00:00<00:43, 64.70it/s]

Epoch 57


100%|██████████| 2806/2806 [00:43<00:00, 64.21it/s]
  0%|          | 7/2806 [00:00<00:45, 61.25it/s]

Epoch 58


100%|██████████| 2806/2806 [00:47<00:00, 58.71it/s]
  0%|          | 7/2806 [00:00<00:46, 60.15it/s]

Epoch 59


100%|██████████| 2806/2806 [00:48<00:00, 58.29it/s]
  0%|          | 6/2806 [00:00<00:49, 56.29it/s]

Epoch 60


 34%|███▍      | 963/2806 [00:16<00:29, 63.50it/s]

In [None]:
fig, ax = plt.subplots()
pd.Series(loss_list).plot(ax=ax, label='Tr')
pd.Series(val_loss_list).plot(ax=ax, label='Val')
fig.set_size_inches(fig.get_size_inches() * 1.5)
ax.legend()

In [None]:
pd.Series(accuracy_list).plot()

In [None]:
tst_set = NewsText(l_tst, labs_tst)
_, tst_set = enumerate(DataLoader(tst_set, batch_size=len(tst_set), collate_fn=collate_fn)).__next__()
tst_features, tst_labels = tst_set
tst_labels = tst_labels.unsqueeze(-1).float()

In [None]:
with torch.no_grad():
    tst_out = net(tst_features)
    tst_loss = loss_fn(tst_out, tst_labels)
    tst_guesses = torch.round(nn.Sigmoid()(tst_out))
    accuracy_tst = (tst_guesses == tst_labels).float().mean().item()

In [None]:
accuracy_tst