In [1]:
import pandas as pd
from nltk import WhitespaceTokenizer
from nltk.corpus import stopwords, words, wordnet
from nltk.lm import Vocabulary
from collections import OrderedDict
import numpy as np
import torch
from torch import nn
import concurrent.futures
from torch.optim import Adam
from data_loading import process_text_df
from tqdm import tqdm as pbar
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import tensor
import matplotlib.pyplot as plt
import tqdm
stopwords = stopwords.words()
words = words.words() 
wordnet = wordnet.words()

In [2]:
fake, true = pd.read_csv('Fake.csv'), pd.read_csv('True.csv')
fake['label'] = 'fake'
true['label'] = 'true'
news = pd.concat((fake, true))
news = news.sample(frac=1)
news.reset_index(inplace=True, drop=True)

In [3]:
p1 = process_text_df(news, ['title', 'text'])

In [4]:
%%time
p1.process_text_col()

CPU times: user 2.87 s, sys: 1.27 s, total: 4.13 s
Wall time: 3min 43s


In [5]:
p1.tokenize_sentences()

In [6]:
p1.df

Unnamed: 0,title,text,subject,date,label
0,"[10293, 10292, 10291, 10290, 10289, 10288, 102...","[8677, 6304, 10212, 1637, 10288, 9801, 10232, ...",politicsNews,"May 4, 2016",true
1,"[10285, 10284, 10283, 0, 10282, 10281, 10280, ...","[864, 9471, 9025, 10284, 10283, 5381, 0, 0, 0,...",News,"January 20, 2016",fake
2,"[10288, 10277, 10276, 10275, 10274, 10273, 102...","[8677, 6304, 10212, 10288, 10277, 10276, 5082,...",politicsNews,"November 2, 2017",true
3,"[10293, 0, 0, 10269, 0, 10268, 10267, 10266]","[5962, 7391, 1604, 6304, 10212, 10271, 4931, 9...",politicsNews,"December 27, 2016",true
4,"[10265, 10264, 0, 10263, 10262, 10261, 10260, ...","[0, 8406, 8708, 8544, 8053, 4745, 10263, 8983,...",politics,"Apr 19, 2017",fake
...,...,...,...,...,...
44893,"[8862, 8861, 9716, 6651, 9820, 7208, 0, 9868, ...","[9592, 9877, 9716, 5800, 10166, 7546, 9984, 84...",left-news,"Aug 8, 2015",fake
44894,"[10293, 10209, 10201, 9514, 8627, 8530, 10094]","[8677, 6304, 10212, 9716, 9622, 10293, 8334, 7...",politicsNews,"May 17, 2017",true
44895,"[0, 0, 9040, 10088, 0, 0, 10287, 8708]","[6278, 2011, 6304, 10212, 0, 3855, 7696, 10271...",worldnews,"October 4, 2017",true
44896,"[9949, 0, 8737, 10103, 10027, 8877, 9263, 9948...","[5239, 9056, 5051, 4477, 9948, 10076, 9245, 96...",left-news,"Mar 2, 2016",fake


In [7]:
l, labs = [torch.tensor(text) for text in p1.df.title],\
tensor(p1.df.label.apply(lambda l: 1 if l == 'true' else 0))

In [8]:
l

[tensor([10293, 10292, 10291, 10290, 10289, 10288, 10287, 10286]),
 tensor([10285, 10284, 10283,     0, 10282, 10281, 10280, 10279, 10278]),
 tensor([10288, 10277, 10276, 10275, 10274, 10273, 10272, 10271, 10270]),
 tensor([10293,     0,     0, 10269,     0, 10268, 10267, 10266]),
 tensor([10265, 10264,     0, 10263, 10262, 10261, 10260, 10259,     0, 10258,
         10257]),
 tensor([10256, 10255, 10254, 10253, 10252, 10251]),
 tensor([10293, 10250, 10249, 10251, 10248, 10247, 10246, 10245]),
 tensor([10244, 10243, 10242, 10241, 10240, 10239]),
 tensor([    0, 10238, 10293, 10237, 10236, 10235, 10234, 10233, 10232, 10231]),
 tensor([10230, 10229, 10228, 10227, 10226, 10225, 10224, 10223, 10222, 10226,
         10221, 10220]),
 tensor([10219, 10271, 10218, 10217, 10216, 10215, 10214, 10213, 10212, 10211]),
 tensor([10271,     0, 10210, 10209, 10208, 10207, 10206, 10205,     0, 10204,
         10203]),
 tensor([10202, 10201, 10200, 10199, 10198, 10197, 10196, 10195]),
 tensor([10194, 10

In [9]:
class NewsText(Dataset):

    def __init__(self, news_text_list, labels):
        self.news_text_list = news_text_list
        self.labels = labels

    def __len__(self):
        assert(len(self.news_text_list) == len(self.labels))
        return len(self.labels)

    def __getitem__(self, idx):
        sample = self.news_text_list[idx], self.labels[idx]
        return sample

In [10]:
def pad_sent(sents, max_seq_len):
    max_seq_len = min(100, max_seq_len)
    out = []
    for i in range(len(sents)):
        sent = sents[i]
        append_tensor = tensor([sent[j] if j < len(sent) else 0 for j in range(max_seq_len)]).unsqueeze(0)
        out.append(append_tensor)
    out = torch.cat(out)
    return out
    

def collate_fn(sample):

    labels = tensor([s[1] for s in sample])
    sents = [s[0] for s in sample]
    max_seq_len = max([sent.shape[0] for sent in sents])
    sents = pad_sent(sents, max_seq_len)
    return sents, labels

In [11]:
class LinearBlock(nn.Module):
    
    def __init__(self, layer_sequence, add_relu=False):
        super(LinearBlock, self).__init__()
        num_layers = len(layer_sequence) - 1
        layers = []
        names = []
        for i in range(num_layers):
            layers.append(nn.Linear(in_features=layer_sequence[i],
                                    out_features=layer_sequence[i + 1],
                                    bias=False)
                         )
            names.append(f'fc{i + 1}')
                
            if add_relu and i != num_layers - 1:
                layers.append(nn.ReLU())
                names.append(f'relu{i + 1}')
        
        self.module_dict = OrderedDict(zip(names, layers))
        self.block = nn.Sequential(self.module_dict)
        
    def forward(self, x):
        out = self.block(x)
        
        return out

In [12]:
class NewsNet(nn.Module):
    
    def __init__(self, vocab, hidden_size=10, embedding_dim=16, num_layers=2):
        super(NewsNet, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.word_embeddings = nn.Embedding(num_embeddings=len(vocab), embedding_dim=self.embedding_dim)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, 
                            bias=False, 
                            hidden_size=self.hidden_size, 
                            batch_first=True,
                            num_layers=self.num_layers)
        self.linear_block = LinearBlock([self.hidden_size, self.hidden_size // 2, self.hidden_size, 1])
        
    def forward(self, s):
        
        out = self.word_embeddings(s)
        out, _ = self.lstm(out)
        out = out[:, -1, :]
        out = self.linear_block(out)
        
        return out

In [None]:
net = NewsNet(p1.vocab, hidden_size=4, embedding_dim=8)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = Adam(net.parameters(), lr=1e-4, weight_decay=1e-6)
idx = len(l) // 4

l_tr, labs_tr = l[:-2 * idx], labs[: -2 * idx]
l_val, labs_val = l[-2 * idx:-idx], labs[-2 * idx:-idx]
l_tst, labs_tst = l[-idx:], labs[-idx:]
l_tr, l_val, labs_tr, labs_val = l[:idx], l[idx:], labs[:idx], labs[idx:]
tr_set = NewsText(l_tr, labs_tr)
val_set = NewsText(l_val, labs_val)
_, val_set = enumerate(DataLoader(val_set, batch_size=len(val_set), collate_fn=collate_fn)).__next__()
val_features, val_labels = val_set
val_labels = val_labels.unsqueeze(-1).float()
loader = DataLoader(tr_set, batch_size=8, collate_fn=collate_fn)
loss_list = []
val_loss_list = []
accuracy_list = []
epochs = 80
for i in range(epochs):
    print(f'Epoch {i + 1}')
    for sents, labels in pbar(loader):
        net.train()
        labels = labels.float().unsqueeze(-1)
        out = net(sents)
        loss = loss_fn(out, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        net.eval()
    with torch.no_grad():
        val_out = net(val_features)
        val_loss = loss_fn(val_out, val_labels)
        val_loss_list.append(val_loss.item())
        val_guesses = torch.round(nn.Sigmoid()(val_out))
        accuracy = (val_guesses == val_labels).float().mean().item()
        
        accuracy_list.append(accuracy)
        
    loss_list.append(loss.item())

  1%|          | 15/1403 [00:00<00:09, 143.56it/s]

Epoch 1


100%|██████████| 1403/1403 [00:08<00:00, 170.24it/s]
  2%|▏         | 35/1403 [00:00<00:08, 169.46it/s]

Epoch 2


100%|██████████| 1403/1403 [00:08<00:00, 168.40it/s]
  1%|          | 17/1403 [00:00<00:08, 162.69it/s]

Epoch 3


  5%|▍         | 67/1403 [00:00<00:08, 158.14it/s]

In [None]:
fig, ax = plt.subplots()
pd.Series(loss_list).plot(ax=ax, label='Tr')
pd.Series(val_loss_list).plot(ax=ax, label='Val')
fig.set_size_inches(fig.get_size_inches() * 1.5)
ax.legend()

In [None]:
pd.Series(accuracy_list).plot()

In [None]:
tst_set = NewsText(l_tst, labs_tst)
_, tst_set = enumerate(DataLoader(tst_set, batch_size=len(tst_set), collate_fn=collate_fn)).__next__()
tst_features, tst_labels = tst_set
tst_labels = tst_labels.unsqueeze(-1).float()

In [None]:
with torch.no_grad():
    tst_out = net(tst_features)
    tst_loss = loss_fn(tst_out, tst_labels)
#     tst_loss_list.append(tst_loss.item())
    tst_guesses = torch.round(nn.Sigmoid()(tst_out))
    accuracy_tst = (tst_guesses == tst_labels).float().mean().item()

In [None]:
accuracy_tst