In [182]:
import pandas as pd
from nltk import WhitespaceTokenizer
from nltk.corpus import stopwords, words, wordnet
from nltk.lm import Vocabulary
from collections import OrderedDict
import numpy as np
import torch
from torch import nn
import concurrent.futures
from torch.optim import Adam
from tqdm import tqdm as pbar
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import tensor
import matplotlib.pyplot as plt
import tqdm
stopwords = stopwords.words()
words = words.words() 
wordnet = wordnet.words()

In [2]:
fake, true = pd.read_csv('Fake.csv'), pd.read_csv('True.csv')
fake['label'] = 'fake'
true['label'] = 'true'
news = pd.concat((fake, true))
news = news.sample(frac=1)
news.reset_index(inplace=True, drop=True)

In [3]:
class process_text_df():
    
    def __init__(self, df, text_cols):
        self.df = df.copy()
        self.text_cols = text_cols
        
    def word_only(self, l):
        nopunkt = lambda w: ''.join([char for char in w if char.isalnum()])
        l = [nopunkt(w) for w in l]
        return l

    def clean_text_col(self, text_col):
        text_col = text_col.apply(lambda text: WhitespaceTokenizer().tokenize(text))
        text_col = text_col.apply(lambda sent: [word.lower() for word in sent])
        text_col = text_col.apply(lambda sent: [word for word in sent if word not in stopwords])
        text_col = text_col.apply(lambda sent: self.word_only(sent))
        return text_col

    def chunk_arr(self, arr, n_partitions=8):
        size = len(arr) // n_partitions
        out = [arr[i * size:(i + 1) * size] for i in range(n_partitions + 1)]
        return out

    def clean_tokenize(self, text_col):
        with concurrent.futures.ProcessPoolExecutor(4) as executor:
            chunks = self.chunk_arr(self.df[text_col], 4)
            results = executor.map(self.clean_text_col, chunks)
            out = [result for result in results]
        out = pd.concat(out)
        return out

    def process_text_col(self):
        for text_col in self.text_cols:
            self.df[text_col] = self.clean_tokenize(text_col)
            
    def build_vocab(self):
        out = []
        for col in self.text_cols:
            col_ = self.df[col]
            extend = [w for sent in col_ for w in sent]
            out.extend(extend)
        out = list(Vocabulary(out, unk_cutoff=100))
        out = {out[i]:len(out) - (i + 1) for i in range(len(out))}
        self.vocab = out
    
    def tokenize_sentences(self):
        self.build_vocab()
        for text_col in self.text_cols:
            self.df[text_col] =\
            self.df[text_col].apply(lambda sent: [word if word in self.vocab else '<UNK>' for word in sent])
            self.df[text_col] =\
            self.df[text_col].apply(lambda sent: [self.vocab[word] for word in sent])
        

In [4]:
p1 = process_text_df(news, ['title', 'text'])

In [5]:
%%time
p1.process_text_col()

CPU times: user 2.92 s, sys: 1.21 s, total: 4.14 s
Wall time: 3min 40s


In [6]:
p1.tokenize_sentences()

In [7]:
p1.df

Unnamed: 0,title,text,subject,date,label
0,"[10293, 10292, 10291, 10290, 10289, 10288, 10287]","[8224, 7596, 9965, 10260, 10007, 10243, 9210, ...",politicsNews,"June 15, 2017",true
1,"[10286, 10285, 0, 0, 10284, 10283, 10282, 1028...","[9652, 3168, 10064, 7384, 10286, 9972, 0, 7253...",left-news,"Feb 22, 2017",fake
2,"[10277, 10276, 10275, 10274, 10273, 0, 0, 0, 1...","[10274, 10273, 0, 9600, 5522, 10270, 10244, 86...",politics,"Jun 30, 2016",fake
3,"[10269, 10273, 0, 10268, 10267, 10266, 10265, ...","[6860, 10237, 10260, 9824, 4825, 0, 10268, 439...",News,"February 5, 2016",fake
4,"[10260, 10259, 10258, 10257, 10256, 10260, 102...","[10066, 7596, 9965, 10259, 7936, 9132, 8830, 7...",politicsNews,"April 25, 2017",true
...,...,...,...,...,...
44893,"[9832, 10056, 0, 0, 9899, 9761, 9760]","[9832, 10184, 3898, 3636, 4133, 0, 10282, 6523...",News,"July 9, 2016",fake
44894,"[0, 10290, 10193, 8801, 10241, 9159, 9155, 0, ...","[0, 7596, 9965, 0, 10290, 10193, 4550, 6550, 7...",worldnews,"August 25, 2017",true
44895,"[0, 6555, 9772, 8142, 7193, 8919, 5561]","[3693, 9477, 9723, 8947, 8946, 10141, 9215, 83...",News,"August 8, 2017",fake
44896,"[10259, 8859, 9999, 9526, 5290, 2020, 10269, 3...","[10066, 7596, 9965, 10260, 10259, 6057, 5217, ...",politicsNews,"June 20, 2016",true


In [146]:
l, labs = [torch.tensor(text) for text in p1.df.title],\
tensor(p1.df.label.apply(lambda l: 1 if l == 'true' else 0))

In [147]:
l

[tensor([10293, 10292, 10291, 10290, 10289, 10288, 10287]),
 tensor([10286, 10285,     0,     0, 10284, 10283, 10282, 10281, 10280, 10279,
             0, 10278]),
 tensor([10277, 10276, 10275, 10274, 10273,     0,     0,     0, 10272, 10271,
         10270]),
 tensor([10269, 10273,     0, 10268, 10267, 10266, 10265, 10264, 10263, 10262,
             0, 10261]),
 tensor([10260, 10259, 10258, 10257, 10256, 10260, 10255, 10254]),
 tensor([10260,     0, 10253, 10252, 10251, 10250]),
 tensor([10249, 10248, 10247, 10253, 10246, 10245,     0]),
 tensor([10244, 10243, 10242, 10241, 10240, 10239, 10274,     0, 10238, 10237,
         10236, 10235]),
 tensor([10234, 10286, 10233,     0, 10232, 10231, 10230]),
 tensor([10229, 10228, 10227, 10226, 10225, 10224, 10223, 10222, 10221]),
 tensor([10220, 10219, 10218, 10217, 10216, 10215, 10214, 10213, 10212]),
 tensor([10269, 10211, 10210, 10209, 10208, 10207, 10206, 10205, 10204, 10203,
         10202]),
 tensor([10201, 10200, 10199, 10198, 10197, 10

In [148]:
class NewsText(Dataset):

    def __init__(self, news_text_list, labels):
        self.news_text_list = news_text_list
        self.labels = labels

    def __len__(self):
        assert(len(self.news_text_list) == len(self.labels))
        return len(self.labels)

    def __getitem__(self, idx):
        sample = self.news_text_list[idx], self.labels[idx]
        return sample

In [149]:
def pad_sent(sents, max_seq_len):
    max_seq_len = min(100, max_seq_len)
    out = []
    for i in range(len(sents)):
        sent = sents[i]
        append_tensor = tensor([sent[j] if j < len(sent) else 0 for j in range(max_seq_len)]).unsqueeze(0)
        out.append(append_tensor)
    out = torch.cat(out)
    return out
    

def collate_fn(sample):

    labels = tensor([s[1] for s in sample])
    sents = [s[0] for s in sample]
    max_seq_len = max([sent.shape[0] for sent in sents])
    sents = pad_sent(sents, max_seq_len)
    return sents, labels

In [199]:
class LinearBlock(nn.Module):
    
    def __init__(self, layer_sequence, add_relu=False):
        super(LinearBlock, self).__init__()
        num_layers = len(layer_sequence) - 1
        layers = []
        names = []
        for i in range(num_layers):
            layers.append(nn.Linear(in_features=layer_sequence[i],
                                    out_features=layer_sequence[i + 1],
                                    bias=False)
                         )
            names.append(f'fc{i + 1}')
                
            if add_relu and i != num_layers - 1:
                layers.append(nn.ReLU())
                names.append(f'relu{i + 1}')
        
        self.module_dict = OrderedDict(zip(names, layers))
        self.block = nn.Sequential(self.module_dict)
        
    def forward(self, x):
        out = self.block(x)
        
        return out

In [224]:
class NewsNet(nn.Module):
    
    def __init__(self, vocab, hidden_size=10, embedding_dim=16, num_layers=2):
        super(NewsNet, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.word_embeddings = nn.Embedding(num_embeddings=len(vocab), embedding_dim=self.embedding_dim)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, 
                            bias=False, 
                            hidden_size=self.hidden_size, 
                            batch_first=True,
                            num_layers=self.num_layers)
        self.linear_block = LinearBlock([self.hidden_size, self.hidden_size // 2, self.hidden_size, 1])
        
    def forward(self, s):
        
        out = self.word_embeddings(s)
        out, _ = self.lstm(out)
        out = out[:, -1, :]
        out = self.linear_block(out)
        
        return out

In [None]:
net = NewsNet(p1.vocab, hidden_size=4, embedding_dim=8)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = Adam(net.parameters(), lr=1e-4, weight_decay=1e-6)
idx = len(l) // 4

l_tr, labs_tr = l[:-2 * idx], labs[: -2 * idx]
l_val, labs_val = l[-2 * idx:-idx], labs[-2 * idx:-idx]
l_tst, labs_tst = l[-idx:], labs[-idx:]
l_tr, l_val, labs_tr, labs_val = l[:idx], l[idx:], labs[:idx], labs[idx:]
tr_set = NewsText(l_tr, labs_tr)
val_set = NewsText(l_val, labs_val)
_, val_set = enumerate(DataLoader(val_set, batch_size=len(val_set), collate_fn=collate_fn)).__next__()
val_features, val_labels = val_set
val_labels = val_labels.unsqueeze(-1).float()
loader = DataLoader(tr_set, batch_size=8, collate_fn=collate_fn)
loss_list = []
val_loss_list = []
accuracy_list = []
epochs = 80
for i in range(epochs):
    print(f'Epoch {i + 1}')
    for sents, labels in pbar(loader):
        net.train()
        labels = labels.float().unsqueeze(-1)
        out = net(sents)
        loss = loss_fn(out, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        net.eval()
    with torch.no_grad():
        val_out = net(val_features)
        val_loss = loss_fn(val_out, val_labels)
        val_loss_list.append(val_loss.item())
        val_guesses = torch.round(nn.Sigmoid()(val_out))
        accuracy = (val_guesses == val_labels).float().mean().item()
        
        accuracy_list.append(accuracy)
        
    loss_list.append(loss.item())

  1%|          | 14/1403 [00:00<00:10, 134.24it/s]

Epoch 1


100%|██████████| 1403/1403 [00:08<00:00, 164.12it/s]
  1%|          | 17/1403 [00:00<00:08, 166.98it/s]

Epoch 2


100%|██████████| 1403/1403 [00:07<00:00, 178.05it/s]
  1%|▏         | 18/1403 [00:00<00:08, 172.77it/s]

Epoch 3


100%|██████████| 1403/1403 [00:08<00:00, 161.06it/s]
  1%|          | 17/1403 [00:00<00:08, 162.86it/s]

Epoch 4


100%|██████████| 1403/1403 [00:08<00:00, 174.12it/s]
  1%|▏         | 18/1403 [00:00<00:07, 175.02it/s]

Epoch 5


100%|██████████| 1403/1403 [00:08<00:00, 174.96it/s]
  1%|▏         | 18/1403 [00:00<00:07, 176.36it/s]

Epoch 6


100%|██████████| 1403/1403 [00:08<00:00, 172.73it/s]
  1%|▏         | 19/1403 [00:00<00:07, 181.84it/s]

Epoch 7


100%|██████████| 1403/1403 [00:07<00:00, 178.29it/s]
  1%|          | 17/1403 [00:00<00:08, 164.52it/s]

Epoch 8


100%|██████████| 1403/1403 [00:08<00:00, 169.97it/s]
  1%|          | 17/1403 [00:00<00:08, 167.94it/s]

Epoch 9


100%|██████████| 1403/1403 [00:08<00:00, 169.44it/s]
  1%|          | 17/1403 [00:00<00:08, 167.89it/s]

Epoch 10


100%|██████████| 1403/1403 [00:08<00:00, 169.89it/s]
  1%|          | 17/1403 [00:00<00:08, 165.49it/s]

Epoch 11


100%|██████████| 1403/1403 [00:08<00:00, 166.15it/s]
  1%|          | 17/1403 [00:00<00:08, 169.86it/s]

Epoch 12


100%|██████████| 1403/1403 [00:09<00:00, 152.35it/s]
  1%|          | 11/1403 [00:00<00:12, 108.06it/s]

Epoch 13


100%|██████████| 1403/1403 [00:08<00:00, 168.55it/s]
  1%|          | 17/1403 [00:00<00:08, 167.07it/s]

Epoch 14


100%|██████████| 1403/1403 [00:08<00:00, 166.43it/s]
  1%|          | 17/1403 [00:00<00:08, 163.72it/s]

Epoch 15


100%|██████████| 1403/1403 [00:08<00:00, 166.72it/s]
  1%|          | 17/1403 [00:00<00:08, 166.63it/s]

Epoch 16


100%|██████████| 1403/1403 [00:08<00:00, 158.38it/s]
  2%|▏         | 32/1403 [00:00<00:08, 160.86it/s]

Epoch 17


100%|██████████| 1403/1403 [00:08<00:00, 169.14it/s]
  1%|          | 17/1403 [00:00<00:08, 162.41it/s]

Epoch 18


100%|██████████| 1403/1403 [00:08<00:00, 169.09it/s]
  1%|          | 16/1403 [00:00<00:09, 153.51it/s]

Epoch 19


100%|██████████| 1403/1403 [00:08<00:00, 164.12it/s]
  1%|          | 17/1403 [00:00<00:08, 163.63it/s]

Epoch 20


100%|██████████| 1403/1403 [00:08<00:00, 157.00it/s]
  1%|          | 16/1403 [00:00<00:08, 156.78it/s]

Epoch 21


100%|██████████| 1403/1403 [00:09<00:00, 153.53it/s]
  1%|          | 16/1403 [00:00<00:08, 157.02it/s]

Epoch 22


100%|██████████| 1403/1403 [00:09<00:00, 155.53it/s]
  1%|          | 16/1403 [00:00<00:09, 150.82it/s]

Epoch 23


100%|██████████| 1403/1403 [00:09<00:00, 152.96it/s]
  1%|          | 16/1403 [00:00<00:09, 150.14it/s]

Epoch 24


100%|██████████| 1403/1403 [00:09<00:00, 154.71it/s]
  1%|          | 17/1403 [00:00<00:08, 162.03it/s]

Epoch 25


100%|██████████| 1403/1403 [00:08<00:00, 156.77it/s]
  1%|          | 15/1403 [00:00<00:09, 139.78it/s]

Epoch 26


100%|██████████| 1403/1403 [00:08<00:00, 168.30it/s]
  1%|          | 17/1403 [00:00<00:08, 168.09it/s]

Epoch 27


100%|██████████| 1403/1403 [00:08<00:00, 168.73it/s]
  1%|          | 17/1403 [00:00<00:08, 166.87it/s]

Epoch 28


100%|██████████| 1403/1403 [00:09<00:00, 147.98it/s]
  1%|          | 16/1403 [00:00<00:08, 157.09it/s]

Epoch 29


100%|██████████| 1403/1403 [00:08<00:00, 158.27it/s]
  1%|          | 17/1403 [00:00<00:08, 159.62it/s]

Epoch 30


100%|██████████| 1403/1403 [00:08<00:00, 164.45it/s]
  1%|          | 17/1403 [00:00<00:08, 161.59it/s]

Epoch 31


100%|██████████| 1403/1403 [00:09<00:00, 145.81it/s]
  1%|          | 17/1403 [00:00<00:08, 165.50it/s]

Epoch 32


100%|██████████| 1403/1403 [00:09<00:00, 150.06it/s]
  1%|          | 16/1403 [00:00<00:08, 157.81it/s]

Epoch 33


100%|██████████| 1403/1403 [00:09<00:00, 145.73it/s]
  1%|          | 16/1403 [00:00<00:08, 155.44it/s]

Epoch 34


100%|██████████| 1403/1403 [00:09<00:00, 150.45it/s]
  1%|          | 14/1403 [00:00<00:10, 133.65it/s]

Epoch 35


100%|██████████| 1403/1403 [00:09<00:00, 149.48it/s]
  1%|          | 16/1403 [00:00<00:08, 157.78it/s]

Epoch 36


100%|██████████| 1403/1403 [00:09<00:00, 147.41it/s]
  1%|          | 17/1403 [00:00<00:08, 167.79it/s]

Epoch 37


100%|██████████| 1403/1403 [00:09<00:00, 151.64it/s]
  1%|          | 16/1403 [00:00<00:08, 158.81it/s]

Epoch 38


100%|██████████| 1403/1403 [00:09<00:00, 151.92it/s]
  1%|          | 16/1403 [00:00<00:09, 153.76it/s]

Epoch 39


100%|██████████| 1403/1403 [00:10<00:00, 138.26it/s]
  1%|          | 16/1403 [00:00<00:08, 159.61it/s]

Epoch 40


100%|██████████| 1403/1403 [00:10<00:00, 137.41it/s]
  1%|          | 17/1403 [00:00<00:08, 160.42it/s]

Epoch 41


100%|██████████| 1403/1403 [00:09<00:00, 141.94it/s]
  1%|          | 15/1403 [00:00<00:09, 145.23it/s]

Epoch 42


100%|██████████| 1403/1403 [00:09<00:00, 149.28it/s]
  1%|          | 15/1403 [00:00<00:09, 141.57it/s]

Epoch 43


100%|██████████| 1403/1403 [00:09<00:00, 145.00it/s]
  1%|          | 17/1403 [00:00<00:08, 160.09it/s]

Epoch 44


100%|██████████| 1403/1403 [00:09<00:00, 144.62it/s]
  1%|          | 16/1403 [00:00<00:08, 156.26it/s]

Epoch 45


100%|██████████| 1403/1403 [00:09<00:00, 145.44it/s]
  1%|          | 16/1403 [00:00<00:09, 151.91it/s]

Epoch 46


100%|██████████| 1403/1403 [00:09<00:00, 148.44it/s]
  1%|          | 16/1403 [00:00<00:08, 158.33it/s]

Epoch 47


100%|██████████| 1403/1403 [00:09<00:00, 154.60it/s]
  1%|          | 16/1403 [00:00<00:09, 151.49it/s]

Epoch 48


100%|██████████| 1403/1403 [00:09<00:00, 149.46it/s]
  1%|          | 15/1403 [00:00<00:09, 149.45it/s]

Epoch 49


100%|██████████| 1403/1403 [00:09<00:00, 143.18it/s]
  1%|          | 13/1403 [00:00<00:10, 128.69it/s]

Epoch 50


100%|██████████| 1403/1403 [00:09<00:00, 150.84it/s]
  1%|          | 16/1403 [00:00<00:08, 156.41it/s]

Epoch 51


100%|██████████| 1403/1403 [00:09<00:00, 148.17it/s]
  1%|          | 16/1403 [00:00<00:08, 156.22it/s]

Epoch 52


100%|██████████| 1403/1403 [00:09<00:00, 150.64it/s]
  1%|          | 16/1403 [00:00<00:08, 158.49it/s]

Epoch 53


100%|██████████| 1403/1403 [00:09<00:00, 155.70it/s]
  1%|          | 17/1403 [00:00<00:08, 161.48it/s]

Epoch 54


100%|██████████| 1403/1403 [00:09<00:00, 142.11it/s]
  1%|          | 16/1403 [00:00<00:09, 149.72it/s]

Epoch 55


100%|██████████| 1403/1403 [00:10<00:00, 133.29it/s]
  1%|          | 17/1403 [00:00<00:08, 161.09it/s]

Epoch 56


100%|██████████| 1403/1403 [00:09<00:00, 145.10it/s]
  1%|          | 16/1403 [00:00<00:08, 159.30it/s]

Epoch 57


100%|██████████| 1403/1403 [00:09<00:00, 144.51it/s]
  1%|          | 17/1403 [00:00<00:08, 163.75it/s]

Epoch 58


100%|██████████| 1403/1403 [00:09<00:00, 148.92it/s]
  1%|          | 17/1403 [00:00<00:08, 160.20it/s]

Epoch 59


100%|██████████| 1403/1403 [00:08<00:00, 161.31it/s]
  2%|▏         | 32/1403 [00:00<00:08, 158.50it/s]

Epoch 60


100%|██████████| 1403/1403 [00:08<00:00, 161.66it/s]
  1%|          | 17/1403 [00:00<00:08, 160.34it/s]

Epoch 61


100%|██████████| 1403/1403 [00:10<00:00, 138.84it/s]
  1%|          | 16/1403 [00:00<00:09, 151.73it/s]

Epoch 62


100%|██████████| 1403/1403 [00:09<00:00, 144.06it/s]
  1%|          | 16/1403 [00:00<00:08, 158.06it/s]

Epoch 63


100%|██████████| 1403/1403 [00:09<00:00, 154.32it/s]
  1%|          | 17/1403 [00:00<00:08, 165.37it/s]

Epoch 64


100%|██████████| 1403/1403 [00:08<00:00, 161.09it/s]
  1%|          | 15/1403 [00:00<00:09, 145.45it/s]

Epoch 65


100%|██████████| 1403/1403 [00:11<00:00, 123.98it/s]
  1%|          | 9/1403 [00:00<00:16, 86.24it/s]

Epoch 66


100%|██████████| 1403/1403 [00:10<00:00, 130.99it/s]


In [None]:
fig, ax = plt.subplots()
pd.Series(loss_list).plot(ax=ax, label='Tr')
pd.Series(val_loss_list).plot(ax=ax, label='Val')
fig.set_size_inches(fig.get_size_inches() * 1.5)
ax.legend()

In [None]:
pd.Series(accuracy_list).plot()

In [None]:
tst_set = NewsText(l_tst, labs_tst)
_, tst_set = enumerate(DataLoader(tst_set, batch_size=len(tst_set), collate_fn=collate_fn)).__next__()
tst_features, tst_labels = tst_set
tst_labels = tst_labels.unsqueeze(-1).float()

In [None]:
with torch.no_grad():
    tst_out = net(tst_features)
    tst_loss = loss_fn(tst_out, tst_labels)
#     tst_loss_list.append(tst_loss.item())
    tst_guesses = torch.round(nn.Sigmoid()(tst_out))
    accuracy_tst = (tst_guesses == tst_labels).float().mean().item()

In [None]:
accuracy_tst