In [None]:
import pandas as pd
from nltk import WhitespaceTokenizer
from nltk.corpus import stopwords, words, wordnet
from nltk.lm import Vocabulary
from collections import OrderedDict
import numpy as np
import torch
from torch import nn
import concurrent.futures
from torch.optim import Adam
from data_loading import process_text_df, NewsText, tensorize_sentences, collate_fn
from tqdm import tqdm as pbar
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import tensor
import matplotlib.pyplot as plt
import tqdm
# Rebind each imported NLTK corpus reader name to the result of its .words() call.
# After these lines `stopwords`, `words` and `wordnet` are no longer the corpus
# reader objects imported above. None of the three is referenced again in the
# cells shown in this notebook.
# NOTE(review): called with no arguments, so presumably every language/file in
# each corpus is included - confirm against the NLTK corpus reader docs.
stopwords = stopwords.words()
words = words.words() 
wordnet = wordnet.words()

In [None]:
# Load both CSVs, tag every row with its class, then combine and shuffle them.
# Later cells split the data purely by position, so the shuffle is seeded to
# make that split identical across "Restart & Run All".
SHUFFLE_SEED = 42

fake, true = pd.read_csv('Fake.csv'), pd.read_csv('True.csv')
fake['label'] = 'fake'
true['label'] = 'true'

news = (
    pd.concat((fake, true))
    .sample(frac=1, random_state=SHUFFLE_SEED)  # full-length random permutation
    .reset_index(drop=True)                     # discard the pre-shuffle index
)

In [None]:
# Build the project's text-processing helper around the shuffled news frame.
# NOTE(review): the second argument presumably names the text columns to be
# processed - confirm against data_loading.process_text_df.
p1 = process_text_df(news, ['title', 'text'])

In [None]:
%%time
# Run the helper's text-column processing step; %%time reports how long this
# cell takes. The method is defined in data_loading and its effect on p1 is
# not visible from this notebook.
p1.process_text_col()

In [None]:
# Run the helper's sentence-tokenisation step (implemented in data_loading).
# NOTE(review): presumably this is also what populates p1.vocab, which the
# model cell reads later - confirm in data_loading.
p1.tokenize_sentences()

In [None]:
# Display the helper's DataFrame to inspect the result of the steps above.
p1.df

In [None]:
# Convert the title column and the label column into the model inputs `l`
# and targets `labs` used by the training and test cells below.
# NOTE(review): only titles are passed here, although 'text' was also handed
# to process_text_df - confirm that using titles alone is intended.
l, labs = tensorize_sentences(p1.df.title, p1.df.label)

In [None]:
class LinearBlock(nn.Module):
    """Stack of bias-free ``nn.Linear`` layers, optionally separated by ReLUs.

    Args:
        layer_sequence: Layer widths. Consecutive pairs ``(w[i], w[i + 1])``
            become the ``in_features``/``out_features`` of layer ``fc{i + 1}``.
        add_relu: When true, a ``relu{i + 1}`` module follows every linear
            layer except the last one.

    Attributes:
        module_dict: ``OrderedDict`` mapping module names to modules, in order.
        block: ``nn.Sequential`` built from ``module_dict``.
    """

    def __init__(self, layer_sequence, add_relu=False):
        super().__init__()
        last_pos = len(layer_sequence) - 2  # position of the final linear layer
        modules = OrderedDict()

        width_pairs = zip(layer_sequence[:-1], layer_sequence[1:])
        for pos, (n_in, n_out) in enumerate(width_pairs):
            tag = pos + 1
            modules[f'fc{tag}'] = nn.Linear(in_features=n_in,
                                            out_features=n_out,
                                            bias=False)
            # No activation after the output layer.
            if add_relu and pos != last_pos:
                modules[f'relu{tag}'] = nn.ReLU()

        self.module_dict = modules
        self.block = nn.Sequential(self.module_dict)

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return the result."""
        return self.block(x)

In [None]:
class NewsNet(nn.Module):
    """Embedding -> LSTM -> LinearBlock model emitting one value per sequence.

    Args:
        vocab: Sized collection; ``len(vocab)`` sets the embedding table size.
        hidden_size: LSTM hidden width, also the input width of the head.
        embedding_dim: Width of each token embedding.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab, hidden_size=10, embedding_dim=16, num_layers=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.word_embeddings = nn.Embedding(
            num_embeddings=len(vocab),
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bias=False,
        )
        # Head widths: hidden -> hidden // 2 -> hidden -> 1.
        # NOTE(review): LinearBlock is built with its default add_relu=False,
        # so the head is three bias-free linear maps with no activation
        # between them - confirm that this is intended.
        head_widths = [hidden_size, hidden_size // 2, hidden_size, 1]
        self.linear_block = LinearBlock(head_widths)

    def forward(self, s):
        """Embed ``s``, run the LSTM and feed its last time step to the head."""
        embedded = self.word_embeddings(s)
        lstm_out, _ = self.lstm(embedded)
        # batch_first=True, so axis 1 is time; keep only the final step.
        # NOTE(review): if collate_fn right-pads sequences, this final step
        # is a padding position for shorter items - verify against collate_fn.
        final_step = lstm_out[:, -1, :]
        return self.linear_block(final_step)

In [None]:
# --- Model, loss and optimiser ------------------------------------------------
net = NewsNet(p1.vocab, hidden_size=4, embedding_dim=8)
loss_fn = nn.BCEWithLogitsLoss()  # the network emits raw logits
optimizer = Adam(net.parameters(), lr=1e-4, weight_decay=1e-6)

# --- Positional train / validation / test split (first half / quarter / quarter)
# The three parts are disjoint. The test part (l_tst, labs_tst) is only
# consumed by the evaluation cells further down.
idx = len(l) // 4
tr_end = len(l) - 2 * idx
val_end = len(l) - idx

l_tr, labs_tr = l[:tr_end], labs[:tr_end]
l_val, labs_val = l[tr_end:val_end], labs[tr_end:val_end]
l_tst, labs_tst = l[val_end:], labs[val_end:]
assert len(l_tr) + len(l_val) + len(l_tst) == len(l), 'split must cover every sample exactly once'

tr_set = NewsText(l_tr, labs_tr)
val_set = NewsText(l_val, labs_val)

# Materialise the whole validation set as a single batch, built once up front.
val_loader = DataLoader(val_set, batch_size=len(val_set), collate_fn=collate_fn)
val_features, val_labels = next(iter(val_loader))
val_labels = val_labels.unsqueeze(-1).float()  # match the (batch, 1) logit shape

loader = DataLoader(tr_set, batch_size=8, collate_fn=collate_fn)

# --- Training -----------------------------------------------------------------
loss_list = []      # mean of the per-batch training losses, one entry per epoch
val_loss_list = []  # validation loss, one entry per epoch
accuracy_list = []  # validation accuracy, one entry per epoch
epochs = 80
for i in range(epochs):
    print(f'Epoch {i + 1}')

    net.train()
    running_loss, n_batches = 0.0, 0
    for sents, labels in pbar(loader):
        labels = labels.float().unsqueeze(-1)
        out = net(sents)
        loss = loss_fn(out, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1
    # Average over the epoch rather than keeping only the last mini-batch loss,
    # so the training curve is comparable with the full-set validation curve.
    loss_list.append(running_loss / n_batches if n_batches else float('nan'))

    net.eval()
    with torch.no_grad():
        val_out = net(val_features)
        val_loss = loss_fn(val_out, val_labels)
        val_loss_list.append(val_loss.item())
        # Sigmoid turns logits into probabilities; rounding gives a 0/1 guess.
        val_guesses = torch.round(torch.sigmoid(val_out))
        accuracy = (val_guesses == val_labels).float().mean().item()
        accuracy_list.append(accuracy)

In [None]:
# Training vs. validation loss, one point per epoch (x axis is the 0-based epoch index).
fig, ax = plt.subplots()
pd.Series(loss_list).plot(ax=ax, label='Tr')
pd.Series(val_loss_list).plot(ax=ax, label='Val')
ax.set(title='Loss per epoch', xlabel='Epoch', ylabel='BCE-with-logits loss')
fig.set_size_inches(fig.get_size_inches() * 1.5)  # enlarge the default figure by 50%
ax.legend();  # trailing ';' suppresses the Legend repr in the cell output

In [None]:
# Validation accuracy, one point per epoch (x axis is the 0-based epoch index).
fig, ax = plt.subplots()
pd.Series(accuracy_list).plot(ax=ax)
ax.set(title='Validation accuracy per epoch', xlabel='Epoch', ylabel='Accuracy');

In [None]:
# Wrap the held-out test slice in a dataset and pull it out as one single batch.
tst_set = NewsText(l_tst, labs_tst)
tst_loader = DataLoader(tst_set, batch_size=len(tst_set), collate_fn=collate_fn)
tst_features, tst_labels = next(iter(tst_loader))
tst_labels = tst_labels.unsqueeze(-1).float()  # match the (batch, 1) logit shape

In [None]:
# Score the trained network once on the test batch.
# Eval mode is set here explicitly so the result does not depend on whatever
# mode the training cell left the network in.
net.eval()
with torch.no_grad():
    tst_out = net(tst_features)
    tst_loss = loss_fn(tst_out, tst_labels)
    # Sigmoid turns logits into probabilities; rounding gives a 0/1 guess.
    tst_guesses = torch.round(torch.sigmoid(tst_out))
    accuracy_tst = (tst_guesses == tst_labels).float().mean().item()

In [None]:
# Show the test accuracy: the fraction of rounded sigmoid outputs that equal
# the test labels, as computed in the previous cell.
accuracy_tst