In [1]:
import pandas as pd
from nltk import WhitespaceTokenizer
from nltk.corpus import stopwords, words, wordnet
from nltk.lm import Vocabulary
from collections import OrderedDict
import numpy as np
import torch
from torch import nn
import concurrent.futures
from torch.optim import Adam
from data_loading import process_text_df, NewsText, tensorize_sentences, collate_fn
from tqdm import tqdm as pbar
from models import NewsNet
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import tensor
import matplotlib.pyplot as plt
import tqdm
# Load the word lists once per kernel. Each name is rebound from the NLTK corpus
# reader to the loaded data, so the `from nltk.corpus import ...` line above must
# run again before this block is re-executed.
#
# Stop words are restricted to English: calling `.words()` with no fileid returns
# the lists of every language, which removes ordinary English tokens that happen
# to be stop words elsewhere (e.g. German "die" or "war"). A frozenset gives O(1)
# `in` checks instead of a linear scan per token.
stopwords = frozenset(stopwords.words('english'))
# NOTE(review): `words` and `wordnet` are not referenced anywhere else in the
# visible notebook; kept for compatibility.
words = words.words()
wordnet = wordnet.words()

In [2]:
import pandas as pd
from nltk import WhitespaceTokenizer
from nltk.corpus import stopwords, words, wordnet
from nltk.lm import Vocabulary
import numpy as np
import torch
from torch import nn
import concurrent.futures
from torch.utils.data import Dataset, DataLoader
from nltk.stem.snowball import EnglishStemmer
from torch import tensor
# Load the word lists once per kernel. Each name is rebound from the NLTK corpus
# reader to the loaded data, so the `from nltk.corpus import ...` line above must
# run again before this block is re-executed.
#
# Stop words are restricted to English (the pipeline below uses EnglishStemmer):
# calling `.words()` with no fileid returns the lists of every language, which
# removes ordinary English tokens that happen to be stop words elsewhere
# (e.g. German "die" or "war"). A frozenset gives O(1) `in` checks instead of a
# linear scan per token.
stopwords = frozenset(stopwords.words('english'))
# NOTE(review): `words` and `wordnet` are not referenced anywhere else in the
# visible notebook; kept for compatibility.
words = words.words()
wordnet = wordnet.words()

class process_text_df():
    """Clean, tokenize and integer-encode the text columns of a DataFrame.

    The instance works on a private copy of ``df``. Results are written back to
    ``self.df``; the word -> id mapping is stored in ``self.vocab`` once
    :meth:`build_vocab` (or :meth:`tokenize_sentences`) has run.
    """

    def __init__(self, df, text_cols):
        """Keep a copy of ``df``, the names of its text columns, and a stemmer.

        Args:
            df: DataFrame holding the raw text.
            text_cols: iterable of column names in ``df`` to process.
        """
        self.df = df.copy()
        self.text_cols = text_cols
        self.stemmer = EnglishStemmer()

    def word_only(self, l):
        """Strip every non-alphanumeric character from each word in ``l``.

        The result has the same length as ``l``; a word consisting only of
        punctuation becomes the empty string.
        """
        return [''.join(char for char in w if char.isalnum()) for w in l]

    def clean_text_col(self, text_col):
        """Turn a Series of raw strings into a Series of stemmed token lists.

        Per text: whitespace-tokenize, lowercase, drop stop words, strip
        non-alphanumeric characters, drop tokens left empty by that stripping,
        then stem.

        Args:
            text_col: pandas Series of strings.

        Returns:
            Series with the same index whose values are lists of stems.
        """
        # Hash-based lookup: the module-level `stopwords` may be a list, and
        # `in` on a list is a linear scan repeated for every token.
        stop = frozenset(stopwords)
        tokenizer = WhitespaceTokenizer()
        stem = self.stemmer.stem

        def clean(text):
            tokens = [word.lower() for word in tokenizer.tokenize(text)]
            tokens = [word for word in tokens if word not in stop]
            # Punctuation-only tokens (e.g. "-") collapse to '' and carry no
            # information, so they are removed rather than stemmed.
            tokens = [word for word in self.word_only(tokens) if word]
            return [stem(word) for word in tokens]

        return text_col.apply(clean)

    def chunk_arr(self, arr, n_partitions=8):
        """Split ``arr`` into at most ``n_partitions`` contiguous, non-empty slices.

        Every element of ``arr`` lands in exactly one slice, in order. Slice
        sizes differ by at most one. An empty ``arr`` yields an empty list.

        Args:
            arr: any sliceable sequence with ``len`` (list, Series, ...).
            n_partitions: maximum number of slices; must be >= 1.

        Raises:
            ValueError: if ``n_partitions`` is smaller than 1.
        """
        if n_partitions < 1:
            raise ValueError('n_partitions must be at least 1')
        base, extra = divmod(len(arr), n_partitions)
        chunks = []
        start = 0
        for i in range(n_partitions):
            # The first `extra` slices take one additional element so the
            # remainder is spread out instead of being truncated.
            stop = start + base + (1 if i < extra else 0)
            if stop > start:
                chunks.append(arr[start:stop])
            start = stop
        return chunks

    def clean_tokenize(self, text_col, n_workers=4):
        """Clean one column of ``self.df`` in parallel worker processes.

        Args:
            text_col: name of the column to clean.
            n_workers: number of worker processes and of chunks.

        Returns:
            Series of stemmed token lists covering every row of the column.
        """
        column = self.df[text_col]
        chunks = self.chunk_arr(column, n_workers)
        if not chunks:
            # Empty column: nothing to clean, and pd.concat([]) would raise.
            return column
        # NOTE: mapping a bound method pickles `self` (including self.df) for
        # the worker processes.
        with concurrent.futures.ProcessPoolExecutor(n_workers) as executor:
            results = list(executor.map(self.clean_text_col, chunks))
        return pd.concat(results)

    def process_text_col(self):
        """Replace every configured text column with its cleaned token lists."""
        for text_col in self.text_cols:
            self.df[text_col] = self.clean_tokenize(text_col)

    def build_vocab(self, unk_cutoff=100):
        """Build ``self.vocab`` (word -> integer id) from all text columns.

        Args:
            unk_cutoff: passed to ``nltk.lm.Vocabulary`` as its ``unk_cutoff``.
        """
        tokens = [w for col in self.text_cols for sent in self.df[col] for w in sent]
        entries = list(Vocabulary(tokens, unk_cutoff=unk_cutoff))
        size = len(entries)
        # Ids run from size-1 down to 0 in iteration order, so the last entry
        # produced by Vocabulary receives id 0.
        # NOTE(review): Vocabulary appears to yield its unknown label last, which
        # would give '<UNK>' id 0 -- the same value pad_sent pads with; confirm.
        self.vocab = {word: size - (i + 1) for i, word in enumerate(entries)}

    def tokenize_sentences(self):
        """Rebuild the vocabulary and replace each token by its integer id.

        Tokens missing from ``self.vocab`` are encoded with the id of
        ``'<UNK>'``. The text columns of ``self.df`` are overwritten, so this
        must be run only once on cleaned token lists.
        """
        self.build_vocab()
        vocab = self.vocab

        def encode(sent):
            return [vocab[word if word in vocab else '<UNK>'] for word in sent]

        for text_col in self.text_cols:
            self.df[text_col] = self.df[text_col].apply(encode)
        
def tensorize_sentences(text_series, labels):
    """Convert encoded texts and string labels into integer tensors.

    Args:
        text_series: iterable of sequences of integer token ids.
        labels: pandas Series of label strings.

    Returns:
        ``(sentences, labels)`` where ``sentences`` is a list of 1-D int64
        tensors (one per text) and ``labels`` is a 1-D int64 tensor holding 1
        where the label equals ``'true'`` and 0 for any other value.
    """
    # Explicit dtype: an empty token list would otherwise become a float32
    # tensor and disagree with the other sentences.
    sentences = [torch.tensor(text, dtype=torch.long) for text in text_series]
    # Compare on the underlying values so the Series index plays no role.
    is_true = (labels == 'true').to_numpy()
    labels = torch.tensor(is_true, dtype=torch.long)
    return sentences, labels

class NewsText(Dataset):
    """Dataset pairing each encoded news text with its label."""

    def __init__(self, news_text_list, labels):
        """Store the texts and their labels; both are indexed by position."""
        self.news_text_list, self.labels = news_text_list, labels

    def __len__(self):
        """Return the number of samples; texts and labels must be equally long."""
        n_labels = len(self.labels)
        assert(len(self.news_text_list) == n_labels)
        return n_labels

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at ``idx``."""
        return (self.news_text_list[idx], self.labels[idx])

def pad_sent(sents, max_seq_len, cap=100):
    """Stack variable-length id sequences into one zero-padded int64 matrix.

    Args:
        sents: sequence of 1-D tensors (or lists) of integer token ids.
        max_seq_len: requested row width, normally the longest sequence length.
        cap: upper bound on the row width; longer sequences are truncated.

    Returns:
        Tensor of shape ``(len(sents), min(cap, max_seq_len))`` with dtype
        int64. Short rows are right-padded with 0. An empty ``sents`` gives a
        tensor with zero rows.
    """
    width = max(0, min(cap, max_seq_len))
    # Allocate the padded result once and copy each row in with a slice,
    # instead of rebuilding every element in a Python loop.
    out = torch.zeros((len(sents), width), dtype=torch.long)
    for row, sent in enumerate(sents):
        kept = torch.as_tensor(sent, dtype=torch.long)[:width]
        out[row, :kept.shape[0]] = kept
    return out
    

def collate_fn(sample):
    """Collate ``(sentence, label)`` samples into a padded batch.

    Returns ``(sents, labels)``: the sentences padded by ``pad_sent`` to the
    longest sentence in the batch, and the labels gathered into one tensor.
    """
    targets = []
    texts = []
    for item in sample:
        targets.append(item[1])
        texts.append(item[0])
    labels = tensor(targets)
    # The longest sentence decides the requested padding width.
    longest = max(text.shape[0] for text in texts)
    return pad_sent(texts, longest), labels



In [3]:
# Fixed seed so the shuffle -- and therefore the positional train/validation/test
# split taken from `news` later on -- is the same after every kernel restart.
SEED = 42

# Tag each source file with its class before merging.
fake = pd.read_csv('Fake.csv').assign(label='fake')
true = pd.read_csv('True.csv').assign(label='true')

# Merge, shuffle all rows, and renumber them 0..n-1.
news = (
    pd.concat((fake, true))
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

In [4]:
# Wrap a copy of `news`; 'title' and 'text' are the columns to be cleaned and encoded.
p1 = process_text_df(news, ['title', 'text'])

In [5]:
%%time
# Clean, stem and tokenize each configured text column in turn, using worker processes.
p1.process_text_col()

KeyboardInterrupt: 

In [6]:
# Inspect the working frame. In the recorded output only `title` holds token lists;
# `text` is still raw because the preprocessing cell above was interrupted.
p1.df

Unnamed: 0,title,text,subject,date,label
0,"[mladic, verdict, carri, messag, syria, beyond...",GENEVA (Reuters) - The conviction of former Bo...,worldnews,"November 22, 2017",true
1,"[watch, chuck, schumer, fake, cri, muslim, pol...",Chuck Schumer s crying over Muslim refugees.Af...,politics,"Jan 29, 2017",fake
2,"[muslim, invas, updat, gang, member, germani, ...","Deep down, they just want to assimilate Eight ...",politics,"Oct 21, 2015",fake
3,"[german, court, rule, favor, third, gender, ca...",BERLIN (Reuters) - Germany s highest court rul...,worldnews,"November 8, 2017",true
4,"[russian, foreign, ministri, meet, visit, nort...",MOSCOW (Reuters) - Russia s foreign ministry p...,worldnews,"September 26, 2017",true
...,...,...,...,...,...
44893,"[you, donald, trump, worst, nightmar, john, ke...",U.S. Secretary of State John Kerry delivered a...,News,"May 8, 2016",fake
44894,"[germani, antitrump, open, border, angela, mer...",In what has to be considered an historic about...,left-news,"Nov 28, 2016",fake
44895,"[watch, new, documentari, explor, trump, russi...",This past week the world has been hit with man...,News,"May 16, 2017",fake
44896,"[turkey, say, us, isol, jerusalem, issu, threat]",ANKARA (Reuters) - Turkey said on Wednesday th...,worldnews,"December 20, 2017",true


In [None]:
# Build the vocabulary and replace every token in the text columns of p1.df with its
# integer id. This overwrites the columns, so run it once on freshly cleaned data.
p1.tokenize_sentences()

In [None]:
# Keep only the first 50 token ids of each article body, then tensorize texts and labels.
truncated_text = p1.df['text'].apply(lambda sent: sent[:50])
l, labs = tensorize_sentences(truncated_text, p1.df['label'])

In [None]:
# l, labs = tensorize_sentences(p1.df.text.apply(lambda sent: sent[:100]), p1.df.label)

In [None]:
net = NewsNet(p1.vocab, hidden_size=4, embedding_dim=8, num_layers=2)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = Adam(net.parameters(), lr=1e-4, weight_decay=5e-6)

# Positional split of the (already shuffled) data: everything before the last two
# quarters trains, the third quarter validates, the last quarter is held out for
# testing. The three parts are disjoint; in particular the validation split must
# not reach into the test rows.
idx = len(l) // 4
tr_end, val_end = len(l) - 2 * idx, len(l) - idx
l_tr, labs_tr = l[:tr_end], labs[:tr_end]
l_val, labs_val = l[tr_end:val_end], labs[tr_end:val_end]
l_tst, labs_tst = l[val_end:], labs[val_end:]

tr_set = NewsText(l_tr, labs_tr)
val_set = NewsText(l_val, labs_val)
# Collate the whole validation split into one padded batch that is scored once per epoch.
val_features, val_labels = next(iter(
    DataLoader(val_set, batch_size=len(val_set), collate_fn=collate_fn)
))
# BCEWithLogitsLoss needs float targets; the extra trailing axis is assumed to
# match the model output -- TODO confirm against NewsNet.
val_labels = val_labels.unsqueeze(-1).float()
loader = DataLoader(tr_set, batch_size=4, collate_fn=collate_fn)

loss_list = []       # mean training loss per epoch
val_loss_list = []   # validation loss per epoch
accuracy_list = []   # validation accuracy per epoch
epochs = 50
for i in range(epochs):
    print(f'Epoch {i + 1}')

    # --- training pass ---
    net.train()
    epoch_loss, n_batches = 0.0, 0
    for sents, labels in pbar(loader):
        labels = labels.float().unsqueeze(-1)
        out = net(sents)
        loss = loss_fn(out, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        n_batches += 1
    # Average over the epoch: the loss of the final 4-sample batch alone is too
    # noisy to compare against the full-split validation loss.
    loss_list.append(epoch_loss / max(n_batches, 1))

    # --- validation pass ---
    net.eval()
    with torch.no_grad():
        val_out = net(val_features)
        val_loss = loss_fn(val_out, val_labels)
        val_loss_list.append(val_loss.item())
        # Logits -> probabilities -> hard 0/1 predictions.
        val_guesses = torch.round(torch.sigmoid(val_out))
        accuracy = (val_guesses == val_labels).float().mean().item()
        accuracy_list.append(accuracy)

In [None]:
# Training vs. validation loss per epoch for `net`.
fig, ax = plt.subplots()
pd.Series(loss_list).plot(ax=ax, label='Tr')
pd.Series(val_loss_list).plot(ax=ax, label='Val')
ax.set(xlabel='Epoch', ylabel='BCE-with-logits loss', title='net: training vs. validation loss')
fig.set_size_inches(fig.get_size_inches() * 1.5)
ax.legend();

In [None]:
# Validation accuracy per epoch for `net`.
ax = pd.Series(accuracy_list).plot()
ax.set(xlabel='Epoch', ylabel='Validation accuracy', title='net: validation accuracy');

In [None]:
# Collate the whole held-out test split into a single padded batch.
tst_set = NewsText(l_tst, labs_tst)
tst_features, tst_labels = next(iter(
    DataLoader(tst_set, batch_size=len(tst_set), collate_fn=collate_fn)
))
# Float targets with a trailing axis, as used for the loss during training.
tst_labels = tst_labels.unsqueeze(-1).float()

In [None]:
# Score `net` on the held-out test batch without tracking gradients.
with torch.no_grad():
    tst_out = net(tst_features)
    tst_loss = loss_fn(tst_out, tst_labels)
    # Logits -> probabilities -> hard 0/1 predictions.
    tst_guesses = torch.sigmoid(tst_out).round()
    tst_hits = tst_guesses.eq(tst_labels).float()
    accuracy_tst = tst_hits.mean().item()

In [None]:
# Fraction of test-batch predictions from `net` that match the labels.
accuracy_tst

In [None]:
# Second model: same architecture, initialised with the embeddings learned by `net`.
# NOTE(review): this hands over the attribute object itself, so `net` and `net2`
# presumably share the same embedding weights unless NewsNet copies them -- confirm.
pretrained_embeddings = net.word_embeddings
net2 = NewsNet(p1.vocab, hidden_size=4, embedding_dim=8, num_layers=2, pretrained_embeddings=pretrained_embeddings)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = Adam(net2.parameters(), lr=1e-4, weight_decay=5e-6)

# Positional split of the (already shuffled) data: everything before the last two
# quarters trains, the third quarter validates, the last quarter is held out for
# testing. The three parts are disjoint; in particular the validation split must
# not reach into the test rows.
idx = len(l) // 4
tr_end, val_end = len(l) - 2 * idx, len(l) - idx
l_tr, labs_tr = l[:tr_end], labs[:tr_end]
l_val, labs_val = l[tr_end:val_end], labs[tr_end:val_end]
l_tst, labs_tst = l[val_end:], labs[val_end:]

tr_set = NewsText(l_tr, labs_tr)
val_set = NewsText(l_val, labs_val)
# Collate the whole validation split into one padded batch that is scored once per epoch.
val_features, val_labels = next(iter(
    DataLoader(val_set, batch_size=len(val_set), collate_fn=collate_fn)
))
# BCEWithLogitsLoss needs float targets; the extra trailing axis is assumed to
# match the model output -- TODO confirm against NewsNet.
val_labels = val_labels.unsqueeze(-1).float()
loader = DataLoader(tr_set, batch_size=4, collate_fn=collate_fn)

loss_list = []       # mean training loss per epoch
val_loss_list = []   # validation loss per epoch
accuracy_list = []   # validation accuracy per epoch
epochs = 40
for i in range(epochs):
    print(f'Epoch {i + 1}')

    # --- training pass ---
    net2.train()
    epoch_loss, n_batches = 0.0, 0
    for sents, labels in pbar(loader):
        labels = labels.float().unsqueeze(-1)
        out = net2(sents)
        loss = loss_fn(out, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        n_batches += 1
    # Average over the epoch: the loss of the final 4-sample batch alone is too
    # noisy to compare against the full-split validation loss.
    loss_list.append(epoch_loss / max(n_batches, 1))

    # --- validation pass ---
    net2.eval()
    with torch.no_grad():
        val_out = net2(val_features)
        val_loss = loss_fn(val_out, val_labels)
        val_loss_list.append(val_loss.item())
        # Logits -> probabilities -> hard 0/1 predictions.
        val_guesses = torch.round(torch.sigmoid(val_out))
        accuracy = (val_guesses == val_labels).float().mean().item()
        accuracy_list.append(accuracy)

In [None]:
# Training vs. validation loss per epoch for `net2`.
fig, ax = plt.subplots()
pd.Series(loss_list).plot(ax=ax, label='Tr')
pd.Series(val_loss_list).plot(ax=ax, label='Val')
ax.set(xlabel='Epoch', ylabel='BCE-with-logits loss', title='net2: training vs. validation loss')
fig.set_size_inches(fig.get_size_inches() * 1.5)
ax.legend();

In [None]:
# Validation accuracy per epoch for `net2`.
ax = pd.Series(accuracy_list).plot()
ax.set(xlabel='Epoch', ylabel='Validation accuracy', title='net2: validation accuracy');

In [None]:
# Per-epoch validation losses recorded while training `net2`.
val_loss_list

In [None]:
# Collate the whole held-out test split into a single padded batch.
tst_set = NewsText(l_tst, labs_tst)
tst_features, tst_labels = next(iter(
    DataLoader(tst_set, batch_size=len(tst_set), collate_fn=collate_fn)
))
# Float targets with a trailing axis, as used for the loss during training.
tst_labels = tst_labels.unsqueeze(-1).float()

In [None]:
# Score `net2` on the held-out test batch without tracking gradients.
with torch.no_grad():
    tst_out = net2(tst_features)
    tst_loss = loss_fn(tst_out, tst_labels)
    # Logits -> probabilities -> hard 0/1 predictions.
    tst_guesses = torch.sigmoid(tst_out).round()
    tst_hits = tst_guesses.eq(tst_labels).float()
    accuracy_tst = tst_hits.mean().item()

In [None]:
# Fraction of test-batch predictions from `net2` that match the labels.
accuracy_tst