In [None]:
# Mount Google Drive into the Colab VM; the training checkpoint is later
# read from and written to a folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from IPython.display import clear_output

!pip install kaggle
%env KAGGLE_USERNAME=xerefic
%env KAGGLE_KEY=83aac7088c3bb8150fcf8197ab22c67b

!kaggle competitions download -c fake-news
!unzip /content/train.csv.zip
!unzip /content/test.csv.zip
!rm *.zip

clear_output()

In [None]:
!wget https://nlp.stanford.edu/data/glove.840B.300d.zip
!mkdir embeddings 
!mkdir embeddings/glove.840B.300d
!unzip /content/glove.840B.300d.zip -d "/content/embeddings/glove.840B.300d"

clear_output()

---

In [None]:
import torch

In [None]:
# Root working directory of the Colab VM.
PATH = '/content/'

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
import pandas as pd
import os

In [None]:
# Load the raw Kaggle training file and keep only what is left after
# discarding the id, title and author columns.
data = (
    pd.read_csv("/content/train.csv")
      .drop(columns=["id", "title", "author"])
)
print(len(data))
data.head()

20800


Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Separate the rows by label so the hold-out set can be drawn per class.
o_class = data[data["label"] == 0]
l_class = data[data["label"] == 1]
print(len(o_class), len(l_class), sep="\n")

10387
10413


In [None]:
# The first 1000 rows of each class form the validation set; the remaining
# rows of each class are used for training (rows are taken in file order).
valid_o, train_o = o_class.iloc[:1000], o_class.iloc[1000:]
valid_l, train_l = l_class.iloc[:1000], l_class.iloc[1000:]

In [None]:
# Stack the per-class pieces row-wise: label-0 rows first, then label-1 rows.
train = pd.concat([train_o, train_l])
valid = pd.concat([valid_o, valid_l])

print(train.shape)
print(valid.shape)

(18800, 2)
(2000, 2)


In [None]:
# Create the output folder with the same absolute path the CSVs are written
# to (a relative `mkdir inputs` only matches when the cwd is /content) and
# tolerate an already existing folder so the cell can be re-run.
os.makedirs("/content/inputs", exist_ok=True)

# Persist both splits without the pandas index column.
train.to_csv("/content/inputs/train.csv", index=False)
valid.to_csv("/content/inputs/valid.csv", index=False)

In [None]:
# The splits are on disk now; release every intermediate DataFrame.
del data, o_class, l_class
del train_o, train_l, valid_o, valid_l
del train, valid

---

In [None]:
import torch

import pandas as pd
import numpy as np
!pip install pyprind
import pyprind
import random
import os
import gc
import torch
import torch.nn as nn

import torch.nn as nn
import torch.nn.functional as F


import os
import spacy
import torchtext
import matplotlib.pyplot as plt 
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
%matplotlib inline

Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [None]:
class CreateDataset(torch.utils.data.Dataset):
    """Load one CSV split from ``<PATH>/inputs/`` and wrap it in a torchtext iterator.

    The vocabulary (and the pretrained vectors attached to it) is always
    derived from the *train* split. A vocabulary built from another split
    would map the same token/label to different indices, which makes that
    split unusable with a model and embedding matrix built from the training
    vocabulary.

    Args:
        PATH: root folder containing ``inputs/<mode>.csv`` and the GloVe file
            under ``embeddings/glove.840B.300d/``.
        batch_size: number of examples per batch produced by the iterator.
        mode: split name without extension (``'train'``, ``'valid'``, ...).
        vocab_source: optional, already constructed ``CreateDataset`` whose
            ``TEXT``/``LABEL`` fields (and vocabularies) are reused as-is.
            When omitted, fresh fields are created and their vocabulary is
            built from ``inputs/train.csv``.
    """

    def __init__(self, PATH, batch_size=32, mode='train', vocab_source=None):
        self.PATH = PATH
        self.mode = mode + ".csv"
        self.batch_size = batch_size
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.spacy = spacy.load("en_core_web_sm")

        if vocab_source is not None:
            # Share the fields so both datasets numericalize identically.
            self.TEXT = vocab_source.TEXT
            self.LABEL = vocab_source.LABEL
        else:
            self.TEXT = torchtext.legacy.data.Field(sequential=True, tokenize="spacy")
            self.LABEL = torchtext.legacy.data.LabelField(dtype=torch.long, sequential=False)

        self.initData()
        if vocab_source is None:
            self.initEmbed()

        self.makeData()

    def _loadSplit(self, filename):
        """Read ``<PATH>/inputs/<filename>`` as a two-column (Text, Label) dataset."""
        DATA = os.path.join(self.PATH, 'inputs/')

        return torchtext.legacy.data.TabularDataset(
                        path=os.path.join(DATA, filename),
                        format="csv",
                        skip_header=True,
                        fields=[('Text', self.TEXT), ('Label', self.LABEL)])

    def initData(self):
        """Load the split selected by ``mode`` into ``self.data``."""
        self.data = self._loadSplit(self.mode)

    def initEmbed(self):
        """Build the TEXT and LABEL vocabularies from the train split.

        For any split other than train, the train CSV is loaded once more
        here solely to build the vocabulary.
        """
        EMBED = os.path.join(self.PATH, "embeddings/glove.840B.300d/glove.840B.300d.txt")

        if self.mode == "train.csv":
            vocab_data = self.data
        else:
            vocab_data = self._loadSplit("train.csv")

        self.TEXT.build_vocab(vocab_data,
                         vectors=torchtext.vocab.Vectors(EMBED),
                         max_size=25000,
                         min_freq=10)
        self.LABEL.build_vocab(vocab_data)

    def makeData(self):
        """Create the batch iterator over ``self.data`` on ``self.device``."""
        self.iterator = torchtext.legacy.data.Iterator(
                        self.data,
                        sort_key=lambda x: len(x.Text),
                        batch_size=self.batch_size,
                        device=self.device)

    def lengthData(self):
        """Return the number of examples in the loaded split."""
        return len(self.data)

    def lengthVocab(self):
        """Return ``(len(TEXT.vocab), len(LABEL.vocab))``."""
        return len(self.TEXT.vocab), len(self.LABEL.vocab)

    def freqLABEL(self):
        """Return the label frequency counter of the data the vocabulary was built from."""
        return self.LABEL.vocab.freqs

    def getData(self):
        """Return the batch iterator."""
        return self.iterator

    def getEmbeddings(self):
        """Return the pretrained vectors attached to the TEXT vocabulary."""
        return self.TEXT.vocab.vectors

In [None]:
# One dataset wrapper per split, both read from /content/inputs/.
train_data = CreateDataset(PATH="/content/", mode='train', batch_size=16)
valid_data = CreateDataset(PATH="/content/", mode='valid', batch_size=16)

100%|█████████▉| 2195783/2196017 [03:50<00:00, 10347.64it/s]

In [None]:
# Batch iterators consumed by the train / evaluate functions.
trainloader, valloader = train_data.getData(), valid_data.getData()

---

In [None]:
class LSTM(torch.nn.Module):
    """Bidirectional LSTM classifier that emits one logit per example.

    Args:
        input_dim: number of rows of the embedding table (vocabulary size).
        embedding_dim: width of each embedding vector.
        num_layers: number of stacked LSTM layers.
        hidden_dim: hidden size of each LSTM direction.
        static: when True the embedding weights are frozen.
        dropout: dropout probability applied to the pooled features and,
            for stacked LSTMs, between the recurrent layers.
    """

    def __init__(self, input_dim, embedding_dim, num_layers, hidden_dim, static=False, dropout=0.2):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim

        self.dropout = torch.nn.Dropout(p=dropout)

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        if static:
            self.embedding.weight.requires_grad = False

        # nn.LSTM only applies dropout *between* stacked layers; with a single
        # layer a non-zero value does nothing except raise a UserWarning.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim,
                                         num_layers=num_layers,
                                         bidirectional=True,
                                         dropout=inter_layer_dropout,
                                         batch_first=True)
        # The classifier sees the final cell state of every layer and direction.
        self.linear = torch.nn.Linear(hidden_dim*num_layers*2, 1)

    def forward(self, text):
        """Return logits of shape ``(batch, 1)`` for ``text`` of shape ``(seq_len, batch)``.

        NOTE(review): padded positions are fed through the LSTM like real
        tokens (no packing) - confirm this is intended.
        """
        embedded = self.embedding(text)
        # The LSTM is batch_first, so swap the leading (seq_len, batch) axes.
        embedded = torch.transpose(embedded, dim0=1, dim1=0)
        _, (_, cell) = self.lstm(embedded)
        # cell: (num_layers * 2, batch, hidden) -> (batch, num_layers * 2 * hidden)
        features = torch.cat(cell.unbind(dim=0), dim=1)
        return self.linear(self.dropout(features))

---

In [None]:
# Sizes that are fixed by hand.
batch_size = 16
num_layers = 1
output_dim = 2
embedding_dim = 300
hidden_dim = 384 // 2

# Sizes and weights that come from the training vocabulary.
input_dim = train_data.lengthVocab()[0]
pretrained_embeddings = train_data.getEmbeddings()

In [None]:
# Pass the sizes by keyword: LSTM's signature is
# (input_dim, embedding_dim, num_layers, hidden_dim), so the former positional
# call built a network with `hidden_dim` layers of width `num_layers`.
# NOTE(review): a checkpoint saved from that swapped configuration has
# different parameter shapes and will not load into this model.
model = LSTM(input_dim=input_dim,
             embedding_dim=embedding_dim,
             num_layers=num_layers,
             hidden_dim=hidden_dim)

# Start the embedding table from the pretrained vectors of the training vocabulary.
model.embedding.weight.data = pretrained_embeddings.to(device)

# NOTE(review): class_weights is not consumed by any loss in the visible cells.
class_weights = torch.tensor([1.0, 15.0]).to(device)
model = model.to(device)

In [None]:
# Binary cross-entropy on raw logits (the model's last layer has no sigmoid).
criterion = nn.BCEWithLogitsLoss().to(device)

# Plain SGD over every model parameter.
optimizer = optim.SGD(model.parameters(), lr=1e-4)

In [None]:
# start_epochs: last epoch already completed (overwritten when a checkpoint is restored).
# total_epochs: number of additional epochs to run in this session.
start_epochs, total_epochs = 0, 100

In [None]:
CHECKPOINT = "/content/drive/MyDrive/Projects/Hackathons/FakeNews-Team_Hackers/checkpoints/LSTM"

# Make sure the folder exists so saving a checkpoint later cannot fail on a
# missing directory.
os.makedirs(CHECKPOINT, exist_ok=True)

checkpoint_file = os.path.join(CHECKPOINT, "model.pth")
if os.path.exists(checkpoint_file):
    # map_location lets a checkpoint written on a GPU runtime be restored on
    # a CPU-only runtime (and vice versa).
    checkpoints = torch.load(checkpoint_file, map_location=device)

    model.load_state_dict(checkpoints['model_state_dict'])
    optimizer.load_state_dict(checkpoints['optimizer_state_dict'])
    # Resume counting from the last epoch that was saved.
    start_epochs = checkpoints['epoch']

In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of rounded sigmoid(preds) that equal ``y``.

    ``preds`` are raw logits; the result is a 0-dim tensor obtained by
    dividing the number of matches by the length of the first dimension.
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = hard_preds.eq(y).float()
    return hits.sum() / float(len(hits))

In [None]:
# Per-epoch history, appended to by train() and evaluate().
epoch_train_losses, accu_train_epoch = [], []
epoch_val_losses, accu_val_epoch = [], []

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    The mean batch loss and mean batch accuracy are appended to the global
    ``epoch_train_losses`` / ``accu_train_epoch`` lists and also returned.

    Returns:
        (mean_loss, mean_accuracy) for this epoch, both plain Python floats.
    """
    train_loss_batch = []
    accu_train_batch = []
    model.train()

    gc.collect()
    torch.cuda.empty_cache()

    bar = pyprind.ProgBar(len(iterator), bar_char='█')
    for batch in iterator:
        optimizer.zero_grad()

        # Call the module itself (not .forward) so registered hooks run.
        predictions = model(batch.Text).view(-1)
        # The loss needs float targets with the dtype/device of the logits.
        targets = batch.Label.type_as(predictions)
        train_loss = criterion(predictions, targets)
        acc = binary_accuracy(predictions, targets)

        train_loss.backward()
        optimizer.step()

        # Store Python floats so the history holds no device tensors and can
        # be used directly for plotting.
        train_loss_batch.append(train_loss.item())
        accu_train_batch.append(acc.item())
        bar.update()
        # NOTE(review): collecting and emptying the CUDA cache after every
        # batch is slow; kept as in the original - confirm it is needed.
        gc.collect()
        torch.cuda.empty_cache()

    epoch_train_losses.append(sum(train_loss_batch)/len(iterator))
    accu_train_epoch.append(sum(accu_train_batch)/len(iterator))

    return epoch_train_losses[-1], accu_train_epoch[-1]

In [None]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating its parameters.

    The mean batch loss and mean batch accuracy are appended to the global
    ``epoch_val_losses`` / ``accu_val_epoch`` lists and also returned.

    Returns:
        (mean_loss, mean_accuracy) for this pass, both plain Python floats.
    """
    val_loss_batch = []
    accu_val_batch = []
    model.eval()

    gc.collect()
    torch.cuda.empty_cache()

    with torch.no_grad():
        bar = pyprind.ProgBar(len(iterator), bar_char='█')
        for batch in iterator:

            # Call the module itself (not .forward) so registered hooks run.
            predictions = model(batch.Text).view(-1)
            # The loss needs float targets with the dtype/device of the logits.
            targets = batch.Label.type_as(predictions)
            val_loss = criterion(predictions, targets)

            acc = binary_accuracy(predictions, targets)

            # Store Python floats so the history holds no device tensors.
            val_loss_batch.append(val_loss.item())
            accu_val_batch.append(acc.item())
            bar.update()
            # NOTE(review): per-batch collection is slow; kept as in the
            # original - confirm it is needed.
            gc.collect()
            torch.cuda.empty_cache()

        epoch_val_losses.append(sum(val_loss_batch)/len(iterator))
        accu_val_epoch.append(sum(accu_val_batch)/len(iterator))
    return epoch_val_losses[-1], accu_val_epoch[-1]

In [None]:
# Index of the final epoch of this session; also the denominator shown in the
# progress line (the original printed one more than the last epoch reached).
last_epoch = start_epochs + total_epochs

# torch.save does not create missing folders.
os.makedirs(CHECKPOINT, exist_ok=True)

for epoch in range(start_epochs+1, last_epoch+1):

    train_loss, train_acc = train(model, trainloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valloader, criterion)

    # Overwrite the single checkpoint file after every epoch so training can
    # be resumed from the latest state.
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_train_losses[-1],
            }, os.path.join(CHECKPOINT, "model.pth"))

    print(f'| Epoch: [{epoch:02}/{last_epoch}] | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')