In [1]:
import torch 
import torch.nn as nn 
from torch.nn import functional as F 
from torch.autograd import Variable 
import os 
import sys 
from torchtext import data 
from torchtext.data import Field
from torchtext import datasets 
from torchtext.vocab import Vectors, GloVe 
from torch import optim as optim 
import nltk 

In [52]:
def load_data(path):
    TEXT = Field(sequential=True, tokenize=nltk.word_tokenize, init_token='<sos>', eos_token='<eos>', lower=True)
    LABEL = Field(sequential=False, use_vocab=False)
    train_data = data.TabularDataset(path=path, format='csv',fields=[('ID', None),('text', TEXT),('label', LABEL)], skip_header= True)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    #LABEL.build_vocab(train_data)
    word_embeddings = TEXT.vocab.vectors
    train_data, valid_data = train_data.split() 
    train_iterator, valid_iterator= data.BucketIterator.splits((train_data, valid_data), sort_key=lambda x: len(x.text),batch_size=100)
    vocab_size = len(TEXT.vocab)
    return TEXT, vocab_size, word_embeddings, train_iterator, valid_iterator, train_data, valid_data #(batch_size, sen_len, embedding_dim)

In [53]:
TEXT, vocab_size, word_embeddings, train_iterator, valid_iterator, trn, vld= load_data('./cleaned_dataset.csv')

In [55]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden.squeeze(0))

In [56]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
if torch.cuda.is_available():
    model.cuda()
optimizer = torch.optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.1988,  0.3391, -0.2995,  ...,  0.2258, -0.6136,  0.4946],
        [-0.3995, -0.1673,  0.0869,  ...,  0.3244, -0.5704,  0.6110],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [47]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        x = batch.label.float()
        loss = criterion(predictions, x)
        
        acc = binary_accuracy(predictions, x)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [50]:


def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            x = batch.label.float()
            loss = criterion(predictions, x)
            
            acc = binary_accuracy(predictions, x)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)



In [57]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')


| Epoch: 01 | Train Loss: 0.315 | Train Acc: 94.22% | Val. Loss: 0.141 | Val. Acc: 98.12% |
| Epoch: 02 | Train Loss: 0.091 | Train Acc: 98.65% | Val. Loss: 0.104 | Val. Acc: 98.12% |
| Epoch: 03 | Train Loss: 0.072 | Train Acc: 98.67% | Val. Loss: 0.093 | Val. Acc: 98.12% |
| Epoch: 04 | Train Loss: 0.075 | Train Acc: 98.59% | Val. Loss: 0.094 | Val. Acc: 98.12% |
| Epoch: 05 | Train Loss: 0.071 | Train Acc: 98.63% | Val. Loss: 0.095 | Val. Acc: 98.12% |
