<a href="https://colab.research.google.com/github/shivammehta007/NLPResearch/blob/master/Tutorials/Natural%20Language%20Processing/PyTorch%20Sentimental%20Analysis/BiDirectional_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
from torchtext import data
from torchtext import datasets
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

In [0]:
# Workaround for HTTPS certificate failures (presumably hit by the dataset/vector downloads below).
# NOTE(review): this swaps the default HTTPS context factory for the unverified one, so
# certificate verification is disabled for the whole process. Remove once the SSL issue is fixed.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [0]:
# Seed every RNG this notebook draws from so a fresh "Restart & Run All" is repeatable.
SEED = 1234
# Python's `random` is seeded explicitly rather than relying on a later cell doing it as a side effect.
random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True


In [0]:
# TEXT: review text field tokenized with spaCy. include_lengths=True makes batch.text a
# (text, text_length) pair, which the train/evaluate loops unpack.
TEXT = data.Field(tokenize='spacy', include_lengths = True)
# LABEL: label field stored as float, matching the float targets handed to the loss function.
LABEL = data.LabelField(dtype=torch.float)

In [5]:
# Load the IMDB train/test splits using the fields above (the first run downloads aclImdb_v1.tar.gz).
trainset, testset = datasets.IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 34.9MB/s]


In [0]:
# Carve a validation set out of the training data.
# `random.seed()` returns None, so its return value must not be used as `random_state`;
# seed first, then pass the resulting generator state explicitly.
random.seed(SEED)
traindata, validdata = trainset.split(random_state=random.getstate())

In [7]:
# Upper bound on the number of tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 25000
# Build the vocabulary from the training split only and attach 100-d GloVe vectors.
# The vector table printed below has 25002 rows — presumably the 25000 tokens plus <unk> and <pad>.
# NOTE(review): unk_init presumably initialises vectors for tokens missing from GloVe — confirm.
TEXT.build_vocab(traindata, max_size=MAX_VOCAB_SIZE, vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)
LABEL.build_vocab(traindata)

.vector_cache/glove.6B.zip: 862MB [07:21, 1.95MB/s]                           
100%|█████████▉| 399617/400000 [00:21<00:00, 19179.90it/s]

In [8]:
# Show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [0]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
BATCH_SIZE = 64

# Build batch iterators for the train/validation/test splits; batches are created on `device`.
# sort_within_batch=True orders the examples inside each batch, which Model.forward relies on
# when it calls pack_padded_sequence without overriding enforce_sorted.

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (traindata, validdata, testset),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device)

In [0]:
class Model(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of values produced per example.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability used between LSTM layers and before the linear head.
        bidirectional: whether the LSTM runs in both directions.
        pad_idx: vocabulary index of the padding token (passed as ``padding_idx``).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, bidirectional, pad_idx):
        super(Model, self).__init__()
        # Remembered so forward() knows how many final hidden states to combine.
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout)
        # The head sees one final hidden state per direction, so its width depends on
        # `bidirectional` instead of being hard-coded to two directions.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Score a padded batch of token ids.

        Args:
            text: token ids laid out as (seq_len, batch), since the LSTM is not batch_first.
            text_lengths: unpadded length of each sequence, in the order expected by
                ``pack_padded_sequence`` (decreasing, as enforce_sorted is left at its default).

        Returns:
            Tensor of shape (batch, output_dim) holding the raw outputs of the linear head.
        """
        embedded = self.embedding(text)
        # pack_padded_sequence needs the lengths on the CPU even when the batch lives on the GPU.
        if isinstance(text_lengths, torch.Tensor):
            text_lengths = text_lengths.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        # Only the final hidden state is used; the per-step outputs are not unpacked.
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.bidirectional:
            # The last two entries are the top layer's forward and backward final states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]
        # No squeeze here: the batch axis must survive even when the batch has one example.
        return self.fc(self.dropout(hidden))






In [0]:
# Hyperparameters

# One embedding row per vocabulary entry.
INPUT_DIM = len(TEXT.vocab)
# Must match the width of the pretrained glove.6B.100d vectors copied into the embedding layer.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
# A single raw score per review, scored with BCEWithLogitsLoss.
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# Vocabulary index of the padding token; handed to nn.Embedding as padding_idx.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Positional arguments follow the order of Model.__init__.
model = Model(INPUT_DIM,
              EMBEDDING_DIM,
              HIDDEN_DIM,
              OUTPUT_DIM,
              N_LAYERS,
              DROPOUT,
              BIDIRECTIONAL,
              PAD_IDX)



In [0]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is false are not counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [14]:
# Number of trainable parameters in the model (the embedding table is included).
count_parameters(model)

4810857

In [15]:
# Vector table built alongside the vocabulary; its shape should be (len(TEXT.vocab), EMBEDDING_DIM)
# so it can be copied into the embedding layer.
pretrained_embeddings = TEXT.vocab.vectors
pretrained_embeddings.shape

torch.Size([25002, 100])

In [16]:
# Overwrite the randomly initialised embedding weights in place with the pretrained vectors.
# This also overwrites the padding row, which is re-zeroed in a later cell.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.3844, -0.7040, -0.9940,  ..., -0.1486,  2.2851,  0.9289],
        [ 1.2595, -0.0763, -0.1569,  ..., -1.9728,  0.4010,  0.6415],
        [ 0.3555, -1.3517, -0.0088,  ...,  0.5276, -0.5721, -0.8456]])

In [0]:
# Zero the embedding rows of the <unk> and <pad> tokens so both start without any signal:
# the pretrained table copied above holds non-zero values for them, and neither token
# should carry sentiment information by itself.
UNK_TOKEN_LOCATION = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_TOKEN_LOCATION] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [18]:
# Sanity check: the <unk> and <pad> rows should now be all zeros.
model.embedding.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.3844, -0.7040, -0.9940,  ..., -0.1486,  2.2851,  0.9289],
        [ 1.2595, -0.0763, -0.1569,  ..., -1.9728,  0.4010,  0.6415],
        [ 0.3555, -1.3517, -0.0088,  ...,  0.5276, -0.5721, -0.8456]])

In [0]:
# Adam over every model parameter, with the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [0]:
# Binary cross-entropy computed on raw logits: the model output is fed to the loss
# without an explicit sigmoid.
criterion = nn.BCEWithLogitsLoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    ``preds`` holds raw logits: they are passed through a sigmoid and rounded
    to 0/1 before being compared element-wise with ``y``.
    """
    predicted_labels = torch.sigmoid(preds).round()
    matches = predicted_labels.eq(y).float()
    return matches.sum() / len(matches)

In [0]:
# Training the Model 

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and report mean loss and accuracy.

    Args:
        model: module called as ``model(text, text_length)``; its output is squeezed
            on dim 1 before being scored.
        iterator: sized iterable of batches exposing ``batch.text`` (a
            ``(text, text_length)`` pair) and ``batch.label``.
        optimizer: optimizer whose gradients are cleared and stepped once per batch.
        criterion: loss callable invoked as ``criterion(prediction, batch.label)``.

    Returns:
        ``(mean_loss, mean_accuracy)``: the per-batch values summed and divided by
        ``len(iterator)``.

    Raises:
        ZeroDivisionError: if ``iterator`` has length zero.
    """
    epoch_loss = 0
    epoch_acc = 0
    # Put the model into train mode and enable dropout
    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        text, text_length = batch.text
        prediction = model(text, text_length).squeeze(1)
        loss = criterion(prediction, batch.label)
        acc = binary_accuracy(prediction, batch.label)

        loss.backward()
        optimizer.step()

        # Accumulate a detached copy: adding `loss` itself would chain every batch's
        # autograd history onto the running total and keep it alive for the whole epoch.
        epoch_loss += loss.detach()
        epoch_acc += acc

    num_batches = len(iterator)
    return epoch_loss / num_batches, epoch_acc / num_batches



In [0]:
# Evaluating Function 
def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without updating it.

    The model is switched to eval mode and gradients are disabled for the
    whole pass. Returns ``(mean_loss, mean_accuracy)``, i.e. the per-batch
    values summed and divided by ``len(iterator)``.
    """
    total_loss, total_acc = 0, 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)
            total_loss += batch_loss
            total_acc += batch_acc

    mean_loss = total_loss / len(iterator)
    mean_acc = total_acc / len(iterator)
    return mean_loss, mean_acc



In [0]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Both parts are truncated with ``int``; returns ``(minutes, seconds)``.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [25]:
# Training the Model 
 
N_EPOCHS = 5

# Lowest validation loss seen so far; updated after every epoch.
best_valid_loss = float("inf")

for i in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    # float() accepts both a 0-dim tensor and a plain number, so the tracker stays a Python float.
    best_valid_loss = min(best_valid_loss, float(valid_loss))

    print('Epoch: {:02} , Time Taken : {}'.format(i+1 , end_time - start_time))
    print('Train Loss  : {} , Train Accuracy : {}'.format(train_loss, train_acc))
    print('Validation Loss : {}, Validation Accuracy : {}'.format(valid_loss, valid_acc))




100%|█████████▉| 399617/400000 [00:40<00:00, 19179.90it/s]

Epoch: 01 , Time Taken : 97.33841633796692
Train Loss  : 0.5880817770957947 , Train Accuracy : 0.6749950647354126
Validation Loss : 0.49155768752098083, Validation Accuracy : 0.7676112055778503
Epoch: 02 , Time Taken : 97.12589526176453
Train Loss  : 0.5150703191757202 , Train Accuracy : 0.7543013691902161
Validation Loss : 0.697087824344635, Validation Accuracy : 0.6127736568450928
Epoch: 03 , Time Taken : 97.27387523651123
Train Loss  : 0.5928990244865417 , Train Accuracy : 0.689634382724762
Validation Loss : 0.6385604739189148, Validation Accuracy : 0.6214689612388611
Epoch: 04 , Time Taken : 97.33372020721436
Train Loss  : 0.4901302456855774 , Train Accuracy : 0.7570466995239258
Validation Loss : 0.3640497624874115, Validation Accuracy : 0.8429113626480103
Epoch: 05 , Time Taken : 97.30928921699524
Train Loss  : 0.3028857409954071 , Train Accuracy : 0.8762219548225403
Validation Loss : 0.3144793212413788, Validation Accuracy : 0.8704096078872681


In [27]:
# Evaluate the trained model on the held-out test split.

test_loss, test_acc = evaluate(model, test_iterator, criterion)
# Accuracy is multiplied by 100 so it prints as a percentage.
print('Test Loss : {} \nTest Accuracy : {}'.format(test_loss, test_acc*100))

Test Loss : 0.3329935371875763 
Test Accuracy : 86.45060729980469
