<a href="https://colab.research.google.com/github/sanjeevr5/NLP_Excercises/blob/main/DL_NLP_With_Torch_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Binary classification problem using RNNs

This exercise is performed on IMDB movie reviews dataset. There are two sentiments here 0. Negative 1. Positive. 

<b> Architecture details </b>

1. We shall train our embedding representations.
2. Using an RNN architecture with pad_sequences, pack_pad_sequences and pad_packed_sequences.
3. Dataloaders to have generator kind of data feed to the model.
4. The sequences should not be too long. There should be a clipping factor and hence choose the vocabulary wisely.LSTMs cannot handle very long input sequence.
5. Passing outputs of output layer or the hidden state to the dense layer.

## Downloading the data

In [None]:
%%capture
#!pip install bpemb
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvzf aclImdb_v1.tar.gz

## Importing the essentials

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_sequence, pad_sequence, pad_packed_sequence


device = 'cuda' if torch.cuda.is_available() else 'cpu'
SEED= 20
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

## Tokenizer and pre-processing

A minimal pre-processing to retain only characters.

In [None]:
from torchtext.vocab import build_vocab_from_iterator
import re

tokenizer = lambda x : [token for token in re.sub(r'[^a-z\s]', '', x.lower()).split() if token !=' ']

In [None]:
train_list = [[(tokenizer(open(f'/content/aclImdb/train/{cat}/{file}', 'r').read()), 1 if cat == 'pos' else 0) for file in os.listdir(f'/content/aclImdb/train/{cat}')] for cat in ('pos', 'neg')]
test_list = [[(tokenizer(open(f'/content/aclImdb/test/{cat}/{file}', 'r').read()) ,1 if cat == 'pos' else 0) for file in os.listdir(f'/content/aclImdb/test/{cat}')] for cat in ('pos', 'neg')]

train_list = [*train_list[0], *train_list[1]]
test_list = [*test_list[0], *test_list[1]]

In [None]:
print('Sample train data', train_list[0]) #review tokenized, label
print('Sample test data', test_list[0]) #review tokenized, label

Sample train data (['this', 'is', 'the', 'best', 'movie', 'ive', 'come', 'across', 'in', 'a', 'long', 'while', 'not', 'only', 'is', 'this', 'the', 'best', 'movie', 'of', 'its', 'kindschool', 'shootingthe', 'way', 'ben', 'cocciothe', 'director', 'decided', 'to', 'film', 'it', 'was', 'magnificent', 'he', 'filmed', 'it', 'using', 'teenage', 'actors', 'who', 'were', 'still', 'attending', 'high', 'school', 'he', 'filmed', 'it', 'in', 'the', 'actors', 'own', 'rooms', 'and', 'used', 'the', 'actors', 'real', 'parents', 'as', 'their', 'parents', 'in', 'the', 'film', 'also', 'the', 'actors', 'were', 'filming', 'too', 'using', 'camcorders', 'making', 'it', 'seem', 'much', 'more', 'like', 'a', 'video', 'diary', 'it', 'is', 'almost', 'artfulif', 'that', 'is', 'indeed', 'a', 'wordthere', 'are', 'a', 'few', 'slip', 'ups', 'however', 'for', 'example', 'when', 'cal', 'calls', 'brads', 'land', 'rover', 'a', 'range', 'roveror', 'vice', 'versa', 'its', 'been', 'awhile', 'since', 'ive', 'seen', 'it'], 1)
S

## Build vocabulary

In [None]:
from tqdm import tqdm_notebook

def vocabGenerator(lst):
  for data, _ in tqdm_notebook(lst):
    yield data

vocab = build_vocab_from_iterator(vocabGenerator(train_list), min_freq = 5, specials = ['<unk>', '<pad>'], max_tokens = 20002, special_first = False)
vocab.set_default_index(vocab['<unk>']) #unknown tokens will have this index
print(f'The length of the vocabulary is', len(vocab)) #max tokens 20k + 2 spl tokens

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/25000 [00:00<?, ?it/s]

The length of the vocabulary is 20002


## Data loader with padding

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence, pad_packed_sequence

def rnnInputsCollate(batch, clip = 256, pad_value = vocab['<pad>']):
  batch = sorted(batch, key = lambda x : len(x[0]), reverse = True)
  review = [torch.tensor(vocab(review))[:clip] for review, _ in batch]
  lens = torch.LongTensor([len(r) for r in review])
  review_padded = pad_sequence(review, padding_value = pad_value)
  label = torch.LongTensor([label for _, label in batch])
  return review_padded, lens, label

trainDataloader = DataLoader(train_list, batch_size = 32, shuffle = True, collate_fn = rnnInputsCollate)
testDataloader = DataLoader(test_list, batch_size = 128, shuffle = True, collate_fn = rnnInputsCollate)

## A general RNN class to accomodate different configurations

n_classes is 1 for binary problems

In [None]:
class IMDBClassifier(nn.Module):
  def __init__(self, vocab_weights, rnn_type, vocab_size, n_classes, embed_size, rnn_units, 
               n_layers, bi_dir, rnn_drop, drop_r, padding_index, use_output = True):
    super().__init__()
    self.rnn_units = rnn_units
    self.n_classes = n_classes
    self.rnn_type = rnn_type
    self.bi_dir = bi_dir
    self.n_layers = n_layers
    self.use_output = use_output
    if vocab_weights:
      self.embedding = nn.Embedding.from_pretrained(torch.as_tensor(vocab_weights))
    else:
      self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx = padding_index)
    if rnn_type == 'LSTM':
      self.rnn = nn.LSTM(embed_size, rnn_units, num_layers = n_layers, bidirectional = bi_dir, dropout = rnn_drop)
    elif rnn_type == 'GRU':
      self.rnn = nn.GRU(embed_size, rnn_units, num_layers = n_layers, bidirectional = bi_dir, dropout = rnn_drop)
    else:
      raise NotImplementError('Only LSTM and GRU supported!')
    self.fc = nn.Linear(2 * rnn_units if bi_dir else rnn_units, self.n_classes)
    self.drp = nn.Dropout(drop_r)
  
  def forward(self, data, lens):
    #-------------#
    #The data will have the following dimensions (time_sequences, batch_size, embed_dim)
    x_embed = self.embedding(data)
    #-------------#

    #-------------#
    #packing sequences and passing to RNN unit to save computations
    x_packed = pack_padded_sequence(x_embed, lens.cpu(), enforce_sorted = False)
    #-------------#

    #-------------#
    if self.rnn_type == 'LSTM':
      #output is packed and cannot be fed to linear layers
      output_packed, (hidden,cell) = self.rnn(x_packed) 
    else:
      #For GRU there is only hidden state
      output_packed, hidden = self.rnn(x_packed) 
    #-------------#

    #-------------#
    #output is padded to be fed to linear layer (padded_lens, batch size, hidden_units)
    output_padded, _ = pad_packed_sequence(output_packed)
    #-------------#

    #-------------#

    if self.use_output:
        drp = self.drp(output_padded[-1])
    else:
      drp = self.drp(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1) if self.bi_dir else hidden[-1])
    
    return self.fc(drp)

In [None]:
VOCAB_WEIGHTS = None
RNN_TYPE = 'LSTM'
VOCAB_SIZE = len(vocab) 
N_CLASSES = 1
EMBED_SIZE = 128
RNN_UNITS = 256
N_LAYERS = [1, 2]
BI_DIR = [True, False]
RNN_DROP = 0.0
DROP_RATE = 0.3
PADDING_INDEX = vocab['<pad>']

In [None]:
singleBiDir = IMDBClassifier(VOCAB_WEIGHTS, RNN_TYPE, VOCAB_SIZE, N_CLASSES, EMBED_SIZE, RNN_UNITS, N_LAYERS[0], BI_DIR[1], RNN_DROP, DROP_RATE, PADDING_INDEX)
print(f'The total number of trainable parameters for single layered bi-directional LSTM are : {sum(p.numel() for p in singleBiDir.parameters() if p.requires_grad):,}')

The total number of trainable parameters for single layered bi-directional LSTM are : 2,955,777


In [None]:
twoBiDir = IMDBClassifier(VOCAB_WEIGHTS, RNN_TYPE, VOCAB_SIZE, N_CLASSES, EMBED_SIZE, RNN_UNITS, N_LAYERS[1], BI_DIR[1], DROP_RATE, DROP_RATE, PADDING_INDEX)
print(f'The total number of trainable parameters for two layered bi-directional LSTM are : {sum(p.numel() for p in twoBiDir.parameters() if p.requires_grad):,}')

The total number of trainable parameters for two layered bi-directional LSTM are : 3,482,113


In [None]:
optimizer = optim.Adam(singleBiDir.parameters())
criterion = nn.BCEWithLogitsLoss()
singleBiDir = singleBiDir.to(device)
criterion = criterion.to(device)

In [None]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def binary_accuracy(preds, true):
  return (((preds >= 0.5) == (true == True)).sum().float())/len(true)

def train(model, iterator = trainDataloader, loss_fn = criterion, optimizer = None):
  e_loss = e_acc = i = 0
  model.train()
  for inputs, leng, labels in iterator:
    inputs, leng, labels = inputs.to(device), leng.to(device), labels.to(device)
    optimizer.zero_grad()
    preds = model(inputs, leng).squeeze(1)
    loss = loss_fn(preds
                   , labels.float())
    acc = binary_accuracy(preds, labels)
    loss.backward()
    optimizer.step()
    e_loss += loss.item()
    e_acc += acc.item()
    i += 1
  return e_loss/i, e_acc/i

def predict(model, iterator = testDataloader, loss_fn = criterion):
  e_loss = e_acc = i = 0
  model.eval()
  with torch.no_grad():
    for inputs, leng, labels in iterator:
      inputs, leng, labels = inputs.to(device), leng.to(device), labels.to(device)
      preds = model(inputs, leng).squeeze(1)
      loss = loss_fn(preds, labels.float())
      acc = binary_accuracy(preds, labels)
      e_loss += loss.item()
      e_acc += acc.item()
      i += 1
  return e_loss/i, e_acc/i

In [None]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(singleBiDir, optimizer = optimizer)
    valid_loss, valid_acc = predict(singleBiDir)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} / {N_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 / 10 | Epoch Time: 0m 20s
	Train Loss: 0.693 | Train Acc: 50.19%
	 Val. Loss: 0.693 |  Val. Acc: 50.29%
Epoch: 02 / 10 | Epoch Time: 0m 19s
	Train Loss: 0.681 | Train Acc: 52.68%
	 Val. Loss: 0.691 |  Val. Acc: 51.47%
Epoch: 03 / 10 | Epoch Time: 0m 19s
	Train Loss: 0.641 | Train Acc: 57.04%
	 Val. Loss: 0.698 |  Val. Acc: 53.11%
Epoch: 04 / 10 | Epoch Time: 0m 19s
	Train Loss: 0.581 | Train Acc: 61.01%
	 Val. Loss: 0.749 |  Val. Acc: 52.96%
Epoch: 05 / 10 | Epoch Time: 0m 19s
	Train Loss: 0.526 | Train Acc: 63.63%
	 Val. Loss: 0.810 |  Val. Acc: 53.35%
Epoch: 06 / 10 | Epoch Time: 0m 19s
	Train Loss: 0.503 | Train Acc: 64.59%
	 Val. Loss: 0.891 |  Val. Acc: 53.32%
Epoch: 07 / 10 | Epoch Time: 0m 19s
	Train Loss: 0.496 | Train Acc: 64.79%
	 Val. Loss: 0.975 |  Val. Acc: 52.89%
Epoch: 08 / 10 | Epoch Time: 0m 20s
	Train Loss: 0.499 | Train Acc: 64.67%
	 Val. Loss: 0.971 |  Val. Acc: 53.13%
Epoch: 09 / 10 | Epoch Time: 0m 19s
	Train Loss: 0.499 | Train Acc: 64.62%
	 Val. Loss: 

In [None]:
optimizer = optim.Adam(twoBiDir.parameters())
twoBiDir = twoBiDir.to(device)

N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(twoBiDir, optimizer = optimizer)
    valid_loss, valid_acc = predict(twoBiDir)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} / {N_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 / 10 | Epoch Time: 0m 28s
	Train Loss: 0.694 | Train Acc: 50.08%
	 Val. Loss: 0.689 |  Val. Acc: 50.64%
Epoch: 02 / 10 | Epoch Time: 0m 28s
	Train Loss: 0.689 | Train Acc: 51.00%
	 Val. Loss: 0.689 |  Val. Acc: 50.06%
Epoch: 03 / 10 | Epoch Time: 0m 28s
	Train Loss: 0.678 | Train Acc: 53.37%
	 Val. Loss: 0.692 |  Val. Acc: 50.00%
Epoch: 04 / 10 | Epoch Time: 0m 28s
	Train Loss: 0.671 | Train Acc: 53.79%
	 Val. Loss: 0.687 |  Val. Acc: 52.57%
Epoch: 05 / 10 | Epoch Time: 0m 28s
	Train Loss: 0.616 | Train Acc: 59.08%
	 Val. Loss: 0.691 |  Val. Acc: 54.71%
Epoch: 06 / 10 | Epoch Time: 0m 28s
	Train Loss: 0.573 | Train Acc: 61.56%
	 Val. Loss: 0.690 |  Val. Acc: 55.85%
Epoch: 07 / 10 | Epoch Time: 0m 28s
	Train Loss: 0.536 | Train Acc: 63.21%
	 Val. Loss: 0.721 |  Val. Acc: 55.13%
Epoch: 08 / 10 | Epoch Time: 0m 28s
	Train Loss: 0.519 | Train Acc: 63.98%
	 Val. Loss: 0.780 |  Val. Acc: 55.86%
Epoch: 09 / 10 | Epoch Time: 0m 32s
	Train Loss: 0.512 | Train Acc: 64.18%
	 Val. Loss: 

In [None]:
twoBiDirHidden = IMDBClassifier(VOCAB_WEIGHTS, RNN_TYPE, VOCAB_SIZE, N_CLASSES, EMBED_SIZE, RNN_UNITS, N_LAYERS[1], BI_DIR[1], DROP_RATE, DROP_RATE, PADDING_INDEX, use_output= False)
print(f'The total number of trainable parameters for two layered bi-directional LSTM are : {sum(p.numel() for p in twoBiDirHidden.parameters() if p.requires_grad):,}')

The total number of trainable parameters for two layered bi-directional LSTM are : 3,482,113


In [None]:
optimizer = optim.Adam(twoBiDirHidden.parameters())
twoBiDirHidden = twoBiDirHidden.to(device)

N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(twoBiDirHidden, optimizer = optimizer)
    valid_loss, valid_acc = predict(twoBiDirHidden)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} / {N_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 / 10 | Epoch Time: 0m 26s
	Train Loss: 0.662 | Train Acc: 56.23%
	 Val. Loss: 0.580 |  Val. Acc: 66.91%
Epoch: 02 / 10 | Epoch Time: 0m 26s
	Train Loss: 0.465 | Train Acc: 77.61%
	 Val. Loss: 0.421 |  Val. Acc: 82.44%
Epoch: 03 / 10 | Epoch Time: 0m 26s
	Train Loss: 0.324 | Train Acc: 86.11%
	 Val. Loss: 0.350 |  Val. Acc: 83.24%
Epoch: 04 / 10 | Epoch Time: 0m 25s
	Train Loss: 0.234 | Train Acc: 90.78%
	 Val. Loss: 0.363 |  Val. Acc: 84.64%
Epoch: 05 / 10 | Epoch Time: 0m 25s
	Train Loss: 0.181 | Train Acc: 92.91%
	 Val. Loss: 0.366 |  Val. Acc: 85.81%
Epoch: 06 / 10 | Epoch Time: 0m 25s
	Train Loss: 0.123 | Train Acc: 95.65%
	 Val. Loss: 0.432 |  Val. Acc: 85.97%
Epoch: 07 / 10 | Epoch Time: 0m 25s
	Train Loss: 0.079 | Train Acc: 97.44%
	 Val. Loss: 0.501 |  Val. Acc: 85.44%
Epoch: 08 / 10 | Epoch Time: 0m 26s
	Train Loss: 0.063 | Train Acc: 97.85%
	 Val. Loss: 0.513 |  Val. Acc: 84.33%
Epoch: 09 / 10 | Epoch Time: 0m 25s
	Train Loss: 0.048 | Train Acc: 98.41%
	 Val. Loss: 