<a href="https://colab.research.google.com/github/saurabh-parkar/Sentiment_Analysis/blob/master/IMDB_Sentiment_Analysis_using_Torchtext_and_BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [117]:
import time
import torch
import random
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext import datasets
from tqdm import tqdm
import matplotlib.pyplot as plt

SEED = 1234

# Seed every RNG this notebook touches up front: Python's `random` module
# (used further down when splitting the training data) and PyTorch.
random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

In [93]:
# Review text field: lower-cased tokens, batch-first tensors, and every batch
# is returned together with the sequence lengths (include_lengths=True).
TEXT = data.Field(
    lower=True,
    batch_first=True,
    include_lengths=True,
)
# Label field, stored as float tensors.
LABEL = data.LabelField(dtype=torch.float)

In [94]:
# Load the IMDB train and test splits, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)


In [95]:
# Carve a validation set out of the training data (default split ratio).
# `random.seed()` returns None, so it must not be passed as `random_state`
# directly; seed first, then hand the resulting RNG state over explicitly.
# NOTE: this re-binds `train_data`, so re-running the cell splits the already
# reduced training set again.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())


### **build_vocab**

`build_vocab` creates the `Vocab` object for a `Field`; it holds the information needed to convert each word into its word index and vice versa.

`build_vocab` also downloads pretrained word embeddings and associates them with the words in the vocabulary. The embeddings are stored in `Field.vocab.vectors`, which contains one vector per vocabulary entry. They are used later, once the `nn.Embedding` layer has been defined, by copying them into the layer's weights, e.g. `embed.weight.data.copy_(pretrained_weight)` (wrap the weights in `torch.from_numpy(...)` first if they are a NumPy array).

By default, torchtext initializes the vectors of vocabulary words that are missing from the pretrained embeddings to zero; here we initialize them from a random Gaussian distribution instead, using the `unk_init` argument.

In [96]:
MAX_VOCAB_SIZE = 25_000

# Text vocabulary: built from the training split only, capped at
# MAX_VOCAB_SIZE entries, with 100-d GloVe vectors attached. Words without a
# pretrained vector are initialised via torch.Tensor.normal_.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

# Label vocabulary, also built from the training split.
LABEL.build_vocab(train_data)

### **BucketIterator**

**data.BucketIterator.splits** returns iterators that load batches of data from the datasets. Each batch has a `text` attribute and a `label` attribute, and the texts in the same batch have similar lengths.

In [97]:

BATCH_SIZE = 64

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One iterator per split; examples are sorted inside each batch
# (sort_within_batch=True) and tensors are created on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device,
)

### **Model**

If your input data is of shape (seq_len, batch_size, features) then you don't need batch_first=True and your LSTM will give output of shape (seq_len, batch_size, hidden_size).

If your input data is of shape (batch_size, seq_len, features) then you need batch_first=True and your LSTM will give output of shape (batch_size, seq_len, hidden_size).

The -1 in `.view` means that the actual value for this dimension is inferred from the other dimensions.

In [98]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable

class BiLSTM(nn.Module):
  """LSTM sequence classifier that returns raw (un-activated) scores.

  Pipeline: embedding -> (bi)directional multi-layer LSTM over the packed,
  un-padded sequences -> dropout on the final hidden state(s) of the last
  layer -> linear layer `fc1`.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each embedding vector.
    hidden_dim1: LSTM hidden size (per direction).
    hidden_dim2: input size of the auxiliary `fc2` layer (not used in `forward`).
    output_dim: number of output scores per sequence.
    n_layers: number of stacked LSTM layers.
    bidirectional: whether the LSTM also reads the sequence backwards.
    dropout: dropout probability (between LSTM layers and before `fc1`).
    pad_index: vocabulary index of the padding token.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim1, hidden_dim2, output_dim, n_layers,
                bidirectional, dropout, pad_index):
    super().__init__()

    # embedding layer; the row at `pad_index` receives no gradient updates
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_index)

    # lstm layer; inter-layer dropout only exists when there is more than one
    # layer, so it is disabled for n_layers == 1 (PyTorch warns otherwise)
    self.lstm = nn.LSTM(embedding_dim,
                        hidden_dim1,
                        num_layers=n_layers,
                        bidirectional=bidirectional,
                        batch_first=True,
                        dropout=dropout if n_layers > 1 else 0.0)

    # the classifier sees one final hidden state per direction
    num_directions = 2 if bidirectional else 1
    self.fc1 = nn.Linear(hidden_dim1 * num_directions, output_dim)
    # fc2, relu and act are not used in forward(); they are kept so the module
    # layout, parameter count and saved state_dict keys stay unchanged.
    self.fc2 = nn.Linear(hidden_dim2, output_dim)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)
    self.act = nn.Softmax()

  def forward(self, text, text_lengths):
    """Score a padded batch of token-index sequences.

    Args:
      text: token indices, shape [batch size, sent_len].
      text_lengths: true length of every sequence, in decreasing order
        (pack_padded_sequence is called with its default enforce_sorted).

    Returns:
      Raw scores of shape [batch size, output_dim]; no activation is applied.
    """
    # text = [batch size, sent_len]
    embedded = self.embedding(text)
    # embedded = [batch size, sent_len, emb dim]

    # pack_padded_sequence needs the lengths on the CPU, even when the batch
    # itself lives on the GPU
    if torch.is_tensor(text_lengths):
      text_lengths = text_lengths.cpu()

    # pack so the LSTM skips the padded positions
    packed_embedded = pack_padded_sequence(embedded, text_lengths, batch_first=True)

    _, (hidden, _) = self.lstm(packed_embedded)
    # hidden = [num_layers * num_directions, batch size, hidden_size]
    # (batch_first does not affect the hidden and cell states)

    if self.lstm.bidirectional:
      # last layer: concat the final forward and backward hidden states
      final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
    else:
      # last layer only has a forward state
      final_hidden = hidden[-1, :, :]

    res = self.dropout(final_hidden)
    preds = self.fc1(res)
    return preds

In [99]:
# Hyper-parameters of the classifier.
INPUT_DIM = len(TEXT.vocab)      # one embedding row per vocabulary entry
EMBEDDING_DIM = 100              # matches the 100-d pretrained vectors
HIDDEN_DIM1 = 256
HIDDEN_DIM2 = 100
OUTPUT_DIM = 1                   # a single score per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = BiLSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim1=HIDDEN_DIM1,
    hidden_dim2=HIDDEN_DIM2,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_index=PAD_IDX,
)
model

BiLSTM(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=1, bias=True)
  (fc2): Linear(in_features=100, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (act): Softmax(dim=None)
)

In [100]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Parameters whose `requires_grad` flag is off are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report how many parameters will be updated during training.
print(f'The model has {count_parameters(model)} trainable parameters')

The model has 4810958 trainable paramters


In [101]:
# Pretrained vectors attached to the text vocabulary (one row per entry).
pretrained_embeddings = TEXT.vocab.vectors

# Sanity check: should be [vocabulary size, embedding dim].
print(pretrained_embeddings.size())

torch.Size([25002, 100])


In [102]:
# Overwrite the embedding layer's weights in place with the pretrained vectors.
# As the cell's last expression, the updated weight tensor is also displayed.
model.embedding.weight.data.copy_(pretrained_embeddings)


tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3114, -0.7845,  0.5564,  ...,  0.5830, -0.9701, -0.8863],
        [-0.3194,  0.0241, -0.0103,  ...,  0.3187,  0.3611, -0.0141],
        [-0.0060,  0.0117, -0.2508,  ...,  0.3381,  0.7595, -0.3132]])

In [103]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the embedding rows of the two special tokens (unknown, padding) to
# all-zero vectors.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3114, -0.7845,  0.5564,  ...,  0.5830, -0.9701, -0.8863],
        [-0.3194,  0.0241, -0.0103,  ...,  0.3187,  0.3611, -0.0141],
        [-0.0060,  0.0117, -0.2508,  ...,  0.3381,  0.7595, -0.3132]])


In [104]:
# Adam over all model parameters, with the optimizer's default hyper-parameters.
optimizer = optim.Adam(model.parameters())


In [105]:
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss),
# placed on the same device as the model.
criterion = nn.BCEWithLogitsLoss().to(device)

model = model.to(device)

In [106]:

def accuracy(preds, y):
    """Fraction of examples whose rounded sigmoid(logit) equals the target.

    `preds` holds raw logits and `y` the 0/1 targets; the result is a 0-d
    tensor (number of matches divided by the length of the first dimension).
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [107]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator` and return mean batch metrics.

    Args:
        model: module called as `model(text, text_lengths)`; its output is
            squeezed on dim 1 to give one logit per example.
        iterator: sized iterable of batches exposing `batch.text` (a
            `(text, text_lengths)` pair) and `batch.label`.
        optimizer: optimizer holding the model's parameters.
        criterion: loss called as `criterion(predictions, labels)`.

    Returns:
        `(mean_loss, mean_accuracy)`: per-batch values summed and divided by
        the number of batches (batches are not weighted by their size).
    """
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for batch in iterator:
        optimizer.zero_grad()
        # retrieve text and no. of words
        text, text_lengths = batch.text

        predictions = model(text, text_lengths).squeeze(1)
        # Flatten the labels to one value per example. A bare `.squeeze()`
        # would collapse a batch of size 1 to a 0-d tensor and make the loss
        # fail on a shape mismatch.
        labels = batch.label.reshape(-1)

        loss = criterion(predictions, labels)
        acc = accuracy(predictions, labels)

        # perform backpropagation
        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [108]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating it.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: module called as `model(text, text_lengths)`; its output is
            squeezed on dim 1 to give one logit per example.
        iterator: sized iterable of batches exposing `batch.text` (a
            `(text, text_lengths)` pair) and `batch.label`.
        criterion: loss called as `criterion(predictions, labels)`.

    Returns:
        `(mean_loss, mean_accuracy)`: per-batch values summed and divided by
        the number of batches (batches are not weighted by their size).
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text

            predictions = model(text, text_lengths).squeeze(1)
            # Flatten the labels to one value per example. A bare `.squeeze()`
            # would collapse a batch of size 1 to a 0-d tensor and make the
            # loss fail on a shape mismatch.
            labels = batch.label.reshape(-1)

            loss = criterion(predictions, labels)
            acc = accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [109]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds, both truncated to integers.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

### **Training**

In [110]:
N_EPOCHS = 6

def run_train(epochs, model, train_iterator, valid_iterator, optimizer, criterion, model_type):
    """Train for `epochs` epochs, checkpointing the best model on validation loss.

    After every epoch the model is evaluated on `valid_iterator`; whenever the
    validation loss improves on the best value seen so far, the state dict is
    written to 'saved_weights_<model_type>.pt'. Timing and train/validation
    metrics are printed per epoch. Nothing is returned.
    """
    lowest_valid_loss = float('inf')

    for epoch_idx in range(epochs):
        tic = time.time()

        # one optimisation pass, then a validation pass
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        mins, secs = epoch_time(tic, time.time())

        # keep only the weights with the lowest validation loss so far
        if valid_loss < lowest_valid_loss:
            lowest_valid_loss = valid_loss
            checkpoint_path = '_'.join(('saved_weights', model_type)) + '.pt'
            torch.save(model.state_dict(), checkpoint_path)

        print(f'Epoch: {epoch_idx + 1:02} | Epoch Time: {mins}m {secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

In [121]:
# Train the BiLSTM; the best weights are checkpointed under the "BiLSTM" tag.
run_train(
    epochs=N_EPOCHS,
    model=model,
    train_iterator=train_iterator,
    valid_iterator=valid_iterator,
    optimizer=optimizer,
    criterion=criterion,
    model_type="BiLSTM",
)

Epoch: 01 | Epoch Time: 0m 34s
	Train Loss: 0.173 | Train Acc: 93.38%
	 Val. Loss: 0.333 |  Val. Acc: 86.87%
Epoch: 02 | Epoch Time: 0m 34s
	Train Loss: 0.120 | Train Acc: 95.75%
	 Val. Loss: 0.378 |  Val. Acc: 86.78%
Epoch: 03 | Epoch Time: 0m 34s
	Train Loss: 0.078 | Train Acc: 97.37%
	 Val. Loss: 0.434 |  Val. Acc: 86.67%
Epoch: 04 | Epoch Time: 0m 34s
	Train Loss: 0.048 | Train Acc: 98.57%
	 Val. Loss: 0.466 |  Val. Acc: 86.45%
Epoch: 05 | Epoch Time: 0m 34s
	Train Loss: 0.029 | Train Acc: 99.25%
	 Val. Loss: 0.551 |  Val. Acc: 87.22%


### **Testing**

In [123]:
# Testing

# Load the best checkpoint from the same relative path run_train() writes to
# ('saved_weights_<model_type>.pt'), instead of a hard-coded absolute
# directory. map_location lets weights saved on a GPU load on the current
# device as well.
best_weights_path = 'saved_weights_BiLSTM.pt'
model.load_state_dict(torch.load(best_weights_path, map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.356 | Test Acc: 85.87%


### **Plots**

In [114]:
def plot_loss_and_accuracy(history):
    """Plot train/validation loss and accuracy curves side by side.

    Args:
        history: either a mapping with the keys 'loss', 'val_loss', 'acc' and
            'val_acc' (one sequence of per-epoch values each), or an object
            exposing such a mapping as its `.history` attribute.
    """
    # Accept both a plain mapping and an object wrapping it in `.history`.
    metrics = getattr(history, 'history', history)

    fig, axs = plt.subplots(1, 2, sharex=True)

    panels = (
        (axs[0], 'loss', 'val_loss', 'Model Loss', 'Loss'),
        (axs[1], 'acc', 'val_acc', 'Model Accuracy', 'Accuracy'),
    )
    for ax, train_key, valid_key, title, ylabel in panels:
        ax.plot(metrics[train_key])
        ax.plot(metrics[valid_key])
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend(['Train', 'Validation'], loc='upper left')

    fig.tight_layout()
    plt.show()