# IDMb Sentiment Analysis

Modeling the sentiment of movie reviews is an exercise in NLP based analyses, for which RNNs are a very suitable tool. This notebook will consider the IMDb dataset from torchvision and perform an RNN based sentiment analysis on it.

In [1]:
import torch
import torch.nn as nn

from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader

# torchtext is deprecated and will not be compatible with pytorch with cuda
# from torchtext.data import Field, LabelField
from torchtext.datasets import IMDB
from torchtext.vocab import vocab

import re

from collections import Counter, OrderedDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# There are 25,000 examples in the training dataset
train_data = IMDB(split='train')
test_data = IMDB(split='test')

torch.manual_seed(42)

train_data, valid_data = random_split(list(train_data), [20000, 5000])

In [3]:
# Finding unique tokens/words
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

token_counts = Counter()

for label, line in train_data:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print(f'Vocabulary size: {len(token_counts)}')

Vocabulary size: 69209


In [4]:
# Text-encoding: tokens to integers
sorted_by_freq = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

ordered_dict = OrderedDict(sorted_by_freq)

vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 460]


In [5]:
# transformation functions

# tokenize the text inputs
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
# convert 'pos' and 'neg' labels to 1, 0
label_pipeline = lambda x: 1. if x == 'pos' else 0.


# Create a wrapping for the transformation and encoding
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []

    for _label, _text in batch:
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        
        text_list.append(processed_text)
        label_list.append(label_pipeline(_label))
        lengths.append(processed_text.size(0))
    
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    
    # Pad to identical length for all inputs
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)

    return padded_text_list, label_list, lengths

dataloader = DataLoader(train_data, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [6]:
text_batch, label_batch, length_batch = next(iter(dataloader))

print(text_batch)
print(label_batch)
print(text_batch.shape)

tensor([[   10,   218,    11,  ...,     0,     0,     0],
        [ 7067,    31,    10,  ...,   117,  3042, 43116],
        [  260,    64,   110,  ...,     0,     0,     0],
        [   10,   140,   110,  ...,     0,     0,     0]])
tensor([0., 0., 0., 0.])
torch.Size([4, 261])


In [15]:
batch_size = 32

train_dl = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(list(test_data), batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [8]:
# We have generated sequences of tokens and padded them to be identical length
# Now, these need to be encoded - one-hot encoding would create vectors the size of the vocabulary - 10^4 or 10^5 words

# A more compact embedding can rely on real valued vectors - the embedding dimension can be far smaller than the vocab
# This also allows for more slient feature extraction since these embedding layers can be optimized
# Index 0 is reserved for padding, and index 1 is reserved for words not present in the token set

# Given a set of tokens of size n + 2 (the + 2 is for padding and unseen words), generate an embedding
# matrix of size (n + 2) x dim(embedding) - this will serve as the input to the NN

embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)
text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [4, 1, 0, 3]])
print(embedding(text_encoded_input))

tensor([[[-1.0324,  1.4106,  1.0425],
         [ 0.8986, -0.5944, -0.9850],
         [-1.6266, -1.0227,  1.0709],
         [ 1.8281, -0.8004, -0.9363]],

        [[-1.6266, -1.0227,  1.0709],
         [-1.0324,  1.4106,  1.0425],
         [ 0.0000,  0.0000,  0.0000],
         [-0.0713, -0.3351,  1.5477]]], grad_fn=<EmbeddingBackward0>)


In [9]:
class ExampleRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Actual RNN layers
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True)
        # Alternatives to the RNN layers
        ## GRU: Gated Recurrent Unit - an alternative to LSTMs
        # self.rnn = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True)
        ## LSTM: Uses LSTM cells
        # self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True)

        # Then add a fully connected non-recurrent layer at the end
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)
    
    def forward(self, x):
        _, hidden = self.rnn(x)
    
        out = hidden[-1, :, :] # Take the final hidden state from the last hidden layer as the input
        out = self.fc(out)

        return out
    
model = ExampleRNN(64, 32)
print(model)
print(model(torch.randn(5, 3, 64)))

ExampleRNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)
tensor([[-0.0734],
        [-0.1649],
        [-0.2072],
        [-0.1533],
        [-0.5973]], grad_fn=<AddmmBackward0>)


In [10]:
# Implement the actual RNN model

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) -> None:
        super().__init__()

        # Start with an embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embed_dim, padding_idx=0)
        # Implement the RNN layer - uses an LSTM here
        self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=rnn_hidden_size)

        # On to regular fully connected layers
        self.fc1 = nn.Linear(in_features=rnn_hidden_size, out_features=fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=fc_hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, text, lengths):
        out = self.embedding(text)
        # pad sequences to be of the same length, leaving them unsorted
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)

        out, (hidden, cell) = self.rnn(out)
        # Take the last time step output
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)

        return out

vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(42)

model = RNN(vocab_size=vocab_size, embed_dim=embed_dim, rnn_hidden_size=rnn_hidden_size, fc_hidden_size=fc_hidden_size)
print(model)

RNN(
  (embedding): Embedding(69211, 20, padding_idx=0)
  (rnn): LSTM(20, 64)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [11]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0

    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()

        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)
    
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0

    for text_batch, label_batch, lengths in dataloader:
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        

        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)
    
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [12]:
num_epochs = 2
torch.manual_seed(42)
loss_hist = {'train': [], 'valid': []}
acc_hist = {'train': [], 'valid': []}

for epoch in range(num_epochs):
    acc, loss = train(train_dl)
    loss_hist['train'].append(loss)
    acc_hist['train'].append(acc)

    acc, loss = evaluate(valid_dl)
    loss_hist['valid'].append(loss)
    acc_hist['valid'].append(acc)

    print(f"Epoch {epoch}\nTrain Acc: {acc_hist['train'][-1]:.4f}, Valid Acc: {acc_hist['valid'][-1]:.4f} ")

Epoch 0
Train Acc: 0.9895, Valid Acc: 1.0000 
Epoch 1
Train Acc: 1.0000, Valid Acc: 1.0000 


In [16]:
acc_test, _ = evaluate(test_dl)
print(f'Test accuracy: {acc_test:.4f}')

Test accuracy: 1.0000
