<a href="https://colab.research.google.com/github/sailu1997/Semantic-Analysis-of-IMDB-Rating/blob/main/IMDB_rating.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
import functools
import sys

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

In [3]:
torch.manual_seed(42)

<torch._C.Generator at 0x788c07cd38d0>

In [4]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

In [5]:
train_data , test_data = datasets.load_dataset('imdb' , split=['train' , 'test'])

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

# New Section

**Tokenising**

Next step after downloading and splitting the required dataset is Tokenising

In [6]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [9]:
def tokenize_data(example , tokenizer , max_length):
  tokens = tokenizer(example['text'])[:max_length]
  length = len(tokens)
  return {'tokens': tokens , 'length': length}

In [10]:
max_length = 256

train_data = train_data.map(tokenize_data , fn_kwargs = {'tokenizer': tokenizer , 'max_length':max_length})
test_data = test_data.map(tokenize_data , fn_kwargs = {'tokenizer': tokenizer , 'max_length':max_length})


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Split the train and test data to avoid data leakage

Data Leakage happens when test data is used in training phase

In [11]:
test_size = 0.25

train_valid_data = train_data.train_test_split(test_size = test_size)
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

In [12]:
min_freq = 5
special_tokens = ['<unk>' , '<pad>']

vocab = torchtext.vocab.build_vocab_from_iterator(train_data['tokens'] ,
                                                  min_freq = min_freq ,
                                                  specials = special_tokens)

In [13]:
unk_index = vocab['<unk>']
pad_index = vocab['<pad>']

In [14]:
#Tokenized value for unknown is set to 1
vocab.set_default_index(unk_index)

In [15]:
def numericalize_data(example , vocab):
  ids = [vocab[token] for token in example['tokens']]
  return {'ids' : ids}

Prepare the dataset for the model

In [16]:
train_data = train_data.map(numericalize_data , fn_kwargs = {'vocab':vocab})
valid_data = valid_data.map(numericalize_data , fn_kwargs = {'vocab':vocab})
test_data = test_data.map(numericalize_data , fn_kwargs = {'vocab' : vocab})

Map:   0%|          | 0/18750 [00:00<?, ? examples/s]

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [17]:
train_data = train_data.with_format(type = 'torch' , columns = ['ids' , 'label' , 'length'])
valid_data = valid_data.with_format(type = 'torch' , columns = ['ids' , 'label' , 'length'])
test_data = test_data.with_format(type = 'torch' , columns = ['ids' , 'label' , 'length'])

In [18]:
train_data

Dataset({
    features: ['text', 'label', 'tokens', 'length', 'ids'],
    num_rows: 18750
})

In [19]:
train_data[1]

{'label': tensor(1),
 'length': tensor(132),
 'ids': tensor([ 3216,    50,   229,    62,    21,    14,   118,    39,  3973,   619,
          8449,   559,  5607,     6,   612,  7305,     8,  2851,     2,   181,
             7,    14,   118,     3,    38,    81,     4,     2,   118,    17,
          1828,     6,    67,   654,    20,   293,  3097,     6, 16877, 18900,
             3,   293,     9,    16,  1936,  2286,     2,   186,     3, 16877,
             6,     2,   305,   338,    30,    68,     5,  1179,  1554,   181,
             3,     2,   558,  1984,   106,   293,     9,    16,   367,    13,
          3394,  3351,   332,     8,    76,     3,    12,   435,    61,   331,
            38,  3361,   150,   164,     6,   101,  6172,     6,   375,    77,
          1523,     6,  7539,     3,  3216,   149,    35,  2585,     8,  4751,
            14,   118,     6,    67, 15831,    14,    19,   293,     9,    16,
           580,   616,     3,    14,   118,    50, 20544,     6,  1389,    91,


Model Building

In [55]:
class LSTM(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
               dropout_rate, pad_index):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                        dropout = dropout_rate , batch_first=True)
    self.fc = nn.Linear(hidden_dim*2 if bidirectional else hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, ids, length):
    embedded = self.dropout(self.embedding(ids))
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, length, batch_first=True,
                                                        enforce_sorted=False)
    packed_output, (hidden, cell) = self.lstm(packed_embedded)
    output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)
    if self.lstm.bidirectional:
      hidden = self.dropout(torch.cat([hidden[-1] , hidden[-2]], dim=-1))
    else:
      hidden = self.dropout(hidden[-1])

    prediction = self.fc(hidden)
    return prediction


In [56]:
vocab_size = len(vocab)
embedding_dim = 300 #can be any value (64,128,256)
hidden_dim = 300 #can be any value (64,128,256)
output_dim = len(train_data.unique('label')) #either 0 or 1
n_layers = 2
bidirectional = True
dropout = 0.5

model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout,
             pad_index)

In [57]:
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameteres')

The model has 10,077,302 trainable parameteres


In [58]:
def initialize_weights(m):
  if isinstance(m , nn.Linear):
    nn.init.xavier_normal_(m.weight)
    nn.init.zeros_(m.bias)
  elif isinstance(m, nn.LSTM):
    for name, param in m.named_parameters():
      if 'bias' in name:
        nn.init.zeros_(param)
      elif 'weight' in name:
        nn.init.orthogonal_(param)

In [59]:
model.apply(initialize_weights)

LSTM(
  (embedding): Embedding(21555, 300, padding_idx=1)
  (lstm): LSTM(300, 300, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=600, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [60]:
vectors = torchtext.vocab.FastText() #extra pre-trained embeddings

In [61]:
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

In [62]:
model.embedding.weight.data = pretrained_embedding

In [63]:
lr = 1e-4

optimizer = optim.Adam(model.parameters(), lr=lr) #algorithm for gradient descent [ADAM , SGD, RMSProp]
criterion = nn.CrossEntropyLoss() #our loss function [Binary Cross Entropy, Cross Entropy]

model = model.to(device) #switch our model training in GPU
criterion = criterion.to(device) #Switch our loss in GPU

In [64]:
def collate(batch, pad_index):
  batch_ids = [i['ids'] for i in batch]
  batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
  batch_length = [i['length'] for i in batch]
  batch_length = torch.stack(batch_length)
  batch_label = [i['label'] for i in batch]
  batch_label = torch.stack(batch_label)
  batch = {'ids' : batch_ids,
           'length' : batch_length,
           'label' : batch_label}
  return batch

In [65]:
batch_size = 256

collate = functools.partial(collate, pad_index=pad_index)

train_dataloader = torch.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               collate_fn=collate,
                                               shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=batch_size,
                                               collate_fn=collate,
                                               )
test_dataloader = torch.utils.data.DataLoader(test_data,
                                               batch_size=batch_size,
                                               collate_fn=collate,
                                               )

In [66]:
def train(dataloader, model, criterion, optimizer, device):

  model.train()
  epoch_losses = []
  epoch_accs = []

  for batch in tqdm.tqdm(dataloader, desc="training....",file=sys.stdout):
    ids = batch['ids'].to(device)

    #batch length - dataloader
    length = batch['length']
    label = batch['label'].to(device)

    #y-hat = prediction from model
    prediction = model(ids, length)
    #lossfunction -> actual value , predicted value
    #actual value - label
    #predicted value - prediction
    loss = criterion(prediction , label)

    accuracy = get_accuracy(prediction , label) #accuracy score

    #adam->gradient descent
    optimizer.zero_grad()

    loss.backward()
    optimizer.step() #we will update the weights with learning rate

    epoch_losses.append(loss.item())
    epoch_accs.append(accuracy.item())

  return epoch_losses, epoch_accs

In [67]:
def evaluate(dataloader, model, criterion, device):

  model.eval()
  epoch_losses = []
  epoch_accs = []

#weights updated only duirng training, not during inferencing.. by inferencing we mean evaluation
  with torch.no_grad(): #no optimization -> no update in weights
    for batch in tqdm.tqdm(dataloader, desc="evaluating....",file=sys.stdout):
      ids = batch['ids'].to(device)

      #batch length - dataloader
      length = batch['length']
      label = batch['label'].to(device)

      #y-hat = prediction from model
      prediction = model(ids, length)
      #lossfunction -> actual value , predicted value
      #actual value - label
      #predicted value - prediction
      loss = criterion(prediction , label)

      accuracy = get_accuracy(prediction , label) #accuracy score

      epoch_losses.append(loss.item())
      epoch_accs.append(accuracy.item())

  return epoch_losses, epoch_accs

In [68]:
def get_accuracy(prediction, label):
  batch_size, _=prediction.shape
  prediction_classes = prediction.argmax(dim=-1)
  correct_predictions = prediction_classes.eq(label).sum()
  accuracy = correct_predictions/batch_size
  return accuracy

In [72]:
n_epochs = 3
best_valid_loss = float('inf')

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(n_epochs):

  train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
  valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)

  train_losses.extend(train_loss)
  train_accs.extend(train_acc)
  valid_losses.extend(valid_loss)
  valid_accs.extend(valid_acc)

  epoch_train_loss = np.mean(train_loss)
  epoch_train_acc = np.mean(train_acc)
  epoch_valid_loss = np.mean(valid_loss)
  epoch_valid_acc = np.mean(valid_acc)

  if epoch_valid_loss < best_valid_loss:
    best_valid_loss = epoch_valid_loss
    torch.save(model.state_dict(), 'lstm.pt')

  print(f'epoch: {epoch+1}')
  print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
  print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')

training....: 100%|██████████| 74/74 [00:38<00:00,  1.93it/s]
evaluating....: 100%|██████████| 25/25 [00:03<00:00,  6.47it/s]
epoch: 1
train_loss: 0.397, train_acc: 0.825
valid_loss: 0.374, valid_acc: 0.836
training....: 100%|██████████| 74/74 [00:36<00:00,  2.02it/s]
evaluating....: 100%|██████████| 25/25 [00:03<00:00,  6.90it/s]
epoch: 2
train_loss: 0.366, train_acc: 0.839
valid_loss: 0.345, valid_acc: 0.849
training....: 100%|██████████| 74/74 [00:37<00:00,  1.99it/s]
evaluating....: 100%|██████████| 25/25 [00:03<00:00,  6.74it/s]
epoch: 3
train_loss: 0.351, train_acc: 0.849
valid_loss: 0.335, valid_acc: 0.855


Evaluation of Model

In [73]:
model.load_state_dict(torch.load('lstm.pt')) #save the models

test_loss, test_acc = evaluate(test_dataloader, model, criterion, device)

epoch_test_loss = np.mean(test_loss)
epoch_test_acc = np.mean(test_acc)

print(f'test_loss: {epoch_test_loss:.3f}, test_acc:')

evaluating....: 100%|██████████| 98/98 [00:14<00:00,  6.62it/s]
test_loss: 0.352, test_acc:


In [74]:
def predict_sentiment(text, model, tokenizer, vocab, device):
  tokens = tokenizer(text)
  ids = [vocab[t] for t in tokens]
  length = torch.LongTensor([len(ids)])
  tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
  prediction = model(tensor, length).squeeze(dim=0)
  probability = torch.softmax(prediction, dim=-1)
  predicted_class = prediction.argmax(dim=-1).item()
  predicted_probability = probability[predicted_class].item()
  return predicted_class, predicted_probability

In [75]:
text = "Amazing movie, loved it"
predict_sentiment(text, model, tokenizer, vocab, device)

(1, 0.7935748100280762)

In [76]:
text = "Worst Movie Ever"
predict_sentiment(text, model, tokenizer, vocab, device)

(0, 0.6552034616470337)