In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')

import plotly
from plotly.offline import init_notebook_mode, iplot
plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objects as go

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import json
import os

api_token = {"username": "<-- Your Username>", 
             "key": "<-- Your Key -->"}

with open('/content/kaggle.json', 'w') as file:
    json.dump(api_token, file)

os.environ["KAGGLE_CONFIG_DIR"] = "/content/"

!chmod 600 /content/kaggle.json
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip /content/imdb-dataset-of-50k-movie-reviews.zip

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 19% 5.00M/25.7M [00:00<00:02, 8.27MB/s]
100% 25.7M/25.7M [00:00<00:00, 38.5MB/s]
Archive:  /content/imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [4]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.5)
train_df.shape, test_df.shape

((25000, 2), (25000, 2))

In [6]:
MAX_LEN = 30
EMBEDDING_DIM = 16
BATCH_SIZE = 64
UNITS = 10

In [7]:
def tokenize_text(text, maxlen):
    text = text.lower()
    text = text.replace('<br />', ' ')
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub('\s+', ' ', text)
    tokenized_text = word_tokenize(text)
    tokenized_text = tokenized_text[:maxlen]
    tokenized_text += ['<PAD>'] * (maxlen - len(tokenized_text))
    return tokenized_text


def build_vocab(texts, maxlen):
    vocab = {'<PAD>': 0}
    for text in tqdm(texts, desc='Building vocab'):
        tokenized_text = tokenize_text(text, maxlen)
        for token in tokenized_text:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

In [8]:
class DataIter(torch.utils.data.Dataset):
    
    def __init__(self, dataset, maxlen, vocab):
        self.dataset = dataset
        self.maxlen = maxlen
        self.vocab = vocab
        self.unique_classes = sorted(list(dataset.sentiment.unique()))

    
    def __len__(self):
        return len(self.dataset)
    
    
    def __getitem__(self, index):
        text = self.dataset.iloc[index].review
        label = self.dataset.iloc[index].sentiment
        
        # preprocess and tokenize text
        tokenized_text = tokenize_text(text, self.maxlen)
        
        vectorized_text = []
        for token in tokenized_text:
            vectorized_text.append(
                self.vocab[token]
            )
        
        # encode label
        label_encoded = self.unique_classes.index(label)
        return (vectorized_text, label_encoded)


def detokenize(tensor, vocab):
    texts = []
    keys, values = list(vocab.keys()), list(vocab.values())
    for row in tensor:
        text = ""
        for token in row:
            text += keys[values.index(int(token))] + ' '
        texts.append(text)
    return texts

In [9]:
vocab = build_vocab(list(df.review), MAX_LEN)
vocab_size = len(vocab)
print('Vocab size:', vocab_size)

Building vocab:   0%|          | 0/50000 [00:00<?, ?it/s]

Vocab size: 52022


In [10]:
train_dataset = DataIter(train_df, MAX_LEN, vocab)
test_dataset  = DataIter(test_df, MAX_LEN, vocab)

train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=BATCH_SIZE)

test_loader = torch.utils.data.DataLoader(test_dataset, 
                                          batch_size=BATCH_SIZE)

In [11]:
train_dataset.unique_classes == test_dataset.unique_classes

True

In [12]:
class AdditiveAttention(nn.Module):
    
    def __init__(self, key_size, query_size, num_hiddens, dropout=0):
        super().__init__()
        self.W_k = nn.Linear(key_size, num_hiddens, bias=False)
        self.W_q = nn.Linear(query_size, num_hiddens, bias=False)
        self.w_v = nn.Linear(num_hiddens, 1, bias=False)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, queries, keys, values):
        queries, keys = self.W_q(queries), self.W_k(keys)
        features = queries.unsqueeze(2) + keys.unsqueeze(1)
        features = torch.tanh(features)
        scores = self.w_v(features).squeeze(-1)
        attention_weights = nn.Softmax(dim=-1)(scores)
        context = torch.bmm(self.dropout(attention_weights), values)
        return context, attention_weights

In [13]:
tmp = torch.stack(next(iter(train_loader))[0], axis=1)
print('Input shape:', tmp.shape)

tmp_emb = nn.Embedding(len(vocab), EMBEDDING_DIM)(tmp)
print('Embedding Shape:', tmp_emb.shape)

att_fn = AdditiveAttention(EMBEDDING_DIM, EMBEDDING_DIM, UNITS)
context, att_weights = att_fn(tmp_emb, tmp_emb, tmp_emb)
print('Context Vector Shape:', context.shape)
print('Attention Weights Shape:', att_weights.shape)

Input shape: torch.Size([64, 30])
Embedding Shape: torch.Size([64, 30, 16])
Context Vector Shape: torch.Size([64, 30, 16])
Attention Weights Shape: torch.Size([64, 30, 30])


In [51]:
class Model(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, units):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.attention = AdditiveAttention(
            embedding_dim, embedding_dim, units, 0.5)
        self.rnn = nn.GRU(2*embedding_dim, units, batch_first=True)
        self.fc = nn.Linear(units, 2)
        self.activation = nn.Softmax(dim=-1)
    
    def forward(self, x):
        # embedding
        x = self.embedding(x)
        
        # calculating context and attention weights
        context, att_weights = self.attention(x, x, x)
        
        # giving context to GRU
        context = torch.concat([x, context], axis=-1)
        hidden_states, _ = self.rnn(context)
        hidden_state = hidden_states[:, 0, :]
        
        # final layer output
        x = self.activation(self.fc(hidden_state))
        
        return x, att_weights

In [52]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

net = Model(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    units=UNITS,
).to(device)


def loss_fn(y_true, y_pred):
    loss = torch.nn.CrossEntropyLoss(
        reduction='none')(y_pred, y_true)
    # loss = loss * mask
    return loss.mean()

trainer = torch.optim.Adam(net.parameters())
num_epochs = 10

In [53]:
for epoch in range(num_epochs):
    with tqdm(total=len(train_loader)) as pbar:
        losses = []
        accuracy = []

        for x, y in train_loader:
            # tensors to cuda
            x = torch.stack(x, axis=1)
            x = x.to(device)
            y = y.to(device)

            # setting net to train mode
            net.train()

            # forward and backward pass
            pred, _ = net(x)
            l = loss_fn(y, pred)
            trainer.zero_grad()
            l.backward()
            trainer.step()
            
            # keeping log
            losses.append(l)
            accuracy.append(
                float(sum(torch.argmax(pred, axis=1) == y)/len(y))
            )
            
            pbar.update(1)

        print(f'Epoch: {epoch + 1} | ' + 
              f'Train Loss: {sum(losses)/len(losses):.4f} | ' +
              f'Train Acc: {sum(accuracy)/len(accuracy):.4f}')

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 1 | Train Loss: 0.6928 | Train Acc: 0.5148


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 2 | Train Loss: 0.6810 | Train Acc: 0.5654


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 3 | Train Loss: 0.6549 | Train Acc: 0.6091


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 4 | Train Loss: 0.6336 | Train Acc: 0.6458


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 5 | Train Loss: 0.6150 | Train Acc: 0.6696


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 6 | Train Loss: 0.5999 | Train Acc: 0.6893


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 7 | Train Loss: 0.5909 | Train Acc: 0.6992


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 8 | Train Loss: 0.5802 | Train Acc: 0.7130


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 9 | Train Loss: 0.5688 | Train Acc: 0.7258


  0%|          | 0/391 [00:00<?, ?it/s]

Epoch: 10 | Train Loss: 0.5589 | Train Acc: 0.7386


In [54]:
def check_performance(data_iter, net, loss_fn):
    losses = []
    accuracy = []
    
    net.eval()
    
    with tqdm(total=len(data_iter)) as pbar:
        for x, y in data_iter:
            x = torch.stack(x, axis=1)
            x = x.to(device)
            y = y.to(device)
            with torch.no_grad():
                pred, _ = net(x)
            l = loss_fn(y, pred)

            losses.append(l)
            accuracy.append(
                float(sum(torch.argmax(pred, axis=1) == y)/len(y))
            )
            pbar.update(1)
    
    return sum(accuracy)/len(accuracy), sum(losses)/len(losses)

In [55]:
test_acc, test_loss = check_performance(test_loader, net, loss_fn)
print('Test Accuracy:', test_acc, 'Test Loss:', test_loss)

  0%|          | 0/391 [00:00<?, ?it/s]

Test Accuracy: 0.7109654731762683 Test Loss: tensor(0.5797, device='cuda:0')


In [56]:
def vectorize(tokens):
    vectorized = []
    for t in tokens:
        vectorized.append(vocab[t])
    return torch.tensor(vectorized).unsqueeze(0)

In [57]:
def predict(tokens, net):
    net.eval()
    if type(tokens) == str:
        tokens = vectorize(tokenize_text(tokens, MAX_LEN))

    tokens = tokens.to(device)
    with torch.no_grad():
        pred, att_weights = net(tokens)
    pred = torch.argmax(pred.squeeze(0))
    return pred.cpu().numpy(), att_weights.squeeze(0).cpu()

In [37]:
test_batch = next(iter(train_loader))
tokens = torch.stack(test_batch[0], axis=1)
labels = test_batch[1]

In [68]:
idx = 7
inp_tokens = tokens[idx].unsqueeze(0)
pred, att_weights = predict(inp_tokens, net)

detokenized_sent = detokenize(inp_tokens, vocab)[0]
print(f'Input Text: {detokenized_sent}\n')
print('Predicted Class:', train_dataset.unique_classes[pred])
print('Actual Class:', train_dataset.unique_classes[labels[idx].numpy()])

Input Text: what a shame this movie was never released it is now playing on cable i tuned in based on my high regard for the stars and was rewarded by seeing 

Predicted Class: positive
Actual Class: positive


In [69]:
import IPython
display(IPython.core.display.HTML('''
    <script src="/static/components/requirejs/require.js"></script>
    <script>
        requirejs.config({
        paths: {
            base: '/static/base',
            plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
        },
        });
    </script>
    '''))

trace = go.Heatmap(z=att_weights, 
                   x=detokenized_sent.strip().split(' '), 
                   y=detokenized_sent.strip().split(' '), 
                   colorscale='Reds')
iplot([trace])

On GitHub the above plotly plot is not visible, that's why I am attaching the image below.

<center><img src="https://i.ibb.co/Dp5Y05D/newplot.png" max-width=100% height=400px></center>

As you can see, it is automatically paying attention to the "best" token in the review. That's why, it has predicted this as "positive" review.