In [1]:
import numpy as np
import pandas as pd
import nltk
import re

# Read dataset

In [2]:
df = pd.read_csv("~/Data/IMDB/IMDB_Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Load BERT

In [4]:
import torch

In [5]:
from transformers import BertTokenizer, BertModel

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Text Preprocessing

In [7]:
def clean_html(text):
    cleaner_regex = re.compile('<.*?>')
    clean_text = re.sub(cleaner_regex, '', text)
    return clean_text
    

In [8]:
def sentiment_mapper(sent):
    if sent == "positive":
        return 1
    else:
        return 0

In [9]:
def bert_formatting(text):
    sent_text = nltk.sent_tokenize(text)
    sent = "[CLS] "
    for j in sent_text:
        sent = sent + j + " [SEP]"
        
    return sent

In [10]:
# the first element of output is the hidden state of the last layer of the bert model
def bert_encoder(text):
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    if len(indexed_tokens) > 512:
        indexed_tokens = indexed_tokens[:512]
        
    segment_ids = [1] * len(indexed_tokens)
    
    assert len(indexed_tokens) == len(segment_ids)
    
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segment_ids])
    
    assert tokens_tensor.shape == segments_tensors.shape
    
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        
    return outputs[0].numpy()

In [11]:
df['Clean_Text'] = df['review'].apply(clean_html)

In [12]:
df.head()

Unnamed: 0,review,sentiment,Clean_Text
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is..."


In [13]:
df['y'] = df['sentiment'].apply(sentiment_mapper)

In [14]:
df.head()

Unnamed: 0,review,sentiment,Clean_Text,y
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [15]:
df['Bert_Ready_Text'] = df['Clean_Text'].apply(bert_formatting)

In [16]:
df['Bert_Ready_Text'].head()

0    [CLS] One of the other reviewers has mentioned...
1    [CLS] A wonderful little production. [SEP]The ...
2    [CLS] I thought this was a wonderful way to sp...
3    [CLS] Basically there's a family where a littl...
4    [CLS] Petter Mattei's "Love in the Time of Mon...
Name: Bert_Ready_Text, dtype: object

In [30]:
%%time
data = {}
for j in range(df.shape[0]):
    
    x = bert_encoder(df['Bert_Ready_Text'][j])
    print(x.shape)
    break
    if (j+1)%200 == 0:
        print(f'{j+1}/{df.shape[0]}')
        
    data[j] = x
        

(1, 402, 768)
CPU times: user 2.31 s, sys: 56.2 ms, total: 2.37 s
Wall time: 610 ms


In [18]:
import pickle
with open('~/Data/IMDB/imdb_encoded_data.pickle', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

FileNotFoundError: [Errno 2] No such file or directory: '~/Data/IMDB/imdb_encoded_data.pickle'

In [19]:
# implement lstm network in pytorch

In [20]:
import torch
import torch.nn as nn

In [31]:
input_dim = 768
hidden_dim = 256
n_layers = 1

In [32]:
lstm_layer = nn.LSTM(input_dim, 
                    hidden_dim,
                    n_layers,batch_first=True)

In [33]:
batch_size = 1
seq_len = 512

In [34]:
inps = torch.randn(batch_size, seq_len, input_dim)

In [35]:
hidden_state = torch.randn(n_layers, batch_size, hidden_dim)

In [36]:
cell_state = torch.randn(n_layers, batch_size, hidden_dim)

In [41]:
hidden = (hidden_state, cell_state)

In [42]:
inps.shape

torch.Size([1, 512, 768])

In [43]:
hidden[0].shape

torch.Size([1, 1, 256])

In [44]:
hidden[1].shape

torch.Size([1, 1, 256])

In [45]:
out, hidden = lstm_layer(inps, hidden)

In [46]:
out.shape

torch.Size([1, 512, 256])

In [48]:
hidden[0].shape

torch.Size([1, 1, 256])

In [49]:
hidden[1].shape

torch.Size([1, 1, 256])

In [51]:
seq_len = 512
inps = torch.randn(batch_size, seq_len, input_dim)
out, hidden = lstm_layer(inps, hidden)

In [52]:
out = out.squeeze()[-1, :]

# Sentiment Classifier

In [59]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building your LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # 28 time steps
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        # out.size() --> 100, 28, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
        out = self.fc(out[:, -1, :]) 
        # out.size() --> 100, 10
        return out


In [64]:
input_dim = 768
hidden_dim = 256
layer_dim = 1
output_dim = 1
num_epochs = 5

In [61]:
model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)

In [62]:
criterion = nn.BCELoss()

In [63]:
learning_rate = 0.1

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  

In [None]:
# Number of steps to unroll
seq_dim = 512

iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images as a torch tensor with gradient accumulation abilities
        images = images.view(-1, seq_dim, input_dim).requires_grad_()

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                # Resize images
                images = images.view(-1, seq_dim, input_dim)

                # Forward pass only to get logits/output
                outputs = model(images)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))