### Imports


In [1]:
import pyprind
import pandas as pd
import os
import random
import time

In [2]:
import pickle as pkl
from tqdm import tqdm_notebook

In [3]:
basepath = './aclImdb'

In [4]:
from tensorboard import TensorBoard
from AdamW import AdamW

### Load Data

##### We will start by downloading IMDB Movie review dataset text dataset:

```http://ai.stanford.edu/~amaas/data/sentiment/```

In [5]:
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
train_df = pd.DataFrame()
test_df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r',
                      encoding='utf-8') as infile:
                txt = infile.read()
            if s == 'test':
                train_df = train_df.append([[txt, labels[l]]], ignore_index=True)
            elif s == 'train':
                test_df = test_df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()

train_df.columns = ['review', 'sentiment']
test_df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:59


In [6]:
# train test split
from sklearn.model_selection import train_test_split

trn_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [7]:
train_data = trn_df.review.values
train_targets = trn_df.sentiment.values

val_data = val_df.review.values
val_targets = val_df.sentiment.values

test_data = test_df.review.values
test_targets = test_df.sentiment.values

In [8]:
print ("Train dataset size is: {}".format(train_data.shape[0]))
print ("Val dataset size is: {}".format(val_data.shape[0]))
print ("Test dataset size is: {}".format(test_df.shape[0]))

Train dataset size is: 20000
Val dataset size is: 5000
Test dataset size is: 25000


In [9]:
# Random sample from train dataset
import random
print (train_data[random.randint(0, len(train_data) - 1)])

(r#64)<br /><br />Unredeemable, merit-less, and above all dreary trash. You know a movie is going to be bad when its sole star power is Lance Henriksen. The French title for this movie says it all: "Inexplicable". How can you possibly make a movie this unbelievably bad in this day and age? Whatever Jonas Quastel's trick is, it worked. This is über-trash, I'm talking 'Manos'-level crap, meaningless, unwatchable, not-even-so-bad-it's-good, cinematic bile of the highest order.<br /><br />Lance Henriksen IS Harlan Knowles, a character who could have been interesting if he wasn't so utterly devoid of characteristics or personality. He, along with a bunch of morons, goes on a field trip to search for an evil Sasquatch which is believed to have attacked a plane which crashed out in the woods, or something. Not much else happens. There's some soft-core (meaning: Teletubbie level) nudity and some blatant rip-offs of "Predator". After 92 minutes of utter pain and another ripped off scene, this t

Before training the classifier, first we are going to tokenize the dataset using spacy.io

Run (shown in the cell below):

* ```pip install spacy```
* ```python -m spacy download en_core_web_sm```

In [10]:
# Let's write the tokenization function 

import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def tokenize(sent, n=2):
    tokens = tokenizer(sent)
    unigrams = [token.text.lower() for token in tokens if (token.text not in punctuations)]
    ngrams = []
    for k in range(1, n+1):
        for i in range(len(unigrams) - k + 1):
            ngrams.append('_'.join([unigrams[i+j] for j in range(k)]))
    return ngrams

# Example
tokens = tokenize(u'Apple is looking at buying U.K. startup for $1 billion')
print (tokens)

['apple', 'is', 'looking', 'at', 'buying', 'u.k.', 'startup', 'for', '1', 'billion', 'apple_is', 'is_looking', 'looking_at', 'at_buying', 'buying_u.k.', 'u.k._startup', 'startup_for', 'for_1', '1_billion']


### Add n-gram tokenization

##### Remove HTML Tags, Tokenization

In [35]:
# Alternatively try running the following multi-threaded version of tokenization
# Credit to Ilya Kulikov

def tokenize_n_grams(parsed, n=1):    
    unigrams = [token.text.lower() for token in parsed if (token.text not in punctuations)]
    
    # replace HTML symbols
    unigrams = [token.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<") for token in unigrams]
    
    ngrams = []
    for k in range(1, n+1):
        for i in range(len(unigrams) - k + 1):
            ngrams.append('_'.join([unigrams[i+j] for j in range(k)]))
    return ngrams

def tokenize_dataset(dataset, n):
    token_dataset = []
    # we are keeping track of all tokens in dataset
    # in order to create vocabulary later
    all_tokens = []

    for sample in tqdm_notebook(tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512, n_threads=8)):
        tokens = tokenize_n_grams(sample, n)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens


n_gram = 2

# val set tokens
print ("Tokenizing val data")
# val_data_tokens, _ = tokenize_dataset(val_data, n_gram)
# pkl.dump(val_data_tokens, open("val_data_tokens.p", "wb"))

# test set tokens
print ("Tokenizing test data")
# test_data_tokens, _ = tokenize_dataset(test_data, n_gram)
# pkl.dump(test_data_tokens, open("test_data_tokens.p", "wb"))

# train set tokens
print ("Tokenizing train data")
# train_data_tokens, all_train_tokens = tokenize_dataset(train_data, n_gram)
# pkl.dump(train_data_tokens, open("train_data_tokens.p", "wb"))
# pkl.dump(all_train_tokens, open("all_train_tokens.p", "wb"))

Tokenizing val data
Tokenizing test data
Tokenizing train data


In [90]:
# len(val_data[0].split()), len(val_data_tokens[0])

In [13]:
# Then, load preprocessed train, val and test datasets
train_data_tokens = pkl.load(open("train_data_tokens.p", "rb"))
all_train_tokens = pkl.load(open("all_train_tokens.p", "rb"))

val_data_tokens = pkl.load(open("val_data_tokens.p", "rb"))
test_data_tokens = pkl.load(open("test_data_tokens.p", "rb"))

# double checking
print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))
print ("Test dataset size is {}".format(len(test_data_tokens)))

print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 9390700


Now, we are going to create the vocabulary of most common 10,000 tokens in the training set.

In [14]:
from collections import Counter

max_vocab_size = 10000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens)

In [20]:
# Lets check the dictionary by loading random token from it

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 1818 ; token lee
Token lee; token id 1818


In [21]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


##### Now we are going to create PyTorch DataLoader 

In [23]:
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class MovieDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of sentence tokens 
        @param target_list: list of sentence targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def sent_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

# create pytorch dataloader
#train_loader = MovieDataset(train_data_indices, train_targets)
#val_loader = MovieDataset(val_data_indices, val_targets)
#test_loader = MovieDataset(test_data_indices, test_targets)

BATCH_SIZE = 32
train_dataset = MovieDataset(train_data_indices, train_targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=sent_collate_func,
                                           shuffle=True)

val_dataset = MovieDataset(val_data_indices, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=sent_collate_func,
                                           shuffle=True)

test_dataset = MovieDataset(test_data_indices, test_targets)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=sent_collate_func,
                                           shuffle=False)

# for i, (data, lengths, labels) in enumerate(train_loader):
#     print (data)
#     print (labels)
#     break

Here we will define Bag-of-N-Grams model in PyTorch

In [26]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfNGrams(nn.Module):
    """
    BagOfNGrams classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfNGrams, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, 2)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

emb_dim = 100
model = BagOfNGrams(len(id2token), emb_dim)

In [27]:
model

BagOfNGrams(
  (embed): Embedding(10002, 100, padding_idx=0)
  (linear): Linear(in_features=100, out_features=2, bias=True)
)

In [28]:
learning_rate = 1e-3
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# optimizer = AdamW(model.parameters(), lr=learning_rate)


model_dir = f"runs/n/{1}/{time.asctime(time.localtime())}/"
tb = TensorBoard(model_dir)

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

step = 0
for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Loss: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc, loss.item()))
            
            if tb is not None:
                tb.scalar_summary("metric/loss", loss.item(), step)
                tb.scalar_summary("metric/val_acc", val_acc, step)
        step += 1

Epoch: [1/10], Step: [1/625], Validation Acc: 49.5, Loss: 0.6826670169830322
Epoch: [1/10], Step: [101/625], Validation Acc: 55.88, Loss: 0.6843814849853516
Epoch: [1/10], Step: [201/625], Validation Acc: 67.32, Loss: 0.6269193887710571
Epoch: [1/10], Step: [301/625], Validation Acc: 70.82, Loss: 0.6119614243507385
Epoch: [1/10], Step: [401/625], Validation Acc: 73.66, Loss: 0.5884591341018677
Epoch: [1/10], Step: [501/625], Validation Acc: 75.26, Loss: 0.5575915575027466
Epoch: [1/10], Step: [601/625], Validation Acc: 78.02, Loss: 0.47119784355163574
Epoch: [2/10], Step: [1/625], Validation Acc: 78.4, Loss: 0.4938708245754242
Epoch: [2/10], Step: [101/625], Validation Acc: 79.52, Loss: 0.49663034081459045
Epoch: [2/10], Step: [201/625], Validation Acc: 80.56, Loss: 0.46275240182876587
Epoch: [2/10], Step: [301/625], Validation Acc: 81.68, Loss: 0.36143258213996887
Epoch: [2/10], Step: [401/625], Validation Acc: 82.4, Loss: 0.561408519744873
Epoch: [2/10], Step: [501/625], Validation A

In [29]:
print ("After training for {} epochs".format(num_epochs))
print ("Val Acc {}".format(test_model(val_loader, model)))
print ("Test Acc {}".format(test_model(test_loader, model)))

After training for 10 epochs
Val Acc 86.44
Test Acc 85.216


### Model Evaluation

##### Correct  Predictions

In [31]:
val_dataset = MovieDataset(val_data_indices, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=sent_collate_func,
                                           shuffle=False)

step = 0
for index, (data, lengths, labels) in enumerate(val_loader):
    data_batch, length_batch, label_batch = data, lengths, labels
    outputs = F.softmax(model(data_batch, length_batch), dim=1)
    predicted = outputs.max(1, keepdim=True)[1]
    
    predicted_list = predicted.eq(labels.view_as(predicted)).squeeze(1).tolist()

    for idx, value in enumerate(predicted_list):
        if value == 1:
            if len(val_data[idx + index*BATCH_SIZE]) < 200:
                print(f"Predicted: {predicted[idx].item()}, True: {labels[idx].item()}")
                print(val_data[idx + index*BATCH_SIZE], end='\n\n\n')
                step += 1
        
        if step >= 10:
            break
    
    if step >= 3:
        break

Predicted: 0, True: 0
I can't believe this movie has an average rating of 7.0! It is a fiendishly bad movie, and I saw it when it was fairly new, and I was in the age group that is supposed to like it!


Predicted: 1, True: 1
Wonderful movie. Adult content. Lots of erotic scenes plus excellent music and dance scenes. My wife and I absolutely loved this movie and wish they'd make more like it.


Predicted: 0, True: 0
I thought this movie was horrible. I was bored and had to use all the self control I have to not scream at the screen. Mod Squad was beyond cheesy, beyond cliche, and utterly predictable.




##### Incorrect Predictions

In [32]:
val_dataset = MovieDataset(val_data_indices, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=sent_collate_func,
                                           shuffle=False)

step = 0
for index, (data, lengths, labels) in enumerate(val_loader):
    data_batch, length_batch, label_batch = data, lengths, labels
    outputs = F.softmax(model(data_batch, length_batch), dim=1)
    predicted = outputs.max(1, keepdim=True)[1]
    
    predicted_list = predicted.eq(labels.view_as(predicted)).squeeze(1).tolist()

    for idx, value in enumerate(predicted_list):
        if value == 0:
            if len(val_data[idx + index*BATCH_SIZE]) < 200:
                print(f"Predicted: {predicted[idx].item()}, True: {labels[idx].item()}")
                print(val_data[idx + index*BATCH_SIZE], end='\n\n\n')
                step += 1
        
        if step >= 10:
            break
    
    if step >= 3:
        break

Predicted: 0, True: 1
In my opinion, this film has wonderful lighting and even better photography. Too bad the story is not all that good and Mr. Cage sometimes loses his accent. But two thumbs up for lighting and the DP!


Predicted: 0, True: 1
I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one.


Predicted: 1, True: 0
My first thoughts on this film were of using science fiction as a bad way to show naked women, althought not a brilliant story line it had quite a good ending


