## Libs and Parameters

In [20]:
import numpy as np
import os
import torch
from sklearn.feature_extraction import stop_words
from importlib import reload

import IMDBDatum as imdb_data
import ngrams

from tqdm import tqdm_notebook as tqdm
from tqdm import tnrange
import pickle

#### Hyperparameters

In [2]:
# --- Optimisation settings ---
# NOTE(review): the training cell further down assigns its own `learning_rate`
# and `num_epochs`, so LEARNING_RATE / TRAINING_EPOCHS are not read there.
LEARNING_RATE = 1e-3
TRAINING_EPOCHS = 5
BATCH_SIZE = 32  # examples per batch handed to every DataLoader

# --- N-gram / vocabulary settings ---
NGRAM_SIZE = 2  # one of (1, 2, 3, 4)
VOC_SIZE = 10000  # keep only the top-n entries of the vocabulary
EMBEDDING_DIM = 100  # width of each n-gram embedding vector
NGRAM_MODE = 'naive'  # forwarded as `mode=` to ngrams.process_text_dataset

#### Other params

In [3]:
# Index used for padding positions.
# NOTE(review): presumably has to agree with the padding index of the embedding layer - confirm.
PAD_IDX = 0

# Root of the extracted aclImdb archive and its two labelled splits.
data_dir = r'./data/aclImdb/'
train_dir, test_dir = (os.path.join(data_dir, split) for split in ("train", "test"))

# Sizes passed to imdb_data.construct_dataset for each split.
TRAIN_SIZE, VALIDATION_SIZE, TEST_SIZE = 20000, 5000, 25000

### I. Data Loading

In [4]:
# Show the directory layout (directories only) to confirm where the aclImdb
# splits live; this shells out, so it needs the `tree` command on the PATH.
!tree -d

.
|-- __pycache__
`-- data
    `-- aclImdb
        |-- test
        |   |-- neg
        |   `-- pos
        `-- train
            |-- neg
            |-- pos
            `-- unsup

10 directories


In [64]:
# Build the three splits (expected to finish in under a minute).
# Re-import the helper module first so edits to it are picked up without a kernel restart.
reload(imdb_data)

train_set = imdb_data.construct_dataset(train_dir, TRAIN_SIZE)

# Validation examples are also read from the training directory, starting at
# an offset of half the training size.
# NOTE(review): presumably the offset is per class, which would keep train and
# validation disjoint - confirm in imdb_data.construct_dataset.
validation_set = imdb_data.construct_dataset(
    train_dir,
    VALIDATION_SIZE,
    offset=TRAIN_SIZE // 2,
)

test_set = imdb_data.construct_dataset(test_dir, TEST_SIZE)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




### II. Extracting N-grams

In [63]:
# Re-import the helper module so local edits take effect without restarting the kernel.
reload(ngrams)

# The n-gram index is built from the training split only.
train_data, train_ngram_indexer, ngram_counter = ngrams.process_text_dataset(
    train_set, NGRAM_SIZE, VOC_SIZE, mode=NGRAM_MODE
)

# Validation and test reuse `train_ngram_indexer` instead of building their own:
# the model's embedding rows are tied to the training indices, so held-out text
# has to be mapped with that same index.
validation_data, _, _ = ngrams.process_text_dataset(
    validation_set, NGRAM_SIZE, ngram_indexer=train_ngram_indexer, mode=NGRAM_MODE
)

test_data, _, _ = ngrams.process_text_dataset(
    test_set, NGRAM_SIZE, ngram_indexer=train_ngram_indexer, mode=NGRAM_MODE
)

extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=20000), HTML(value='')))


constructing ngram_indexer ...


HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))


setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=20000), HTML(value='')))


extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=5000), HTML(value='')))


already have a passed ngram_indexer ...
setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=5000), HTML(value='')))


extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=25000), HTML(value='')))


already have a passed ngram_indexer ...
setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=25000), HTML(value='')))




### III. Data Pipeline

In [73]:
# Pick up any edits to the helper module before using its classes.
reload(imdb_data)

# Wrap each indexed split in the project's Dataset class.
imdb_train = imdb_data.IMDBDataset(train_data)
imdb_validation = imdb_data.IMDBDataset(validation_data)
imdb_test = imdb_data.IMDBDataset(test_data)


def _make_loader(dataset, shuffle):
    """Return a DataLoader over `dataset` using the notebook-wide batch size and collate function."""
    return torch.utils.data.DataLoader(dataset=dataset,
                                       batch_size=BATCH_SIZE,
                                       collate_fn=imdb_data.imdb_collate_func,
                                       shuffle=shuffle)


# Only the training split is shuffled; validation and test keep their order.
train_loader = _make_loader(imdb_train, shuffle=True)
val_loader = _make_loader(imdb_validation, shuffle=False)
test_loader = _make_loader(imdb_test, shuffle=False)

At this point we have data loaders for the train, validation, and test splits. Each one:
- is a `torch.utils.data.DataLoader` object;
- wraps a dataset object that implements `__len__` and `__getitem__`;
- is configured with a `batch_size` and a `shuffle` flag;
- uses a collate function that takes a batch of examples and returns the tensors that flow into the model.


### IV. Model Definition

In [75]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model.

    Embeds every token index of a review, averages the embeddings over the
    review's length and maps the averaged vector to class logits with a single
    linear layer.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20, padding_idx=0):
        """
        @param vocab_size: size of the vocabulary (number of rows in the embedding table).
        @param emb_dim: size of the word embedding
        @param num_classes: number of logits produced per example. Defaults to 20, the
            value that used to be hard-coded; a two-class task only needs 2.
        @param padding_idx: index whose embedding is fixed to zeros, so padded positions
            add nothing to the sum. Defaults to 0, the value that used to be hard-coded.
        """
        super(BagOfWords, self).__init__()
        # The padding row stays all-zero, so summing over a padded row only
        # accumulates the real tokens.
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.linear = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, num_classes)
        """
        summed = self.embed(data).sum(dim=1)
        # Clamp the divisor to at least 1: a row made only of padding has length 0
        # and would otherwise yield 0 / 0 = NaN logits that poison the loss.
        divisor = length.view(-1, 1).clamp(min=1).to(summed.dtype)
        averaged = summed / divisor

        # return logits
        return self.linear(averaged)

# One embedding row per entry of the training n-gram index.
model = BagOfWords(vocab_size=len(train_ngram_indexer), emb_dim=EMBEDDING_DIM)

### V. Training Loop

In [76]:
# NOTE(review): these two values are set here and are what the run actually
# uses; the LEARNING_RATE / TRAINING_EPOCHS constants from the hyperparameter
# cell are not read. Confirm which pair is intended.
num_epochs = 1  # number of passes over the training set
learning_rate = 0.01

# Cross-entropy is applied to the raw logits returned by the model;
# Adam updates every parameter the model exposes.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)


# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [77]:
for epoch in tqdm(range(num_epochs)):
    for step, (batch_data, batch_lengths, batch_labels) in enumerate(train_loader):
        # test_model() puts the model in eval mode, so switch back on every step.
        model.train()

        # Standard update: clear old gradients, forward, loss, backward, step.
        optimizer.zero_grad()
        logits = model(batch_data, batch_lengths)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        # Report validation accuracy on every 100th step, skipping step 0.
        if step == 0 or step % 100 != 0:
            continue
        val_acc = test_model(val_loader, model)
        print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
            epoch + 1, num_epochs, step + 1, len(train_loader), val_acc))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch: [1/1], Step: [101/625], Validation Acc: 79.98
Epoch: [1/1], Step: [201/625], Validation Acc: 84.34
Epoch: [1/1], Step: [301/625], Validation Acc: 84.06
Epoch: [1/1], Step: [401/625], Validation Acc: 85.54
Epoch: [1/1], Step: [501/625], Validation Acc: 86.56
Epoch: [1/1], Step: [601/625], Validation Acc: 85.6



### VI. Test Set Evaluation

In [78]:
# Accuracy (in percent) on the held-out test split; the bare name on the last
# line makes the notebook display the value.
test_acc = test_model(loader=test_loader, model=model)
test_acc

85.952