## Libs and Parameters

In [191]:
import numpy as np
import os
import torch
from sklearn.feature_extraction import stop_words
from importlib import reload
import matplotlib.pyplot as plt
%matplotlib inline

import IMDBDatum as imdb_data
import BagOfWords as bow
import ngrams

import torch.nn.functional as F

from tqdm import tqdm_notebook as tqdm
from tqdm import tnrange
import pickle
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.56 ms


#### Hyperparameters

In [94]:
BATCH_SIZE = 32

NGRAM_SIZE = 2 # (1, 2, 3, 4)
VOC_SIZE = 10000 # takes top n word from the vocab
EMBEDDING_DIM = 100 # dimension size for the ngram embeddings
NGRAM_MODE = 'naive'

time: 593 µs


#### Other params

In [95]:
PAD_IDX = 0
data_dir = r'./data/aclImdb/'
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
TRAIN_SIZE = 20000
VALIDATION_SIZE = 5000
TEST_SIZE = 25000

time: 757 µs


### CUDA

In [201]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

time: 2.42 ms


### I. Data Loading

In [180]:
!tree -d

[01;34m.[00m
|-- [01;34m__pycache__[00m
`-- [01;34mdata[00m
    `-- [01;34maclImdb[00m
        |-- [01;34mtest[00m
        |   |-- [01;34mneg[00m
        |   `-- [01;34mpos[00m
        `-- [01;34mtrain[00m
            |-- [01;34mneg[00m
            |-- [01;34mpos[00m
            `-- [01;34munsup[00m

10 directories
time: 338 ms


In [181]:
# Load Dataset - should take less than 1 min
reload(imdb_data)
train_set = imdb_data.construct_dataset(train_dir, TRAIN_SIZE)
validation_set = imdb_data.construct_dataset(train_dir, VALIDATION_SIZE, offset=int(TRAIN_SIZE/2))
test_set = imdb_data.construct_dataset(test_dir, TEST_SIZE)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))


time: 1.24 s


### II. Extracting N-grams

In [182]:
reload(ngrams)
# Note that we are using the train_ngram_indexer to index validation and test dataset. Why? 
train_data, train_ngram_indexer, ngram_counter = ngrams.process_text_dataset(train_set, 
                                                                             NGRAM_SIZE, 
                                                                             VOC_SIZE, 
                                                                             mode=NGRAM_MODE)

validation_data, _, _ = ngrams.process_text_dataset(validation_set, 
                                                    NGRAM_SIZE, 
                                                    ngram_indexer=train_ngram_indexer,
                                                    mode=NGRAM_MODE)

test_data, _, _ = ngrams.process_text_dataset(test_set, 
                                              NGRAM_SIZE, 
                                              ngram_indexer=train_ngram_indexer, 
                                              mode=NGRAM_MODE)

extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=20000), HTML(value='')))


constructing ngram_indexer ...


HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))


setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=20000), HTML(value='')))


extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=5000), HTML(value='')))


already have a passed ngram_indexer ...
setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=5000), HTML(value='')))


extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=25000), HTML(value='')))


already have a passed ngram_indexer ...
setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=25000), HTML(value='')))


time: 14.2 s


### III. Data Pipeline

In [196]:
reload(imdb_data)
imdb_train = imdb_data.IMDBDataset(train_data)
imdb_validation = imdb_data.IMDBDataset(validation_data)
imdb_test = imdb_data.IMDBDataset(test_data)    

train_loader = torch.utils.data.DataLoader(dataset=imdb_train, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_data.imdb_collate_func,
                                           shuffle=True)

val_loader = torch.utils.data.DataLoader(dataset=imdb_validation, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=imdb_data.imdb_collate_func,
                                           shuffle=False)

test_loader = torch.utils.data.DataLoader(dataset=imdb_test, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_data.imdb_collate_func,
                                           shuffle=False)

time: 6.67 ms


At this point we have loaders for Train, Test, Val. Each of which:
- is a DataLoader Object.
- has a dataset object that implements __len__ and __getitem__
- has a batch_size, shuffle variables passed
- has a collate function that takes the batch of data and returns the Tensors that flow to the model
- the collated data is a list of length 3
    - [0] is the concatenated padded token_idx tensor
    - [1] is tensor of the list of lengths
    - [2] is tensor of labels


### IV. Model Definition

In [202]:
reload(bow)
model = bow.BagOfWords(len(train_ngram_indexer), EMBEDDING_DIM).to(device)

time: 8.25 ms


#### Check that the initialized model is roughly random

In [225]:
print("initial validation error: %s" % test_model(val_loader, model))

initial validation error: 49.96
time: 318 ms


### V. Training Loop

In [247]:
LR = 0.01
NEPOCH = 1 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()  # good practice to set the model to evaluation mode (no dropout)
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


def earily_stop(val_acc_history, t=2, required_progress=0.01):
    """
    Stop the training if there is no non-trivial progress in k steps
    @param val_acc_history: a list contains all the historical validation acc
    @param required_progress: the next acc should be higher than the previous by 
        at least required_progress amount to be non-trivial
    @param t: number of training steps 
    @return: a boolean indicates if the model should earily stop
    """
    if len(val_acc_history) > t + 1 and val_acc_history[-t - 1] == max(val_acc_history[-t - 1:]):
        return True
    return False



time: 2.56 ms


In [251]:
validation_acc_history = []
stop_training = False

for epoch in tnrange(NEPOCH, desc='Epochs'):
    for i, (data, lengths, labels) in enumerate(tqdm(train_loader)):
        model.train()  # good practice to set the model to training mode (dropout)
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 4 batches
        if (i+1) % (BATCH_SIZE * 4) == 0:
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, NEPOCH, i+1, len(train_loader), val_acc))
            
            validation_acc_history.append(val_acc)
            # check if we need to earily stop the model
            stop_training = earily_stop(validation_acc_history)
            if stop_training:
                print("--- earily stop triggered ---")
                break
        if stop_training:
            break

HBox(children=(IntProgress(value=0, description='Epochs', max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

Epoch: [1/10], Step: [128/625], Validation Acc: 85.12
Epoch: [1/10], Step: [256/625], Validation Acc: 85.54
Epoch: [1/10], Step: [384/625], Validation Acc: 86.14
Epoch: [1/10], Step: [512/625], Validation Acc: 84.94


HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

Epoch: [2/10], Step: [128/625], Validation Acc: 85.18
--- earily stop triggered ---


HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))


time: 5.28 s


### Testing Reporting

In [252]:
test_acc = test_model(test_loader, model)
test_acc

83.576

time: 1.43 s
