In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import re
import os
from torch.utils.data import DataLoader
from argparse import Namespace
import torch
import torch.nn as nn
import torch.optim as optim
# Version discrepancies on this one: newer tqdm releases ask for `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
from tqdm import tqdm_notebook as notebook

In [24]:
%load_ext autoreload
%autoreload 2

from helper.mlp_classifier import SurnameClassifier
from helper.custom_utils import ModelUtils
from helper.mlp_dataset import SurnameDataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Experiment configuration shared by every cell below.
args = Namespace(
    # Locations of the input data and of the serialized artifacts
    surname_csv="../data/surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="../data/model_state",
    # Model size
    hidden_dim=300,
    # Training hyperparameters
    seed=1337,
    num_epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch_size=64,
    # Runtime switches
    cuda=False,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

In [26]:
# Just creating filepaths..not the actual files.
# Each artifact filename is prefixed with args.save_dir. A path whose directory
# already is save_dir is left alone, so re-running this cell does not prepend
# the directory a second time.
if args.expand_filepaths_to_save_dir:
    normalized_save_dir = os.path.normpath(args.save_dir)
    for attr_name in ("vectorizer_file", "model_state_file"):
        current_path = getattr(args, attr_name)
        current_dir = os.path.normpath(os.path.dirname(current_path))
        if current_dir != normalized_save_dir:
            setattr(args, attr_name, os.path.join(args.save_dir, current_path))

    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

Expanded filepaths: 
	../data/model_state/vectorizer.json
	../data/model_state/model.pth


In [27]:
# Check CUDA
# The GPU is used whenever one is available, exactly as before. Assigning the
# availability check directly additionally clears a configured `cuda=True` on a
# machine without CUDA, which would otherwise select a "cuda" device and make
# the later `.to(args.device)` calls fail.
args.cuda = torch.cuda.is_available()

args.device = torch.device("cuda" if args.cuda else "cpu")

In [6]:
# Display the device chosen by the CUDA check, as a sanity check before training.
args.device

device(type='cpu')

In [7]:
# Pass the configured seed and CUDA flag to the seeding helper, then hand the
# save directory to the directory helper.
# NOTE(review): presumably set_seed_everywhere seeds numpy/torch (and CUDA when the
# flag is set) and handle_dirs creates save_dir if it is missing — confirm in
# helper.custom_utils.
ModelUtils.set_seed_everywhere(args.seed,args.cuda)
ModelUtils.handle_dirs(args.save_dir)

In [8]:
# Model initialization: obtain the dataset together with its vectorizer, then
# size the classifier from the vectorizer's two vocabularies.
if not args.reload_from_files:
    # Build the vectorizer from the CSV and persist it for later reloads.
    print("Creating fresh!")
    dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.surname_csv)
    dataset.save_vectorizer(args.vectorizer_file)
else:
    # Reuse the vectorizer that was saved by an earlier run.
    print("Reloading !")
    dataset = SurnameDataset.load_dataset_and_load_vectorizer(
        args.surname_csv, args.vectorizer_file
    )

vectorizer = dataset.get_vectorizer()
num_input_features = len(vectorizer.surname_vocab)
num_output_classes = len(vectorizer.nationality_vocab)
classifier = SurnameClassifier(
    input_dim=num_input_features,
    hidden_dim=args.hidden_dim,
    output_dim=num_output_classes,
)

Creating fresh!


In [9]:
# Put the model and the per-class loss weights on the training device.
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)

# The class weights become CrossEntropyLoss's per-class `weight`, which rescales
# how much each class contributes to the loss.
# NOTE(review): presumably the weights counteract class imbalance in the
# surname data — confirm how SurnameDataset computes them.
loss_func = nn.CrossEntropyLoss(weight=dataset.class_weights)
optimizer = optim.Adam(params=classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate once the monitored value has stopped decreasing
# for longer than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=1
)

train_state = ModelUtils.make_train_state(args)

In [10]:
# Outer progress bar: one tick per epoch.
epoch_bar = notebook(desc='training routine', total=args.num_epochs, position=0)

# The per-split bars are sized by the number of batches in their split, so the
# dataset is switched to each split before its batch count is read.
dataset.set_split('train')
num_train_batches = dataset.get_num_batches(args.batch_size)
train_bar = notebook(desc='split=train', total=num_train_batches, position=1, leave=True)

dataset.set_split('validation')
num_val_batches = dataset.get_num_batches(args.batch_size)
val_bar = notebook(desc='split=validation', total=num_val_batches, position=1, leave=True)


# Training routine. Every epoch runs one optimisation pass over the train split
# and one evaluation pass over the validation split, records the running
# metrics in train_state, lets the helper update the early-stopping state, and
# steps the learning-rate scheduler on the latest validation loss.
# Interrupting the kernel (KeyboardInterrupt) leaves the loop cleanly.
try:
    for epoch_index in range(args.num_epochs):
        # for the first iteration train_state gets default values
        train_state['epoch_index'] = epoch_index

        # ---- training pass ----
        dataset.set_split('train')
        batch_generator = ModelUtils.generate_batches(dataset,
                                                      batch_size=args.batch_size,
                                                      device=args.device)

        # running loss and acc restart from zero every epoch
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            optimizer.zero_grad()

            # forward pass: calling the classifier yields the predictions
            y_pred = classifier(batch_dict['x_surname'])
            loss = loss_func(y_pred, batch_dict['y_nationality'])
            loss_t = loss.item()

            # Incremental mean: after batch k (k = batch_index + 1) the running
            # value equals the average of the k batch losses seen so far,
            # without having to store them.
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            loss.backward()
            optimizer.step()

            # same incremental-mean update for the accuracy
            acc_t = ModelUtils.compute_accuracy(y_pred, batch_dict['y_nationality'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update the running loss and running accuracy for training bar.
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # ---- validation pass ----
        # Switch the split BEFORE building the batch generator, so the batches
        # come from the validation split no matter whether generate_batches
        # reads the dataset lazily or eagerly.
        dataset.set_split("validation")
        batch_generator = ModelUtils.generate_batches(dataset,
                                                      batch_size=args.batch_size,
                                                      device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.eval()

        # No parameters are updated here, so gradient tracking is switched off
        # to avoid building an autograd graph for every validation batch.
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):
                y_pred = classifier(batch_dict['x_surname'])
                loss = loss_func(y_pred, batch_dict['y_nationality'])
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                acc_t = ModelUtils.compute_accuracy(y_pred, batch_dict['y_nationality'])
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = ModelUtils.update_train_state(args=args, model=classifier, train_state=train_state)

        # the scheduler lowers the learning rate when the validation loss plateaus
        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        # rewind the per-split bars for the next epoch
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting Loop")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, description='training routine', style=ProgressStyle(description_width=…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, description='split=train', max=120.0, style=ProgressStyle(description_…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, description='split=validation', max=25.0, style=ProgressStyle(descript…

In [11]:
# Inspect the final training state: early-stopping bookkeeping plus the
# per-epoch loss/accuracy histories collected by the training loop.
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'learing_rate': 0.001,
 'epoch_index': 99,
 'train_loss': [2.725182360410691,
  2.2382738063732783,
  1.9208319107691445,
  1.7678079863389333,
  1.667573764920235,
  1.6138932079076773,
  1.554712516069412,
  1.5177146603663765,
  1.4833445241053902,
  1.432928857704004,
  1.4171951919794086,
  1.399843825896581,
  1.3862092037995655,
  1.3778129185239474,
  1.3715325380365053,
  1.368757975101471,
  1.3635253235697742,
  1.3546929424007734,
  1.3466340313355125,
  1.3362543731927874,
  1.3321321263909338,
  1.3288516332705818,
  1.3341496845086418,
  1.3368919665614758,
  1.3383843337496117,
  1.328145331144333,
  1.3340359235803287,
  1.3321303715308517,
  1.3289497936765349,
  1.335713585714499,
  1.3306596636772154,
  1.3326024522384012,
  1.3248027041554447,
  1.3259046648939454,
  1.3312854493657753,
  1.323997878531615,
  1.3190348515907926,
  1.331933689117432,
  1.3217384482423464,
  1.

In [12]:
## Re-initialize the classifier (left commented out: the trained instance is reused below)
# classifier = SurnameClassifier(input_dim = len(vectorizer.surname_vocab),
#                               hidden_dim = args.hidden_dim,
#                               output_dim = len(vectorizer.nationality_vocab))

In [13]:
# Load the saved weights into the classifier. `map_location` remaps the stored
# tensors onto args.device, which is required when deserializing onto the CPU.
saved_state = torch.load(train_state['model_filename'], map_location=args.device)
classifier.load_state_dict(saved_state)

<All keys matched successfully>

In [14]:
# Each batch has to be predicted and computed

In [15]:
# Evaluate the classifier on the test split and store the running-mean loss and
# accuracy in train_state.
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = ModelUtils.generate_batches(dataset,
                                   batch_size=args.batch_size,
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

# Evaluation only: no parameters are updated, so gradient tracking is switched
# off to avoid building an autograd graph for every batch.
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        # this is the pretrained classifier aka the model.
        y_pred = classifier(batch_dict['x_surname'])

        # compute the loss and fold it into the incremental mean
        loss = loss_func(y_pred, batch_dict['y_nationality'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy and fold it into the incremental mean
        acc_t = ModelUtils.compute_accuracy(y_pred, batch_dict['y_nationality'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [16]:
# Show the test-split loss and accuracy stored by the evaluation cell.
train_state['test_loss'], train_state['test_acc']

(1.8359884548187257, 44.93749999999999)

In [17]:
# All tensors must go to device

def predict_nationality(surname, classifier, vectorizer, device=args.device):
    """Predict the most likely nationality for a single surname.

    Args:
        surname (str): the surname to classify
        classifier (SurnameClassifier): an instance of the classifier
        vectorizer (SurnameVectorizer): the corresponding vectorizer
        device (torch.device): device the input tensor is moved to before the
            forward pass; it has to match the device the classifier lives on.
            Defaults to the value `args.device` had when this function was
            defined.
    Returns:
        a dictionary with the most likely nationality and its probability
    """
    vectorized_surname = vectorizer.vectorize(surname)
    # view(1, -1) turns the single vector into a batch of one
    vectorized_surname = torch.tensor(vectorized_surname).view(1, -1)
    # must send the new tensor to device
    vectorized_surname = vectorized_surname.to(device)

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        result = classifier(vectorized_surname, apply_softmax=True)

    probability_values, indices = result.max(dim=1)
    index = indices.item()

    predicted_nationality = vectorizer.nationality_vocab.lookup_index(index)
    probability_value = probability_values.item()

    return {'nationality': predicted_nationality, 'probability': probability_value}

In [18]:
# Ask for a surname, make sure the model sits on the configured device, then
# report the single most likely nationality.
new_surname = input("Enter a surname to classify: ")
classifier = classifier.to(args.device)
prediction = predict_nationality(new_surname, classifier, vectorizer)
report_line = "{} -> {} (p={:0.2f})".format(
    new_surname, prediction['nationality'], prediction['probability']
)
print(report_line)


KeyboardInterrupt: 

In [None]:
def predict_topk_nationality(name, classifier, vectorizer, k=5, device=args.device):
    """Predict the k most likely nationalities for a single surname.

    Args:
        name (str): the surname to classify
        classifier (SurnameClassifier): an instance of the classifier
        vectorizer (SurnameVectorizer): the corresponding vectorizer
        k (int): how many predictions to return; values larger than the number
            of classes the classifier outputs are reduced to that number
        device (torch.device): device the input tensor is moved to before the
            forward pass; it has to match the device the classifier lives on.
            Defaults to the value `args.device` had when this function was
            defined.
    Returns:
        a list of dictionaries, each with a 'nationality' and its
        'probability', in the order returned by torch.topk (largest first)
    """
    vectorized_name = vectorizer.vectorize(name)
    # view(1, -1) turns the single vector into a batch of one
    vectorized_name = torch.tensor(vectorized_name).view(1, -1)
    vectorized_name = vectorized_name.to(device)

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        prediction_vector = classifier(vectorized_name, apply_softmax=True)

    # torch.topk raises when k exceeds the size of the dimension it searches
    # (the last one by default), so cap k at that size.
    k = min(k, prediction_vector.size(-1))
    probability_values, indices = torch.topk(prediction_vector, k=k)

    # returned size is 1,k
    # a tensor has to be on the CPU before it can be converted to a numpy array
    probability_values = probability_values.detach().cpu().numpy()[0]
    indices = indices.detach().cpu().numpy()[0]

    results = []
    for prob_value, index in zip(probability_values, indices):
        nationality = vectorizer.nationality_vocab.lookup_index(index)
        results.append({'nationality': nationality,
                        'probability': prob_value})

    return results

In [None]:
# Ask for a surname and for how many predictions to show, cap the request at
# the number of known nationalities, then print the ranked predictions.
new_surname = input("Enter a surname to classify: ")
classifier = classifier.to(args.device)

k = int(input("How many of the top predictions to see? "))
max_k = len(vectorizer.nationality_vocab)
if k > max_k:
    print("Sorry! That's more than the # of nationalities we have.. defaulting you to max size :)")
    k = max_k

predictions = predict_topk_nationality(new_surname, classifier, vectorizer, k=k)

print("Top {} predictions:".format(k))
print("===================")
line_template = "{} -> {} (p={:0.2f})"
for prediction in predictions:
    print(line_template.format(new_surname,
                               prediction['nationality'],
                               prediction['probability']))

### Limitations of MLP

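+ Why are "bojac" and "jacob" classified as the same nationality with identical probability? (Presumably the vectorizer ignores character order, so anagrams produce the same input vector; this should be confirmed against the vectorizer.)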
+ Predictions are case-sensitive: "Rahman" --> Irish, but "rahman" --> German.