<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-&amp;-Inits" data-toc-modified-id="Imports-&amp;-Inits-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports &amp; Inits</a></span></li><li><span><a href="#Data-Preparation" data-toc-modified-id="Data-Preparation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Preparation</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Testing" data-toc-modified-id="Testing-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Testing</a></span><ul class="toc-item"><li><span><a href="#Ignite-Testing" data-toc-modified-id="Ignite-Testing-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Ignite Testing</a></span></li><li><span><a href="#NLPBook-Testing" data-toc-modified-id="NLPBook-Testing-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>NLPBook Testing</a></span></li></ul></li><li><span><a href="#Inference" data-toc-modified-id="Inference-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Inference</a></span><ul class="toc-item"><li><span><a href="#Single-Inference" data-toc-modified-id="Single-Inference-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Single Inference</a></span></li><li><span><a href="#TopK-Inference" data-toc-modified-id="TopK-Inference-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>TopK Inference</a></span></li></ul></li></ul></div>

# Surname Classifier with MLP

Classifying surnames based on national origin.

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import torch

from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from pathlib import Path

In [3]:
from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [4]:
from surname.dataset import MLPDataset
from surname.model import MLPClassifier
from surname.trainer import Trainer 
from surname.args import mlp_args as args

In [5]:
path = Path('../data/surnames')

## Data Preparation

In [6]:
work_dir = path/'work_dir'
surnames_csv = path/args.proc_dataset_csv
vectorizer_path = work_dir/args.vectorizer_fname
cw_file = work_dir/args.cw_file

args

Namespace(batch_size=64, checkpointer_name='mlp_classifier', checkpointer_prefix='surname', cw_file='class_weights.pt', device='cuda:3', early_stopping_criteria=5, hidden_dim=300, learning_rate=0.001, model_dir='models', num_epochs=100, proc_dataset_csv='surnames_with_splits.csv', raw_dataset_csv='surnames.csv', save_every=2, save_total=5, train_proportion=0.7, vectorizer_fname='mlp_vectorizer.json')

In [7]:
df = pd.read_csv(surnames_csv)
len(df)

10980

In [8]:
is_load = True

In [9]:
if not is_load:
  train_ds = MLPDataset.load_data_and_create_vectorizer(df.loc[df['split'] == 'train'])
  train_ds.save_vectorizer(vectorizer_path)
  vectorizer = train_ds.get_vectorizer()
  class_counts = df['nationality'].value_counts().to_dict()
  sorted_counts = sorted(class_counts.items(), key=lambda x: vectorizer.nationality_vocab.lookup_token(x[0]))
  freq = [count for _, count in sorted_counts]
  class_weights = 1.0/torch.tensor(freq, dtype=torch.float32)
  torch.save(class_weights, cw_file)

In [10]:
train_df = df.loc[df['split'] == 'train']
train_ds = MLPDataset.load_data_and_vectorizer(train_df, vectorizer_path)
vectorizer = train_ds.get_vectorizer()
class_weights = torch.load(cw_file)
train_dl = DataLoader(train_ds, args.batch_size, shuffle=True, drop_last=True)

val_df = df.loc[df['split'] == 'val']
val_ds = MLPDataset.load_data_and_vectorizer(val_df, vectorizer_path)
val_dl = DataLoader(val_ds, args.batch_size, shuffle=True, drop_last=True)

data_bundle = {
  'train_dl': train_dl,
  'val_dl': val_dl
}

test_df = df.loc[df['split'] == 'test']
test_ds = MLPDataset.load_data_and_vectorizer(test_df, vectorizer_path)
test_dl = DataLoader(test_ds, args.batch_size, shuffle=True, drop_last=True)

In [11]:
len(train_dl.dataset), len(val_dl.dataset), len(test_dl.dataset)

(7680, 1640, 1660)

## Model

In [12]:
classifier = MLPClassifier(input_dim=len(vectorizer.surname_vocab), hidden_dim=args.hidden_dim,
                        output_dim=len(vectorizer.nationality_vocab))
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)
class_weights = class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(class_weights)

In [13]:
model_bundle = {
  'module': classifier,
  'optimizer' : optimizer,
  'scheduler': scheduler,
  'loss_fn': loss_func
}

In [14]:
pbar = ProgressBar(persist=True)
metrics = {'accuracy': Accuracy(), 'loss': Loss(loss_func)}

## Training

In [15]:
surname_trainer = Trainer(model_bundle, data_bundle, work_dir, args, pbar, metrics)
surname_trainer.run()

Epoch [1/100]: [120/120] 100%|██████████, loss=2.72e+00 [00:01<00:00]
Epoch [2/100]: [16/120]  13%|█▎        , loss=2.53e+00 [00:00<00:01]

Epoch: 1
Training - Loss: 2.534, Accuracy: 0.396
Validation - Loss: 2.562, Accuracy: 0.398


Epoch [2/100]: [120/120] 100%|██████████, loss=2.34e+00 [00:01<00:00]
Epoch [3/100]: [15/120]  12%|█▎        , loss=2.31e+00 [00:00<00:01]

Epoch: 2
Training - Loss: 2.133, Accuracy: 0.403
Validation - Loss: 2.210, Accuracy: 0.391


Epoch [3/100]: [120/120] 100%|██████████, loss=2.09e+00 [00:01<00:00]
Epoch [4/100]: [16/120]  13%|█▎        , loss=2.01e+00 [00:00<00:01]

Epoch: 3
Training - Loss: 1.903, Accuracy: 0.430
Validation - Loss: 2.056, Accuracy: 0.398


Epoch [4/100]: [120/120] 100%|██████████, loss=1.93e+00 [00:01<00:00]
Epoch [5/100]: [15/120]  12%|█▎        , loss=1.68e+00 [00:00<00:01]

Epoch: 4
Training - Loss: 1.766, Accuracy: 0.465
Validation - Loss: 1.955, Accuracy: 0.416


Epoch [5/100]: [120/120] 100%|██████████, loss=1.81e+00 [00:01<00:00]
Epoch [6/100]: [16/120]  13%|█▎        , loss=1.96e+00 [00:00<00:01]

Epoch: 5
Training - Loss: 1.679, Accuracy: 0.483
Validation - Loss: 1.874, Accuracy: 0.438


Epoch [6/100]: [120/120] 100%|██████████, loss=1.77e+00 [00:01<00:00]
Epoch [7/100]: [15/120]  12%|█▎        , loss=1.75e+00 [00:00<00:01]

Epoch: 6
Training - Loss: 1.605, Accuracy: 0.488
Validation - Loss: 1.846, Accuracy: 0.451


Epoch [7/100]: [120/120] 100%|██████████, loss=1.70e+00 [00:01<00:00]
Epoch [8/100]: [16/120]  13%|█▎        , loss=1.78e+00 [00:00<00:01]

Epoch: 7
Training - Loss: 1.553, Accuracy: 0.480
Validation - Loss: 1.832, Accuracy: 0.433


Epoch [8/100]: [120/120] 100%|██████████, loss=1.66e+00 [00:01<00:00]
Epoch [9/100]: [16/120]  13%|█▎        , loss=1.61e+00 [00:00<00:01]

Epoch: 8
Training - Loss: 1.505, Accuracy: 0.478
Validation - Loss: 1.790, Accuracy: 0.433


Epoch [9/100]: [120/120] 100%|██████████, loss=1.61e+00 [00:01<00:00]
Epoch [10/100]: [16/120]  13%|█▎        , loss=1.62e+00 [00:00<00:01]

Epoch: 9
Training - Loss: 1.480, Accuracy: 0.505
Validation - Loss: 1.794, Accuracy: 0.458


Epoch [10/100]: [120/120] 100%|██████████, loss=1.58e+00 [00:01<00:00]
Epoch [11/100]: [15/120]  12%|█▎        , loss=1.69e+00 [00:00<00:01]

Epoch: 10
Training - Loss: 1.448, Accuracy: 0.492
Validation - Loss: 1.787, Accuracy: 0.450


Epoch [11/100]: [120/120] 100%|██████████, loss=1.55e+00 [00:01<00:00]
Epoch [12/100]: [16/120]  13%|█▎        , loss=1.27e+00 [00:00<00:01]

Epoch: 11
Training - Loss: 1.422, Accuracy: 0.497
Validation - Loss: 1.766, Accuracy: 0.449


Epoch [12/100]: [120/120] 100%|██████████, loss=1.51e+00 [00:01<00:00]
Epoch [13/100]: [15/120]  12%|█▎        , loss=1.41e+00 [00:00<00:01]

Epoch: 12
Training - Loss: 1.390, Accuracy: 0.498
Validation - Loss: 1.733, Accuracy: 0.451


Epoch [13/100]: [120/120] 100%|██████████, loss=1.46e+00 [00:01<00:00]
Epoch [14/100]: [16/120]  13%|█▎        , loss=1.50e+00 [00:00<00:01]

Epoch: 13
Training - Loss: 1.363, Accuracy: 0.506
Validation - Loss: 1.732, Accuracy: 0.459


Epoch [14/100]: [120/120] 100%|██████████, loss=1.49e+00 [00:01<00:00]
Epoch [15/100]: [15/120]  12%|█▎        , loss=1.42e+00 [00:00<00:01]

Epoch: 14
Training - Loss: 1.344, Accuracy: 0.511
Validation - Loss: 1.739, Accuracy: 0.459


Epoch [15/100]: [120/120] 100%|██████████, loss=1.45e+00 [00:01<00:00]
Epoch [16/100]: [16/120]  13%|█▎        , loss=1.49e+00 [00:00<00:01]

Epoch: 15
Training - Loss: 1.316, Accuracy: 0.510
Validation - Loss: 1.748, Accuracy: 0.461


Epoch [16/100]: [120/120] 100%|██████████, loss=1.43e+00 [00:01<00:00]
Epoch [17/100]: [15/120]  12%|█▎        , loss=1.53e+00 [00:00<00:01]

Epoch: 16
Training - Loss: 1.303, Accuracy: 0.527
Validation - Loss: 1.747, Accuracy: 0.472


Epoch [17/100]: [120/120] 100%|██████████, loss=1.42e+00 [00:01<00:00]
Epoch [18/100]: [16/120]  13%|█▎        , loss=1.64e+00 [00:00<00:01]

Epoch: 17
Training - Loss: 1.283, Accuracy: 0.533
Validation - Loss: 1.706, Accuracy: 0.481


Epoch [18/100]: [120/120] 100%|██████████, loss=1.43e+00 [00:01<00:00]
Epoch [19/100]: [14/120]  12%|█▏        , loss=1.53e+00 [00:00<00:01]

Epoch: 18
Training - Loss: 1.282, Accuracy: 0.529
Validation - Loss: 1.716, Accuracy: 0.475


Epoch [19/100]: [120/120] 100%|██████████, loss=1.42e+00 [00:01<00:00]
Epoch [20/100]: [16/120]  13%|█▎        , loss=1.36e+00 [00:00<00:01]

Epoch: 19
Training - Loss: 1.267, Accuracy: 0.534
Validation - Loss: 1.729, Accuracy: 0.477


Epoch [20/100]: [120/120] 100%|██████████, loss=1.39e+00 [00:01<00:00]
Epoch [21/100]: [15/120]  12%|█▎        , loss=1.37e+00 [00:00<00:01]

Epoch: 20
Training - Loss: 1.266, Accuracy: 0.536
Validation - Loss: 1.728, Accuracy: 0.474


Epoch [21/100]: [120/120] 100%|██████████, loss=1.36e+00 [00:01<00:00]
Epoch [22/100]: [16/120]  13%|█▎        , loss=1.93e+00 [00:00<00:01]

Epoch: 21
Training - Loss: 1.252, Accuracy: 0.540
Validation - Loss: 1.729, Accuracy: 0.487


Epoch [22/100]: [120/120] 100%|██████████, loss=1.41e+00 [00:01<00:00]


Epoch: 22
Training - Loss: 1.248, Accuracy: 0.539
Validation - Loss: 1.717, Accuracy: 0.484


## Testing

### Ignite Testing

In [16]:
args.device = 'cpu'
classifier = MLPClassifier(input_dim=len(vectorizer.surname_vocab), hidden_dim=args.hidden_dim,\
                        output_dim=len(vectorizer.nationality_vocab))
state_dict = torch.load(work_dir/'surname_mlp_classifier.pth')
classifier.load_state_dict(state_dict)

class_weights = class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(class_weights)
metrics = {'accuracy': Accuracy(), 'loss': Loss(loss_func)}

In [17]:
evaluator = create_supervised_evaluator(classifier, metrics=metrics)

@evaluator.on(Events.COMPLETED)
def log_testing_results(engine):
  metrics = engine.state.metrics
  print(f"Test loss: {metrics['loss']:0.3f}")
  print(f"Test accuracy: {metrics['accuracy']:0.3f}")

In [18]:
evaluator.run(test_dl)

Test loss: 1.677
Test accuracy: 0.493


<ignite.engine.engine.State at 0x7f1727043f98>

### NLPBook Testing

In [19]:
def compute_accuracy(y_pred, y_target):
  _, y_pred_indices = y_pred.max(dim=1)
  n_correct = torch.eq(y_pred_indices, y_target).sum().item()
  return n_correct / len(y_pred_indices) * 100

In [20]:
running_loss = 0.
running_acc = 0.

classifier.eval()
for i, batch in enumerate(test_dl):
  x,y = batch
  y_pred = classifier(x_in=x.float())
  
  loss = loss_func(y_pred, y)
  loss_t = loss.item()
  running_loss += (loss_t-running_loss)/(i+1)
  
  acc_t = compute_accuracy(y_pred, y)
  running_acc += (acc_t-running_acc)/(i+1)

In [21]:
print(f"Test loss: {running_loss:0.3f}")
print(f"Test acc: {running_acc:0.3f}")

Test loss: 1.686
Test acc: 48.750


## Inference

### Single Inference

In [None]:
def predict_natinoality(surname, classifier, vectorizer):
  """
    Predict the nationality from a new surname
    
    Args:
      surname: the surname to classify
      classifier: an instance of the classifier
      vectorizer: the corresponding vectorizer
      
    Returns:
      a dictionary with most likely natinoality and its probability
  """
  vectorized_surname = vectorizer.vectorize(surname)
  vectorized_surname = torch.tensor(vectorized_surname).view(1,-1)
  result = classifier(vectorized_surname, apply_softmax=True)
  
  probability_values, indices = result.max(dim=1)
  idx = indices.item()
  
  predicted_nationality = vectorizer.nationality_vocab.lookup_idx(idx)
  probability_value = probability_values.item()
  
  return {'nationality': predicted_nationality, 'probability': probability_value}

In [None]:
new_surname = input("Enter a surname to classify: ")
prediction = predict_natinoality(new_surname, classifier, vectorizer)
print(f"{new_surname} -> {prediction['nationality']} p={prediction['probability']:0.2f}")

### TopK Inference

In [None]:
def predict_topk_nationality(name, classifier, vectorizer, k=5):
  vectorized_name = vectorizer.vectorize(name)
  vectorized_name = torch.tensor(vectorized_name).view(1, -1)
  prediction_vector = classifier(vectorized_name, apply_softmax=True)
  probability_values, indices = torch.topk(prediction_vector, k=k)

  # returned size is 1,k
  probability_values = probability_values.detach().numpy()[0]
  indices = indices.detach().numpy()[0]

  results = []
  for prob_value, idx in zip(probability_values, indices):
      nationality = vectorizer.nationality_vocab.lookup_idx(idx)
      results.append({'nationality': nationality, 
                      'probability': prob_value})

  return results

In [None]:
new_surname = input("Enter a surname to classify: ")
classifier = classifier.to("cpu")

k = int(input("How many of the top predictions to see? "))
if k > len(vectorizer.nationality_vocab):
  print("Sorry! That's more than the # of nationalities we have.. defaulting you to max size :)")
  k = len(vectorizer.nationality_vocab)
    
predictions = predict_topk_nationality(new_surname, classifier, vectorizer, k=k)

print("Top {} predictions:".format(k))
print("===================")
for prediction in predictions:
  print(f"{new_surname} -> {prediction['nationality']} p={prediction['probability']:0.2f}")