<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-&amp;-Inits" data-toc-modified-id="Imports-&amp;-Inits-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports &amp; Inits</a></span></li><li><span><a href="#Data-Preparation" data-toc-modified-id="Data-Preparation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Preparation</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Testing" data-toc-modified-id="Testing-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Testing</a></span><ul class="toc-item"><li><span><a href="#Ignite-Testing" data-toc-modified-id="Ignite-Testing-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Ignite Testing</a></span></li><li><span><a href="#NLPBook-Testing" data-toc-modified-id="NLPBook-Testing-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>NLPBook Testing</a></span></li></ul></li><li><span><a href="#Inference" data-toc-modified-id="Inference-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Inference</a></span><ul class="toc-item"><li><span><a href="#Single-Inference" data-toc-modified-id="Single-Inference-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Single Inference</a></span></li><li><span><a href="#TopK-Inference" data-toc-modified-id="TopK-Inference-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>TopK Inference</a></span></li></ul></li></ul></div>

# Surname Classifier with MLP

Classifying surnames based on national origin.

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import torch

from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from pathlib import Path

In [3]:
from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [4]:
from surname.dataset import MLPDataset as ProjectDataset
from surname.model import MLPClassifier as Classifier
from surname.trainer import Trainer 
from surname.args import mlp_args as args

In [5]:
path = Path('../data/surnames')

## Data Preparation

In [6]:
work_dir = path/'work_dir'
surnames_csv = path/args.proc_dataset_csv
vectorizer_path = work_dir/args.vectorizer_fname
cw_file = work_dir/args.cw_file

args

Namespace(batch_size=64, checkpointer_name='classifier', checkpointer_prefix='surname', cw_file='class_weights.pt', device='cuda:3', early_stopping_criteria=5, hidden_dim=300, learning_rate=0.001, model_dir='models', num_epochs=100, proc_dataset_csv='surnames_with_splits.csv', raw_dataset_csv='surnames.csv', save_every=2, save_total=5, train_proportion=0.7, vectorizer_fname='vectorizer.json')

In [7]:
df = pd.read_csv(surnames_csv)
len(df)

10980

In [8]:
is_load = True

In [9]:
if not is_load:
  train_ds = ProjectDataset.load_data_and_create_vectorizer(df.loc[df['split'] == 'train'])
  train_ds.save_vectorizer(vectorizer_path)
  vectorizer = train_ds.get_vectorizer()
  class_counts = df['nationality'].value_counts().to_dict()
  sorted_counts = sorted(class_counts.items(), key=lambda x: vectorizer.nationality_vocab.lookup_token(x[0]))
  freq = [count for _, count in sorted_counts]
  class_weights = 1.0/torch.tensor(freq, dtype=torch.float32)
  torch.save(class_weights, cw_file)

In [10]:
train_df = df.loc[df['split'] == 'train']
train_ds = ProjectDataset.load_data_and_vectorizer(train_df, vectorizer_path)
vectorizer = train_ds.get_vectorizer()
class_weights = torch.load(cw_file)
train_dl = DataLoader(train_ds, args.batch_size, shuffle=True, drop_last=True)

In [11]:
val_df = df.loc[df['split'] == 'val']
val_ds = ProjectDataset.load_data_and_vectorizer(val_df, vectorizer_path)
val_dl = DataLoader(val_ds, args.batch_size, shuffle=True, drop_last=True)

In [12]:
test_df = df.loc[df['split'] == 'test']
test_ds = ProjectDataset.load_data_and_vectorizer(test_df, vectorizer_path)
test_dl = DataLoader(test_ds, args.batch_size, shuffle=True, drop_last=True)

In [13]:
len(train_dl.dataset), len(val_dl.dataset), len(test_dl.dataset)

(7680, 1640, 1660)

## Model

In [14]:
classifier = Classifier(input_dim=len(vectorizer.surname_vocab), hidden_dim=args.hidden_dim, output_dim=len(vectorizer.nationality_vocab))
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
class_weights = class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(class_weights)

pbar = ProgressBar(persist=True)
metrics = {'accuracy': Accuracy(), 'loss': Loss(loss_func)}

## Training

In [15]:
surname_trainer = Trainer(classifier, optimizer, loss_func, train_dl, val_dl, work_dir, args, pbar, metrics)
surname_trainer.run()

Epoch [1/100]: [120/120] 100%|██████████, loss=2.72e+00 [00:01<00:00]
Epoch [2/100]: [16/120]  13%|█▎        , loss=2.46e+00 [00:00<00:01]

Epoch: 1
Training - Loss: 2.539, Accuracy: 0.391
Validation - Loss: 2.571, Accuracy: 0.383


Epoch [2/100]: [120/120] 100%|██████████, loss=2.33e+00 [00:01<00:00]
Epoch [3/100]: [15/120]  12%|█▎        , loss=2.20e+00 [00:00<00:01]

Epoch: 2
Training - Loss: 2.135, Accuracy: 0.427
Validation - Loss: 2.230, Accuracy: 0.415


Epoch [3/100]: [120/120] 100%|██████████, loss=2.07e+00 [00:01<00:00]
Epoch [4/100]: [16/120]  13%|█▎        , loss=1.84e+00 [00:00<00:01]

Epoch: 3
Training - Loss: 1.906, Accuracy: 0.472
Validation - Loss: 2.060, Accuracy: 0.456


Epoch [4/100]: [120/120] 100%|██████████, loss=1.90e+00 [00:01<00:00]
Epoch [5/100]: [15/120]  12%|█▎        , loss=1.74e+00 [00:00<00:01]

Epoch: 4
Training - Loss: 1.768, Accuracy: 0.430
Validation - Loss: 1.961, Accuracy: 0.408


Epoch [5/100]: [120/120] 100%|██████████, loss=1.79e+00 [00:01<00:00]
Epoch [6/100]: [16/120]  13%|█▎        , loss=1.72e+00 [00:00<00:01]

Epoch: 5
Training - Loss: 1.679, Accuracy: 0.463
Validation - Loss: 1.909, Accuracy: 0.421


Epoch [6/100]: [120/120] 100%|██████████, loss=1.75e+00 [00:01<00:00]
Epoch [7/100]: [14/120]  12%|█▏        , loss=1.60e+00 [00:00<00:01]

Epoch: 6
Training - Loss: 1.610, Accuracy: 0.466
Validation - Loss: 1.870, Accuracy: 0.429


Epoch [7/100]: [120/120] 100%|██████████, loss=1.70e+00 [00:01<00:00]
Epoch [8/100]: [16/120]  13%|█▎        , loss=1.57e+00 [00:00<00:01]

Epoch: 7
Training - Loss: 1.556, Accuracy: 0.490
Validation - Loss: 1.845, Accuracy: 0.451


Epoch [8/100]: [120/120] 100%|██████████, loss=1.65e+00 [00:01<00:00]
Epoch [9/100]: [15/120]  12%|█▎        , loss=1.37e+00 [00:00<00:01]

Epoch: 8
Training - Loss: 1.514, Accuracy: 0.486
Validation - Loss: 1.816, Accuracy: 0.440


Epoch [9/100]: [120/120] 100%|██████████, loss=1.57e+00 [00:01<00:00]
Epoch [10/100]: [16/120]  13%|█▎        , loss=1.58e+00 [00:00<00:01]

Epoch: 9
Training - Loss: 1.469, Accuracy: 0.486
Validation - Loss: 1.780, Accuracy: 0.434


Epoch [10/100]: [120/120] 100%|██████████, loss=1.60e+00 [00:01<00:00]
Epoch [11/100]: [14/120]  12%|█▏        , loss=1.46e+00 [00:00<00:01]

Epoch: 10
Training - Loss: 1.443, Accuracy: 0.489
Validation - Loss: 1.783, Accuracy: 0.434


Epoch [11/100]: [120/120] 100%|██████████, loss=1.53e+00 [00:01<00:00]
Epoch [12/100]: [16/120]  13%|█▎        , loss=1.46e+00 [00:00<00:01]

Epoch: 11
Training - Loss: 1.416, Accuracy: 0.510
Validation - Loss: 1.783, Accuracy: 0.453


Epoch [12/100]: [120/120] 100%|██████████, loss=1.54e+00 [00:01<00:00]
Epoch [13/100]: [15/120]  12%|█▎        , loss=1.50e+00 [00:00<00:01]

Epoch: 12
Training - Loss: 1.382, Accuracy: 0.503
Validation - Loss: 1.770, Accuracy: 0.449


Epoch [13/100]: [120/120] 100%|██████████, loss=1.48e+00 [00:01<00:00]
Epoch [14/100]: [16/120]  13%|█▎        , loss=1.48e+00 [00:00<00:01]

Epoch: 13
Training - Loss: 1.358, Accuracy: 0.516
Validation - Loss: 1.763, Accuracy: 0.458


Epoch [14/100]: [120/120] 100%|██████████, loss=1.48e+00 [00:01<00:00]
Epoch [15/100]: [15/120]  12%|█▎        , loss=1.67e+00 [00:00<00:01]

Epoch: 14
Training - Loss: 1.341, Accuracy: 0.522
Validation - Loss: 1.756, Accuracy: 0.464


Epoch [15/100]: [120/120] 100%|██████████, loss=1.49e+00 [00:01<00:00]
Epoch [16/100]: [16/120]  13%|█▎        , loss=1.72e+00 [00:00<00:01]

Epoch: 15
Training - Loss: 1.313, Accuracy: 0.526
Validation - Loss: 1.757, Accuracy: 0.467


Epoch [16/100]: [120/120] 100%|██████████, loss=1.47e+00 [00:01<00:00]
Epoch [17/100]: [15/120]  12%|█▎        , loss=1.46e+00 [00:00<00:01]

Epoch: 16
Training - Loss: 1.301, Accuracy: 0.521
Validation - Loss: 1.711, Accuracy: 0.453


Epoch [17/100]: [120/120] 100%|██████████, loss=1.43e+00 [00:01<00:00]
Epoch [18/100]: [16/120]  13%|█▎        , loss=1.40e+00 [00:00<00:01]

Epoch: 17
Training - Loss: 1.272, Accuracy: 0.544
Validation - Loss: 1.713, Accuracy: 0.477


Epoch [18/100]: [120/120] 100%|██████████, loss=1.39e+00 [00:01<00:00]
Epoch [19/100]: [15/120]  12%|█▎        , loss=1.36e+00 [00:00<00:01]

Epoch: 18
Training - Loss: 1.260, Accuracy: 0.540
Validation - Loss: 1.731, Accuracy: 0.480


Epoch [19/100]: [120/120] 100%|██████████, loss=1.38e+00 [00:01<00:00]
Epoch [20/100]: [16/120]  13%|█▎        , loss=1.31e+00 [00:00<00:01]

Epoch: 19
Training - Loss: 1.242, Accuracy: 0.540
Validation - Loss: 1.713, Accuracy: 0.475


Epoch [20/100]: [120/120] 100%|██████████, loss=1.34e+00 [00:01<00:00]
Epoch [21/100]: [15/120]  12%|█▎        , loss=1.41e+00 [00:00<00:01]

Epoch: 20
Training - Loss: 1.216, Accuracy: 0.548
Validation - Loss: 1.724, Accuracy: 0.485


Epoch [21/100]: [120/120] 100%|██████████, loss=1.34e+00 [00:01<00:00]


Epoch: 21
Training - Loss: 1.204, Accuracy: 0.556
Validation - Loss: 1.715, Accuracy: 0.482


## Testing

### Ignite Testing

In [16]:
args.device = 'cpu'
classifier = Classifier(input_dim=len(vectorizer.surname_vocab), hidden_dim=args.hidden_dim,\
                        output_dim=len(vectorizer.nationality_vocab))
state_dict = torch.load(work_dir/'surname_mlp_classifier.pth')
classifier.load_state_dict(state_dict)

class_weights = class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(class_weights)
metrics = {'accuracy': Accuracy(), 'loss': Loss(loss_func)}

In [17]:
evaluator = create_supervised_evaluator(classifier, metrics=metrics)

@evaluator.on(Events.COMPLETED)
def log_testing_results(engine):
  metrics = engine.state.metrics
  print(f"Test loss: {metrics['loss']:0.3f}")
  print(f"Test accuracy: {metrics['accuracy']:0.3f}")

In [18]:
evaluator.run(test_dl)

Test loss: 1.724
Test accuracy: 0.489


<ignite.engine.engine.State at 0x7f49ded81ac8>

### NLPBook Testing

In [19]:
def compute_accuracy(y_pred, y_target):
  _, y_pred_indices = y_pred.max(dim=1)
  n_correct = torch.eq(y_pred_indices, y_target).sum().item()
  return n_correct / len(y_pred_indices) * 100

In [20]:
running_loss = 0.
running_acc = 0.

classifier.eval()
for i, batch in enumerate(test_dl):
  x,y = batch
  y_pred = classifier(x_in=x.float())
  
  loss = loss_func(y_pred, y)
  loss_t = loss.item()
  running_loss += (loss_t-running_loss)/(i+1)
  
  acc_t = compute_accuracy(y_pred, y)
  running_acc += (acc_t-running_acc)/(i+1)

In [21]:
print(f"Test loss: {running_loss:0.3f}")
print(f"Test acc: {running_acc:0.3f}")

Test loss: 1.686
Test acc: 49.750


## Inference

### Single Inference

In [22]:
def predict_natinoality(surname, classifier, vectorizer):
  """
    Predict the nationality from a new surname
    
    Args:
      surname: the surname to classify
      classifier: an instance of the classifier
      vectorizer: the corresponding vectorizer
      
    Returns:
      a dictionary with most likely natinoality and its probability
  """
  vectorized_surname = vectorizer.vectorize(surname)
  vectorized_surname = torch.tensor(vectorized_surname).view(1,-1)
  result = classifier(vectorized_surname, apply_softmax=True)
  
  probability_values, indices = result.max(dim=1)
  idx = indices.item()
  
  predicted_nationality = vectorizer.nationality_vocab.lookup_idx(idx)
  probability_value = probability_values.item()
  
  return {'nationality': predicted_nationality, 'probability': probability_value}

In [23]:
new_surname = input("Enter a surname to classify: ")
prediction = predict_natinoality(new_surname, classifier, vectorizer)
print(f"{new_surname} -> {prediction['nationality']} p={prediction['probability']:0.2f}")

Enter a surname to classify: Jacobson
Jacobson -> Scottish p=0.22


  y_out = self.softmax(y_out)


### TopK Inference

In [24]:
def predict_topk_nationality(name, classifier, vectorizer, k=5):
  vectorized_name = vectorizer.vectorize(name)
  vectorized_name = torch.tensor(vectorized_name).view(1, -1)
  prediction_vector = classifier(vectorized_name, apply_softmax=True)
  probability_values, indices = torch.topk(prediction_vector, k=k)

  # returned size is 1,k
  probability_values = probability_values.detach().numpy()[0]
  indices = indices.detach().numpy()[0]

  results = []
  for prob_value, idx in zip(probability_values, indices):
      nationality = vectorizer.nationality_vocab.lookup_idx(idx)
      results.append({'nationality': nationality, 
                      'probability': prob_value})

  return results

In [25]:
new_surname = input("Enter a surname to classify: ")
classifier = classifier.to("cpu")

k = int(input("How many of the top predictions to see? "))
if k > len(vectorizer.nationality_vocab):
  print("Sorry! That's more than the # of nationalities we have.. defaulting you to max size :)")
  k = len(vectorizer.nationality_vocab)
    
predictions = predict_topk_nationality(new_surname, classifier, vectorizer, k=k)

print("Top {} predictions:".format(k))
print("===================")
for prediction in predictions:
  print(f"{new_surname} -> {prediction['nationality']} p={prediction['probability']:0.2f}")

Enter a surname to classify: Jacobson
How many of the top predictions to see? 5
Top 5 predictions:
Jacobson -> Scottish p=0.22
Jacobson -> English p=0.17
Jacobson -> Czech p=0.12
Jacobson -> Italian p=0.09
Jacobson -> Spanish p=0.09
