<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-&amp;-Inits" data-toc-modified-id="Imports-&amp;-Inits-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports &amp; Inits</a></span></li><li><span><a href="#Data-Loading" data-toc-modified-id="Data-Loading-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Loading</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model</a></span><ul class="toc-item"><li><span><a href="#Column-Gather-Understanding" data-toc-modified-id="Column-Gather-Understanding-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Column Gather Understanding</a></span></li></ul></li></ul></div>

# Surname Classifier Using ElmanRNN

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

import pdb
import pandas as pd
import numpy as np
import torch
import re

from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from pathlib import Path

from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [3]:
from surname.dataset import SurnameDataset
from surname.containers import DataContainer
from surname.elman import ElmanRNN
from consts import consts
vars(consts)

{'path': PosixPath('../data/surnames'),
 'workdir': PosixPath('../data/surnames/rnn_workdir'),
 'proc_dataset_csv': PosixPath('../data/surnames/surnames_with_splits.csv'),
 'model_dir': PosixPath('../data/surnames/rnn_workdir/models'),
 'vectorizer_json': PosixPath('../data/surnames/rnn_workdir/vectorizer.json'),
 'metrics_file': PosixPath('../data/surnames/rnn_workdir/metrics.csv'),
 'class_weights_pth': PosixPath('../data/surnames/rnn_workdir/class_weights.pth'),
 'char_embedding_sz': 100,
 'rnn_hidden_sz': 64,
 'bs': 64,
 'lr': 0.001,
 'n_epochs': 97,
 'device': 'cuda:3',
 'checkpointer_prefix': 'surname_elman',
 'checkpointer_name': 'classifier',
 'es_patienct': 11,
 'save_every': 2,
 'save_total': 5}

## Data Loading

In [4]:
# Read the preprocessed surname dataset from the CSV path configured in `consts`.
df = pd.read_csv(consts.proc_dataset_csv)
# Print the (rows, columns) shape, then show the first rows as the cell's rich output.
print(df.shape)
df.head()

(10980, 4)


Unnamed: 0,nationality,nationality_index,split,surname
0,Arabic,15,train,Totah
1,Arabic,15,train,Abboud
2,Arabic,15,train,Fakhoury
3,Arabic,15,train,Srour
4,Arabic,15,train,Sayegh


In [5]:
# Wrap the dataframe in the project's DataContainer, passing the dataset class,
# the vectorizer JSON path and the batch size.
# NOTE(review): is_load=True presumably loads an existing vectorizer from
# `consts.vectorizer_json` instead of building a new one — confirm in surname.containers.
dc = DataContainer(df, SurnameDataset, consts.vectorizer_json, consts.bs, is_load=True)

In [6]:
# Use cached class weights when the file exists; otherwise compute and cache them.
try:
  # NOTE(review): torch.load unpickles the file — only point this at trusted paths.
  class_weights = torch.load(consts.class_weights_pth)
except FileNotFoundError:
  nationality_vocab = dc.nationality_vocab
  # Count rows per nationality over the whole dataframe (no filtering on `split`).
  class_counts = df['nationality'].value_counts().to_dict()
  # Order the (nationality, count) pairs by the value `lookup_token` returns for the
  # nationality — presumably its class index, so position i lines up with class i (TODO confirm).
  sorted_counts = sorted(class_counts.items(), key=lambda x: nationality_vocab.lookup_token(x[0]))
  freq = [count for _, count in sorted_counts]
  # Inverse-frequency weighting: rarer nationalities get larger weights.
  class_weights = 1.0/torch.tensor(freq, dtype=torch.float32)
  # Persist so later runs take the `try` branch.
  torch.save(class_weights, consts.class_weights_pth)

## Model

In [7]:
def column_gather(y_out: torch.Tensor, x_lens: torch.Tensor) -> torch.Tensor:
  """
    Get a specific vector from each batch datapoint in 'y_out'.
    For every batch row, pick the vector that sits at the position given by the
    corresponding value in 'x_lens' minus one (i.e. the last valid time step).

    The gather is done with a single advanced-indexing operation on the same
    device as 'y_out', so no CPU/NumPy round trip or Python loop is needed and
    an empty batch yields an empty (0, feat_sz) tensor.

    Args:
      y_out: shape (bs, seq_sz, feat_sz)
      x_lens: shape (bs,), sequence lengths (integer or float; cast to long)

    Returns:
      y_out: shape (bs, feat_sz)
  """
  # lengths -> zero-based index of the last valid step; a length of 0 maps to -1,
  # which (as with plain indexing) selects the final step of the row
  last_step = x_lens.long().to(y_out.device) - 1
  # one row index per entry of x_lens, matching the original row-by-row iteration
  rows = torch.arange(last_step.size(0), device=y_out.device)

  return y_out[rows, last_step]

In [8]:
class SurnameClassifier(nn.Module):
  """
    Surname classifier: character embeddings -> Elman RNN -> MLP head
  """
  def __init__(self, emb_sz: int, n_embs: int, n_classes: int, rnn_hidden_sz:int ,
               batch_first: bool=True, padding_idx: int=0) -> None:
    """
      Args:
        emb_sz: dimensionality of each character embedding
        n_embs: number of distinct characters to embed (vocabulary size)
        n_classes: number of output classes (size of the prediction vector)
        rnn_hidden_sz: dimensionality of the RNN hidden state
        batch_first: whether input tensors carry batch (True) or sequence (False) on dim 0
        padding_idx: embedding index reserved for padding
    """
    super().__init__()
    self.emb = nn.Embedding(num_embeddings=n_embs, embedding_dim=emb_sz, padding_idx=padding_idx)
    self.rnn = ElmanRNN(inp_sz=emb_sz, hidden_sz=rnn_hidden_sz, batch_first=batch_first)
    # one Dropout module, used both before the MLP and inside it
    self.dropout = nn.Dropout(p=0.5)
    hidden_layer = nn.Linear(in_features=rnn_hidden_sz, out_features=rnn_hidden_sz)
    output_layer = nn.Linear(in_features=rnn_hidden_sz, out_features=n_classes)
    self.mlp = nn.Sequential(hidden_layer, nn.ReLU(), self.dropout, output_layer)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x_in: torch.Tensor, x_lens: torch.Tensor=None, apply_softmax: bool=False) -> torch.Tensor:
    """
      Run the classifier on a batch

      Args:
        x_in: input tensor fed to the embedding layer (indices into the character vocabulary)
        x_lens: optional per-sequence lengths; when given, the hidden vector at each
          sequence's last valid step is used instead of the last step of the padded batch
        apply_softmax: return probabilities instead of raw scores; keep False when the
          output goes into nn.CrossEntropyLoss

      Returns:
        tensor with one row of class scores (or probabilities) per batch item
    """
    hidden_seq = self.rnn(self.emb(x_in))

    if x_lens is None:
      # no lengths supplied: take the final time step of every sequence,
      # (bs, seq_sz, feat_sz) -> (bs, feat_sz)
      final_hidden = hidden_seq[:, -1, :]
    else:
      final_hidden = column_gather(hidden_seq, x_lens)

    scores = self.mlp(self.dropout(final_hidden))

    return self.softmax(scores) if apply_softmax else scores

In [10]:
# Build the classifier from the configured sizes; the embedding's padding index is
# the surname vocabulary's mask index.
classifier = SurnameClassifier(consts.char_embedding_sz, dc.vocab_size, dc.n_classes, consts.rnn_hidden_sz,
                               padding_idx=dc.surname_vocab.mask_idx)
# Cross entropy weighted per class with the inverse-frequency weights computed above.
loss_fn = nn.CrossEntropyLoss(class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=consts.lr)
# Halve the learning rate when the monitored (validation) loss stops decreasing.
# NOTE(review): factor/patience are starting values, not tuned — adjust as needed.
# The training loop must call `scheduler.step(val_loss)` once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)
classifier

SurnameClassifier(
  (emb): Embedding(80, 100, padding_idx=0)
  (rnn): ElmanRNN(
    (rnn_cell): RNNCell(100, 64)
  )
  (dropout): Dropout(p=0.5)
  (mlp): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=64, out_features=18, bias=True)
  )
  (softmax): Softmax()
)

In [11]:
# Iterator over the container's `train_dl` so single batches can be pulled by hand.
itr = iter(dc.train_dl)

In [13]:
# Pull one batch and run a forward pass as a quick smoke test of the model.
# NOTE(review): the batch is assumed to unpack as (inputs, sequence lengths, targets);
# `l` is passed as the classifier's `x_lens` argument — confirm against SurnameDataset.
x,l,y = next(itr)
y_pred = classifier(x,l)

### Column Gather Understanding

In [None]:
# Toy dimensions for the walkthrough: batch size, feature (hidden) size, sequence length.
bs=3
hidden_sz=7
seq_sz =5 

In [None]:
# Random sequence lengths in [1, seq_sz], one per batch row.
x_lens = torch.randint(1, seq_sz+1, (bs,))
# Turn lengths into zero-based "last step" indices, held as a NumPy array.
x_lens = x_lens.long().detach().cpu().numpy()-1
# Random stand-in for an RNN output of shape (bs, seq_sz, hidden_sz).
y_out = torch.randn(bs, seq_sz, hidden_sz)

In [None]:
# Inspect the index array: shape (bs,), values are zero-based time-step positions.
print(x_lens.shape)
x_lens

In [None]:
# Inspect the fake RNN output: shape (bs, seq_sz, hidden_sz).
print(y_out.shape)
y_out

In [None]:
out = []

# For each batch row, take the feature vector at that row's last-step index.
for batch_idx, column_idx in enumerate(x_lens):
  out.append(y_out[batch_idx, column_idx])

In [None]:
# Stack the per-row vectors into a single tensor of shape (bs, hidden_sz).
# NOTE(review): this rebinds `y`, which an earlier cell used for the batch targets.
y = torch.stack(out)

In [None]:
# Confirm the gathered result has one feature vector per batch row.
print(y.shape)
y

In [None]:
class SurnameClassifier(nn.Module):
  """ A Classifier with an RNN to extract features and an MLP to classify

  NOTE(review): this redefines the `SurnameClassifier` name declared in an earlier
  cell; whichever cell runs last wins in the kernel namespace.
  """
  def __init__(self, embedding_size, num_embeddings, num_classes,
               rnn_hidden_size, batch_first=True, padding_idx=0):
    """
    Args:
        embedding_size (int): The size of the character embeddings
        num_embeddings (int): The number of characters to embed
        num_classes (int): The size of the prediction vector
            Note: the number of nationalities
        rnn_hidden_size (int): The size of the RNN's hidden state
        batch_first (bool): Informs whether the input tensors will
            have batch or the sequence on the 0th dimension
        padding_idx (int): The index for the tensor padding;
            see torch.nn.Embedding
    """
    super(SurnameClassifier, self).__init__()

    self.emb = nn.Embedding(num_embeddings=num_embeddings,
                            embedding_dim=embedding_size,
                            padding_idx=padding_idx)
    self.rnn = ElmanRNN(embedding_size, rnn_hidden_size, batch_first)
    self.fc1 = nn.Linear(in_features=rnn_hidden_size,
                         out_features=rnn_hidden_size)
    self.fc2 = nn.Linear(in_features=rnn_hidden_size,
                         out_features=num_classes)

  def forward(self, x_in, x_lengths=None, apply_softmax=False):
    """The forward pass of the classifier

    Args:
        x_in (torch.Tensor): an input data tensor fed to the embedding layer
            (indices into the character vocabulary)
        x_lengths (torch.Tensor): the lengths of each sequence in the batch.
            They are used to find the final vector of each sequence
        apply_softmax (bool): a flag for the softmax activation
            should be false if used with the Cross Entropy losses
    Returns:
        the resulting tensor. tensor.shape should be (batch, output_dim)
    """
    x_embedded = self.emb(x_in)
    y_out = self.rnn(x_embedded)

    if x_lengths is not None:
      y_out = column_gather(y_out, x_lengths)
    else:
      # no lengths given: use the last time step of every sequence
      y_out = y_out[:, -1, :]

    # F.dropout defaults to training=True; tie it to the module's mode so that
    # dropout is switched off by model.eval() and inference is deterministic.
    y_out = F.relu(self.fc1(F.dropout(y_out, 0.5, training=self.training)))
    y_out = self.fc2(F.dropout(y_out, 0.5, training=self.training))

    if apply_softmax:
      y_out = F.softmax(y_out, dim=1)

    return y_out