<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-&amp;-Inits" data-toc-modified-id="Imports-&amp;-Inits-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports &amp; Inits</a></span></li><li><span><a href="#Data-Loading" data-toc-modified-id="Data-Loading-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Loading</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Playground" data-toc-modified-id="Playground-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Playground</a></span></li></ul></div>

# Surname Classifier Using ElmanRNN

## Imports & Inits

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

import pdb
import pandas as pd
import numpy as np
import torch
import re

from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from pathlib import Path

from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [3]:
from surname.dataset import SurnameDataset
from surname.containers import DataContainer, ModelContainer
from surname.elman import ElmanRNN
from surname.model import SurnameClassifier
from surname.trainer import IgniteTrainer
from consts import consts
# Display every attribute set on `consts` so the run configuration is visible in the notebook.
vars(consts)

{'path': PosixPath('../data/surnames'),
 'workdir': PosixPath('../data/surnames/rnn_workdir'),
 'proc_dataset_csv': PosixPath('../data/surnames/surnames_with_splits.csv'),
 'model_dir': PosixPath('../data/surnames/rnn_workdir/models'),
 'vectorizer_json': PosixPath('../data/surnames/rnn_workdir/elman_vectorizer.json'),
 'metrics_file': PosixPath('../data/surnames/rnn_workdir/elman_metrics.csv'),
 'class_weights_pth': PosixPath('../data/surnames/rnn_workdir/class_weights.pth'),
 'char_embedding_sz': 100,
 'rnn_hidden_sz': 64,
 'bs': 64,
 'lr': 0.001,
 'n_epochs': 97,
 'device': 'cuda:3',
 'checkpointer_prefix': 'elman',
 'checkpointer_name': 'classifier',
 'es_patience': 11,
 'save_every': 2,
 'save_total': 5}

## Data Loading

In [4]:
# Read the surnames table from the path given in the config; the preview
# below shows it carries a `split` column alongside surname and nationality.
df = pd.read_csv(consts.proc_dataset_csv)

# Report (rows, columns), then preview the first records.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

(10980, 4)


Unnamed: 0,nationality,nationality_index,split,surname
0,Arabic,15,train,Totah
1,Arabic,15,train,Abboud
2,Arabic,15,train,Fakhoury
3,Arabic,15,train,Srour
4,Arabic,15,train,Sayegh


In [5]:
# Bundle the dataframe, dataset class, vectorizer path and batch size.
# Later cells read `dc.train_dl`, `dc.vocab_size`, `dc.n_classes` and the
# vocabularies from this object.
# NOTE(review): `is_load=True` presumably loads the vectorizer from
# `consts.vectorizer_json` instead of rebuilding it — confirm in DataContainer.
dc = DataContainer(
    df,
    SurnameDataset,
    consts.vectorizer_json,
    consts.bs,
    is_load=True,
)

In [6]:
try:
    # Reuse the weights saved by an earlier run when the file is present.
    class_weights = torch.load(consts.class_weights_pth)
except FileNotFoundError:
    # No saved weights yet: derive inverse-frequency weights from the
    # nationality column and store them for the next run.
    nationality_vocab = dc.nationality_vocab
    class_counts = df['nationality'].value_counts().to_dict()

    def _vocab_order(item):
        """Sort key: the value `nationality_vocab.lookup_token` returns for the label."""
        nationality, _ = item
        return nationality_vocab.lookup_token(nationality)

    # Order the (nationality, count) pairs by that key so the weight
    # positions follow the vocabulary's ordering of the labels.
    sorted_counts = sorted(class_counts.items(), key=_vocab_order)
    freq = [pair[1] for pair in sorted_counts]

    class_weights = 1.0 / torch.tensor(freq, dtype=torch.float32)
    torch.save(class_weights, consts.class_weights_pth)

## Model

In [7]:
# Build the classifier from the configured sizes; the surname vocabulary's
# mask index is passed as the padding index.
classifier = SurnameClassifier(
    consts.char_embedding_sz,
    dc.vocab_size,
    dc.n_classes,
    consts.rnn_hidden_sz,
    padding_idx=dc.surname_vocab.mask_idx,
)

# Move the per-class weights to the configured device before handing them to the loss.
class_weights = class_weights.to(consts.device)
# NOTE(review): the model repr below lists a Softmax module, while
# nn.CrossEntropyLoss expects unnormalised logits — confirm the forward pass
# does not apply softmax to the values fed to this loss.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

optimizer = optim.Adam(classifier.parameters(), lr=consts.lr)
# Multiply the learning rate by 0.5 when the monitored value stops decreasing (patience of 1).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

mc = ModelContainer(classifier, optimizer, loss_fn, scheduler)
mc.model

SurnameClassifier(
  (emb): Embedding(80, 100, padding_idx=0)
  (rnn): ElmanRNN(
    (rnn_cell): RNNCell(100, 64)
  )
  (dropout): Dropout(p=0.5)
  (mlp): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=64, out_features=18, bias=True)
  )
  (softmax): Softmax()
)

In [None]:
# Iterator over `dc.train_dl`; the next cell pulls a single batch from it by hand.
itr = iter(dc.train_dl)

In [None]:
# Smoke test: push one training batch through the model and the loss.
inp, y = next(itr)
# `x` and `l` have to be defined in this cell; otherwise it raises NameError
# on a fresh kernel.
# NOTE(review): assumes `inp` unpacks into (x, lengths) — confirm against the
# batch layout produced by SurnameDataset / DataContainer.
x, l = inp
y_pred = mc.model(x, l)
loss_fn(y_pred, y)

## Training

In [11]:
# Progress bar created with persist=True.
pbar = ProgressBar(persist=True)

# Metrics keyed by name; the loss metric is built from the same `loss_fn`
# that is passed to the ModelContainer.
metrics = dict(accuracy=Accuracy(), loss=Loss(loss_fn))

In [12]:
# Override the configured epoch count (97 in the config shown above) with 2 for a short run.
# This mutates the shared `consts` object, so every later cell sees n_epochs == 2.
consts.n_epochs=2
# Assemble the trainer from the model container, data container, config,
# progress bar and metrics.
ig = IgniteTrainer(mc, dc, consts, pbar, metrics)

In [13]:
# Start training; per-epoch training/validation loss and accuracy are printed below.
ig.run()

Epoch [1/2]: [120/120] 100%|██████████, loss=2.52e+00 [00:02<00:00]
Epoch [2/2]: [9/120]   8%|▊         , loss=2.15e+00 [00:00<00:01]

Epoch: 1
Training - Loss: 2.316, Accuracy: 0.407
Validation - Loss: 2.383, Accuracy: 0.398
Time per batch 1.550[s]


Epoch [2/2]: [120/120] 100%|██████████, loss=2.29e+00 [00:02<00:00]


Epoch: 2
Training - Loss: 2.037, Accuracy: 0.425
Validation - Loss: 2.130, Accuracy: 0.422
Time per batch 1.548[s]
Training Done. Total training time: 0:00:07.820105


## Playground

In [None]:
# Toy dimensions for the indexing experiment below:
# batch size, hidden size, sequence length.
bs, hidden_sz, seq_sz = 3, 7, 5

In [None]:
# Draw a random length in [1, seq_sz] for each sequence, then subtract 1 to
# turn each length into the zero-based index of its last step (NumPy array).
lengths = torch.randint(1, seq_sz+1, (bs,))
x_lens = lengths.long().detach().cpu().numpy() - 1

# Random stand-in tensor of shape (bs, seq_sz, hidden_sz).
y_out = torch.randn(bs, seq_sz, hidden_sz)

In [None]:
# Inspect the shape and values of the sampled last-step indices.
print(x_lens.shape)
x_lens

In [None]:
# Inspect the shape and values of the random stand-in tensor.
print(y_out.shape)
y_out

In [None]:
# For each sequence in the batch, pick the vector at that sequence's
# last-step index (x_lens[i]) out of y_out[i].
out = [y_out[batch_idx, column_idx] for batch_idx, column_idx in enumerate(x_lens)]

In [None]:
# Stack the per-sequence vectors into a single (bs, hidden_sz) tensor.
y = torch.stack(out)

In [None]:
# Inspect the gathered result: one row per sequence.
print(y.shape)
y

In [None]:
# Same toy dimensions as at the start of the Playground section, set again
# so the cells below can be run on their own.
bs, hidden_sz, seq_sz = 3, 7, 5

In [None]:
# Stand-alone ElmanRNN built with the same embedding and hidden sizes the
# classifier uses; `ElmanRNN` is imported with the other project imports at
# the top of the notebook.
e = ElmanRNN(consts.char_embedding_sz, consts.rnn_hidden_sz, batch_first=True)

In [None]:
# Random input whose last dimension is tied to the size `e` was built with,
# so this cell keeps working if `consts.char_embedding_sz` changes.
# With batch_first=True this is presumably (batch=2, steps=10, features) — confirm in ElmanRNN.
inp = torch.randn(2, 10, consts.char_embedding_sz)
e(inp)

In [None]:
# Move the input and the RNN to the device named in the config rather than
# a hard-coded GPU index, so the cell follows `consts.device`.
inp = inp.to(consts.device)
e = e.to(consts.device)

In [None]:
# Move the input and the RNN back to the CPU.
inp = inp.cpu()
e = e.cpu()

In [None]:
# Run the RNN again after the previous cell moved both it and its input to the CPU.
e(inp)

In [None]:
# Allocate a small zero tensor directly on the device named in the config
# instead of a hard-coded GPU index.
x = torch.zeros(5, device=consts.device)

In [None]:
# Display the tensor created in the previous cell.
x