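This notebook is adapted from the PyTorch tutorial [NLP From Scratch: Classifying Names with a Character-Level RNN](https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html); it classifies names by category using a character-level GRU.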

In [None]:
import os
import glob
import unicodedata
import string

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip

--2023-03-12 09:24:39--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 18.160.41.63, 18.160.41.124, 18.160.41.107, ...
Connecting to download.pytorch.org (download.pytorch.org)|18.160.41.63|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2023-03-12 09:24:39 (136 MB/s) - ‘data.zip’ saved [2882130/2882130]

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.t

In [None]:
# Glob pattern matching the text files under data/names/ unpacked from data.zip.
DATASET_PATH = 'data/names/*.txt'

In [None]:
def find_files(path):
  """Return the paths matching the glob pattern `path`, in sorted order.

  glob.glob yields matches in an arbitrary, filesystem-dependent order.
  Sorting makes the result stable across runs and machines, so anything
  derived from the position of a file in this list is reproducible.

  Args:
    path: a glob pattern, e.g. 'data/names/*.txt'.

  Returns:
    A sorted list of matching path strings (empty when nothing matches).
  """
  return sorted(glob.glob(path))

# Paths of all files matching DATASET_PATH; each one is later read as one category.
file_names = find_files(DATASET_PATH)

In [None]:
# Character vocabulary: ASCII letters (lowercase first, then uppercase)
# followed by the few punctuation characters that are kept.
all_letters = ''.join((string.ascii_lowercase, string.ascii_uppercase, " .,;'"))
# Vocabulary size, i.e. the width of a one-hot letter vector.
n_letters = len(all_letters)

In [None]:
def unicode_to_ascii(s):
  """Reduce `s` to the characters of `all_letters`, dropping accents.

  The string is NFD-normalised so that an accented character splits into a
  base character followed by combining marks. Combining marks (Unicode
  category 'Mn') are discarded, as is every remaining character that does
  not occur in `all_letters`.
  """
  kept = []
  for ch in unicodedata.normalize('NFD', s):
    if unicodedata.category(ch) == 'Mn':
      continue
    if ch not in all_letters:
      continue
    kept.append(ch)
  return ''.join(kept)

In [None]:
# Containers filled in when the data files are read:
#   category_lines maps a category name to the list of lines in its file,
#   all_categories lists the category names in the order they were read.
category_lines = dict()
all_categories = list()

In [None]:
def read_lines(filename):
  """Read a UTF-8 text file and return its lines folded by unicode_to_ascii.

  Args:
    filename: path of the file to read.

  Returns:
    A list with one converted string per line of the file. Lines that are
    empty after conversion are skipped.
  """
  # Close the handle deterministically instead of leaving it to the
  # garbage collector.
  with open(filename, encoding='utf-8') as f:
    raw_lines = f.read().strip().split('\n')

  # A line that unicode_to_ascii reduces to '' (or an empty file, which
  # splits to ['']) would turn into a zero-length sequence downstream, so
  # such lines are dropped here.
  converted = (unicode_to_ascii(line) for line in raw_lines)
  return [line for line in converted if line]

In [None]:
# Rebuild both containers from scratch so that re-running this cell does not
# append every category to all_categories a second time.
all_categories = []
category_lines = {}

for filename in file_names:
  # The category name is the file name without directory or extension,
  # e.g. 'data/names/Greek.txt' -> 'Greek'.
  category = os.path.splitext(os.path.basename(filename))[0]
  all_categories.append(category)
  category_lines[category] = read_lines(filename)

In [None]:
# Number of categories that were loaded; used as the classifier's output size.
n_categories = len(all_categories)

In [None]:
def letter_to_index(letter):
  """Return the position of `letter` in `all_letters`, or -1 if it is absent."""
  try:
    return all_letters.index(letter)
  except ValueError:
    return -1

def letter_to_tensor(letter):
  """One-hot encode a single letter as a (1, n_letters) float tensor.

  Args:
    letter: the character to encode.

  Returns:
    A tensor of zeros with a single 1 at the letter's index in
    `all_letters`. A letter that is not in `all_letters` yields an
    all-zero row.
  """
  tensor = torch.zeros(1, n_letters)
  index = letter_to_index(letter)
  # letter_to_index returns -1 for an unknown letter. Used directly as an
  # index, -1 would silently set the last column, i.e. encode the unknown
  # letter as the last character of all_letters.
  if index >= 0:
    tensor[0, index] = 1

  return tensor

def line_to_tensor(line):
  """One-hot encode a string as a (len(line), 1, n_letters) float tensor.

  Args:
    line: the string to encode, one time step per character.

  Returns:
    A tensor of zeros where row `i` has a single 1 at the index of
    `line[i]` in `all_letters`. Characters that are not in `all_letters`
    are left as all-zero rows.
  """
  tensor = torch.zeros(len(line), 1, n_letters)
  for position, letter in enumerate(line):
    index = letter_to_index(letter)
    # -1 means "not in the vocabulary"; indexing with it would wrongly mark
    # the last column instead.
    if index >= 0:
      tensor[position, 0, index] = 1

  return tensor

In [None]:
class Network(nn.Module):
  """Sequence classifier: an (optionally bidirectional) GRU followed by a linear layer.

  Args:
    input_size: number of features per time step.
    hidden_size: size of the GRU hidden state, per direction.
    output_size: number of scores produced per time step.
    num_rnn_layers: number of stacked GRU layers.
    bidirectional: whether the GRU also processes the sequence in reverse.
  """

  def __init__(self, input_size, hidden_size, output_size, num_rnn_layers=1, bidirectional=False):
    super().__init__()

    self.num_rnn_layers = num_rnn_layers
    self.hidden_size = hidden_size
    self.bidirectional = bidirectional

    self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=self.num_rnn_layers, bidirectional=bidirectional)

    # A bidirectional GRU concatenates the features of both directions, so the
    # linear layer sees 2 * hidden_size inputs; otherwise just hidden_size.
    # (The fallback used to be `self.bidirectional`, i.e. False, which broke
    # the unidirectional configuration.)
    h_size = 2 * self.hidden_size if self.bidirectional else self.hidden_size
    self.output = nn.Linear(h_size, output_size)

  def forward(self, x, hidden):
    """Run the GRU over `x` and score every time step.

    Args:
      x: input of shape (seq_len, batch, input_size); the GRU is built with
        the default batch_first=False.
      hidden: initial hidden state, e.g. from init_hidden().

    Returns:
      A tuple (hidden, scores) -- note the order: the final GRU hidden state
      first, then the linear layer applied to the GRU output at every time step.

    NOTE(review): with bidirectional=True, scores[-1] combines the forward
    state after the whole sequence with the backward state after only the
    last element; callers that classify from scores[-1] should be aware.
    """
    x, hidden = self.gru(x, hidden)
    x = self.output(x)

    return hidden, x

  def init_hidden(self):
    """Return an all-zero initial hidden state for a batch of size 1.

    The state has shape (num_directions * num_rnn_layers, 1, hidden_size) and
    is created with the dtype and on the device of the model's parameters, so
    it can be passed to forward() without an extra transfer. A following
    `.to(device)` by the caller remains valid and is then a no-op.
    """
    num_directions = 2 if self.bidirectional else 1
    reference = self.output.weight
    return torch.zeros(
      num_directions * self.num_rnn_layers, 1, self.hidden_size,
      dtype=reference.dtype, device=reference.device,
    )

In [None]:
# Fixed seed for the split so that the same examples land in the training
# and validation sets on every run.
SPLIT_SEED = 42

# One (sequence tensor, label tensor) pair per line; the label is the index
# of the line's category in all_categories.
X = []
Y = []

for label, category in enumerate(all_categories):
  for line in category_lines[category]:
    X.append(line_to_tensor(line))
    Y.append(torch.tensor([label]))

# Hold out 20% of the examples for validation.
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=.2, random_state=SPLIT_SEED)

print(f'training set has {len(train_x)} examples')
print(f'validation set has {len(test_x)} examples')

training set has 16059 examples
validation set has 4015 examples


In [None]:
# Size of the GRU hidden state (per direction).
n_hidden = 64

# Bidirectional GRU classifier over one-hot letters, placed on the chosen device.
model = Network(
  input_size=n_letters,
  hidden_size=n_hidden,
  output_size=n_categories,
  bidirectional=True,
)
model = model.to(device)

In [None]:
# Plain SGD on a cross-entropy objective.
# ("critition" is the name train_step looks up, so the spelling is kept.)
learning_rate = 0.005
critition = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
def train_step(x, y):
  """Run one optimisation step on a single example.

  Uses the module-level `model`, `optimizer`, `critition` and `device`.

  Args:
    x: input sequence tensor for `model`, already on `device`.
    y: target tensor holding the class index, already on `device`.

  Returns:
    A tuple (loss, label): the loss of this example as a Python float and
    the predicted class index as a 0-dim tensor.
  """
  hidden = model.init_hidden().to(device)

  optimizer.zero_grad()

  _, output = model(x, hidden)
  # Only the scores at the last time step are used for classification.
  logits = output[-1]

  loss = critition(logits, y)
  loss.backward()
  optimizer.step()

  # Softmax is monotonic, so the arg-max of the raw scores already is the
  # predicted class; detach so the prediction does not hold on to the graph.
  label = torch.argmax(logits.detach())

  return loss.item(), label

In [None]:
def predict(x):
  """Predict the class of a single example without tracking gradients.

  Uses the module-level `model` and `device`.

  Args:
    x: input sequence tensor for `model`, already on `device`.

  Returns:
    The predicted class index as a 0-dim tensor.
  """
  hidden = model.init_hidden().to(device)

  with torch.no_grad():
    _, output = model(x, hidden)
    # Classify from the scores at the last time step. Softmax is monotonic,
    # so the arg-max of the raw scores is the predicted class.
    label = torch.argmax(output[-1])

  return label

In [None]:
# Number of passes over the training set.
epochs = 10

# Ground-truth class indices as plain Python numbers for accuracy_score,
# extracted before the target tensors are moved to the device.
train_labels = [target.item() for target in train_y]
val_labels = [target.item() for target in test_y]

# Move every example to the training device once, up front.
train_x, train_y = [t.to(device) for t in train_x], [t.to(device) for t in train_y]
test_x, test_y = [t.to(device) for t in test_x], [t.to(device) for t in test_y]

In [None]:
# Sanity check: push one training example through the model and look at the
# shape of the scores at the last time step.
# The initial hidden state must live on the same device as the input
# (train_x was moved to `device` above), and no gradients are needed here.
with torch.no_grad():
  _, preds = model(train_x[0], model.init_hidden().to(device))
preds[-1].shape

torch.Size([1, 18])

In [None]:
for epoch in range(epochs):
  train_preds = []
  val_preds = []
  total_loss = .0

  # zip() has no length, so tqdm is told the total explicitly; without it the
  # bar cannot show progress or a time estimate.
  for x, y in tqdm(zip(train_x, train_y), total=len(train_x), desc=f'Epoch {epoch + 1}/{epochs}'):
    loss_value, label = train_step(x, y)
    total_loss += loss_value
    # .item() turns the 0-dim prediction tensor into a plain Python int.
    train_preds.append(label.item())

  # Evaluate on the held-out examples after each pass over the training set.
  for x in test_x:
    val_preds.append(predict(x).item())

  train_accuracy = accuracy_score(train_labels, train_preds)
  val_accuracy = accuracy_score(val_labels, val_preds)
  # Mean per-example training loss for this epoch.
  total_loss /= len(train_x)

  print(f'\n#Epoch {epoch + 1}')
  print(f'Loss: {total_loss}, accuracy: {train_accuracy * 100}%, val_accuracy: {val_accuracy * 100}%')

0it [00:00, ?it/s]


#Epoch 1
Loss: 1.5797966825968703, accuracy: 52.6745127342923%, val_accuracy: 60.2241594022416%


0it [00:00, ?it/s]


#Epoch 2
Loss: 1.2675883635931713, accuracy: 62.90553583660253%, val_accuracy: 65.0560398505604%


0it [00:00, ?it/s]


#Epoch 3
Loss: 1.127066186721342, accuracy: 66.87838595180273%, val_accuracy: 67.82067247820672%


0it [00:00, ?it/s]


#Epoch 4
Loss: 1.0445438446404143, accuracy: 68.91462731178778%, val_accuracy: 69.29016189290161%


0it [00:00, ?it/s]


#Epoch 5
Loss: 0.9892588757620915, accuracy: 70.06662930443987%, val_accuracy: 69.96264009962641%


0it [00:00, ?it/s]


#Epoch 6
Loss: 0.948951499603514, accuracy: 71.19995018369761%, val_accuracy: 71.4321295143213%


0it [00:00, ?it/s]


#Epoch 7
Loss: 0.9176418897278801, accuracy: 72.10287066442493%, val_accuracy: 72.52801992528019%


0it [00:00, ?it/s]


#Epoch 8
Loss: 0.8900069964315683, accuracy: 72.75670963322747%, val_accuracy: 72.97633872976338%


0it [00:00, ?it/s]


#Epoch 9
Loss: 0.8633899862209244, accuracy: 73.56622454698301%, val_accuracy: 74.12204234122044%


0it [00:00, ?it/s]


#Epoch 10
Loss: 0.8361301816081227, accuracy: 74.4317828009216%, val_accuracy: 74.52054794520548%
