In [16]:
# following along this blog post to gain more understanding about RNNs
# https://jaketae.github.io/study/pytorch-rnn/

In [17]:
# Task -> Build a simple classification model that can correctly determine the 
# nationality of a person given their name. Simple , we want to be able to tell where a 
# particular name is from 

In [18]:
# download and unzip data in current directory
!curl -O https://download.pytorch.org/tutorial/data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2814k  100 2814k    0     0  1763k      0  0:00:01  0:00:01 --:--:-- 1762k


In [19]:
!unzip data.zip

Archive:  data.zip
replace data/eng-fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [20]:
# looking at the data in more detail
import os
import random 

# setup pytorch 
import torch 
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

seed_value = torch.manual_seed(42)
# change the device to GPU if its there else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
# specifying a directory and try to print all the labels that are there in the directory
data_directory = "./data/names"

# then construct a dictionary that will map a language to a numerical label
lang2label = {}
for i, file_name in enumerate(os.listdir(data_directory)):
    key = file_name.split(".")[0]
    val = torch.tensor([i],dtype=torch.long)
    lang2label[key] = val

In [22]:
#checking what lang2label contains 
print(lang2label)

{'Czech': tensor([0]), 'German': tensor([1]), 'Arabic': tensor([2]), 'Japanese': tensor([3]), 'Chinese': tensor([4]), 'Vietnamese': tensor([5]), 'Russian': tensor([6]), 'French': tensor([7]), 'Irish': tensor([8]), 'English': tensor([9]), 'Spanish': tensor([10]), 'Greek': tensor([11]), 'Italian': tensor([12]), 'Portuguese': tensor([13]), 'Scottish': tensor([14]), 'Dutch': tensor([15]), 'Korean': tensor([16]), 'Polish': tensor([17])}


In [23]:
# count of languages
num_langs = len(lang2label)

In [24]:
num_langs

18

In [25]:
# Preprocessing Stage 
# preprocessing the names -> first want to use unidecode to standardize all the names and remove 
# any acute symbols or the likes

unidecode("Ślusàrski")

'Slusarski'

In [26]:
# converting a decoded string to a tensor so that the model can process it 
from string import ascii_letters
char2idx = {letter: i for i , letter in enumerate(ascii_letters + " .,:;-'")}
num_letters = len(char2idx)

In [27]:
# the total number of tokens in our character vocabulary 
num_letters

59

In [31]:
# building a function that accomplishes the task
def name2tensor(name):
    tensor = torch.zeros(len(name),1,num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]] = 1
    return tensor

In [32]:
name2tensor("abc")

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]])

In [33]:
# Dataset Creation ->
# Need to build our dataset with all the preprocessing steps. 
# collecting all the decoded and converted tensors in a list, with accompanying labels.
# the labels can be obtained easily from the file name

In [35]:
tensor_names = []
target_langs = []

for file in os.listdir(data_directory):
    with open(os.path.join(data_directory,file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                tensor_names.append(name2tensor(name))
                target_langs.append(lang2label[lang])
            except KeyError:
                pass
                

In [36]:
# using sklearn's train_test_split() to seperate the training data from the testing data


In [38]:
from sklearn.model_selection import train_test_split

In [41]:
train_idx, test_idx = train_test_split(
    range(len(target_langs)),
    test_size = 0.1,
    shuffle=True,
    stratify=target_langs
)
# making the training dataset 
train_dataset = []
for i in train_idx:
    train_dataset.append((tensor_names[i],target_langs[i]))

# making the test dataset 
test_dataset = []
for i in test_idx:
    test_dataset.append((tensor_names[i],target_langs[i]))
    

In [42]:
# printing our train and test dataset
print(f"Train Dataset size: {len(train_dataset)}")
print(f"Test Dataset size: {len(test_dataset)}")

Train Dataset size: 18063
Test Dataset size: 2007


In [45]:
# Our Simple RNN model
# A simple RNN that takes a single character tensor repr as input and produces 
# some prediction and a hidden state, which can be used in the next iteration.
# just some fully connected layers with sigmoid non-linearity applied during the hidden 
# state computation 

class SimpleRNN(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(SimpleRNN,self).__init__()
        self.hidden_size = hidden_size
        self.in2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2output = nn.Linear(input_size + hidden_size, output_size)

    def forward(self,x,hidden_state):
        combined = torch.cat((x,hidden_state),1)
        hidden = torch.sigmoid(self.in2hidden(combined))
        output = self.in2output(combined)
        return output, hidden 

    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1,self.hidden_size))
                

In [47]:
# calling init_hidden() at the start of every new batch. For easier training and learning 
# using kaiming_uniform_() to initialize these hidden states

# building our model and training it 
hidden_size = 256
learning_rate = 0.001

model = SimpleRNN(num_letters,hidden_size,num_langs)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [54]:
## training our model
num_epochs = 10
print_interval = 3000

for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name,label) in enumerate(train_dataset):
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char,hidden_state)
        loss = criterion(output,label)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(),1)
        optimizer.step()

        if (i+1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )            

Epoch [1/10], Step [3000/18063], Loss: 0.0000
Epoch [1/10], Step [6000/18063], Loss: 0.0077
Epoch [1/10], Step [9000/18063], Loss: 0.5191
Epoch [1/10], Step [12000/18063], Loss: 0.0159
Epoch [1/10], Step [15000/18063], Loss: 0.0025
Epoch [1/10], Step [18000/18063], Loss: 0.0012
Epoch [2/10], Step [3000/18063], Loss: 0.1433
Epoch [2/10], Step [6000/18063], Loss: 0.0010
Epoch [2/10], Step [9000/18063], Loss: 1.6404
Epoch [2/10], Step [12000/18063], Loss: 2.3458
Epoch [2/10], Step [15000/18063], Loss: 0.0060
Epoch [2/10], Step [18000/18063], Loss: 0.0076
Epoch [3/10], Step [3000/18063], Loss: 0.0395
Epoch [3/10], Step [6000/18063], Loss: 2.5282
Epoch [3/10], Step [9000/18063], Loss: 0.0309
Epoch [3/10], Step [12000/18063], Loss: 0.0148
Epoch [3/10], Step [15000/18063], Loss: 0.0016
Epoch [3/10], Step [18000/18063], Loss: 0.0000
Epoch [4/10], Step [3000/18063], Loss: 0.0245
Epoch [4/10], Step [6000/18063], Loss: 1.0341
Epoch [4/10], Step [9000/18063], Loss: 0.2133
Epoch [4/10], Step [12000

In [55]:
# testing our model , looking at accuracy factor 
num_correct = 0 
num_samples = len(test_dataset)

model.eval()

with torch.no_grad():
    for name,label in test_dataset:
        hidden_state = model.init_hidden()
        for char in name:
            output,hidden_state = model(char,hidden_state)
        _,pred = torch.max(output,dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct/ num_samples * 100:.4f}%")




Accuracy: 82.6109%


In [56]:
# checking our model with some concrete examples 
label2lang = {}
for lang, label in lang2label.items():
    label2lang[label.item()] = lang 

def myrnn_predict(name):
    model.eval()
    tensor_name = name2tensor(name)
    with torch.no_grad():
        hidden_state = model.init_hidden()
        for char in tensor_name:
            output, hidden_state = model(char,hidden_state)
        _,pred = torch.max(output,dim=1)
    model.train()
    return label2lang[pred.item()]

In [58]:
myrnn_predict("Mike")

'Japanese'

In [59]:
myrnn_predict("Qin")

'Chinese'

In [60]:
myrnn_predict("Sagnik")

'Arabic'