In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# This code will download our labeled data
!curl -O https://download.pytorch.org/tutorial/data.zip; unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2814k  100 2814k    0     0  7906k      0 --:--:-- --:--:-- --:--:-- 7906k
Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating

In [5]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 4.8 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


In [6]:
# Let's import our dependencies and configure some settings
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

_ = torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# We first specify a directory, then try to print out all the labels there are.
# We can then construct a dictionary that maps a language to a numerical label
data_dir = "./data/names"

lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(os.listdir(data_dir))
}

In [9]:
lang2label

{'Irish': tensor([0]),
 'Czech': tensor([1]),
 'Vietnamese': tensor([2]),
 'English': tensor([3]),
 'French': tensor([4]),
 'Scottish': tensor([5]),
 'Korean': tensor([6]),
 'Russian': tensor([7]),
 'Japanese': tensor([8]),
 'Italian': tensor([9]),
 'Greek': tensor([10]),
 'Portuguese': tensor([11]),
 'Polish': tensor([12]),
 'German': tensor([13]),
 'Arabic': tensor([14]),
 'Dutch': tensor([15]),
 'Spanish': tensor([16]),
 'Chinese': tensor([17])}

In [11]:
"""Let’s store the number of languages in some variable so that we can use it later in our model declaration,
specifically when we specify the size of the final output layer"""
num_langs = len(lang2label)

In [12]:
"""Now, let’s preprocess the names. We first want to use unidecode to standardize all names and remove any
acute symbols or the likes"""
unidecode("Ślusàrski")

'Slusarski'

In [14]:
"""Once we have a decoded string, we then need to convert it to a tensor so that the model can process it. 
This can first be done by constructing a char2idx mapping, as shown below"""
char2idx = {letter: i for i, letter in enumerate(ascii_letters + " .,:;-'")}
num_letters = len(char2idx)
num_letters

59

We see that there are a total of 59 tokens in our character vocabulary. This includes spaces and punctuations, such as ` .,:;-‘. This also means that each name will now be expressed as a tensor of size (num_char, 59); in other words, each character will be a tensor of size (59,)`. We can now build a function that accomplishes this task, as shown below:

In [16]:
def name2tensor(name):
    tensor = torch.zeros(len(name), 1, num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]] = 1
    return tensor

If you read the code carefully, you’ll realize that the output tensor is of size (num_char, 1, 59), which is different from the explanation above. Well, the reason for that extra dimension is that we are using a batch size of 1 in this case. In PyTorch, RNN layers expect the input tensor to be of size (seq_len, batch_size, input_size).

Let’s quickly verify the output of the name2tensor() function with a dummy input.

In [21]:
name2tensor("abc")

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]])

Now we need to build a our dataset with all the preprocessing steps. Let’s collect all the decoded and converted tensors in a list, with accompanying labels. The labels can be obtained easily from the file name, for example german.txt.

In [22]:
tensor_names = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                tensor_names.append(name2tensor(name))
                target_langs.append(lang2label[lang])
            except KeyError:
                pass

We could wrap this in a PyTorch Dataset class, but for simplicity sake let’s just use a good old for loop to feed this data into our model. Since we are dealing with normal lists, we can easily use sklearn’s train_test_split() to separate the training data from the testing data.

In [33]:
from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    range(len(target_langs)), 
    test_size=0.1, 
    shuffle=True, 
    stratify=target_langs
)

train_dataset = [
    (tensor_names[i], target_langs[i])
    for i in train_idx
]

test_dataset = [
    (tensor_names[i], target_langs[i])
    for i in test_idx
]

  array = np.asarray(array, order=order, dtype=dtype)
  array = np.asarray(array, order=order, dtype=dtype)


Let’s see how many training and testing data we have. Note that we used a test_size of 0.1.

In [34]:
print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")

Train: 18063
Test: 2007


Now we can build our model. This is a very simple RNN that takes a single character tensor representation as input and produces some prediction and a hidden state, which can be used in the next iteration. Notice that it is just some fully connected layers with a sigmoid non-linearity applied during the hidden state computation.

In [35]:
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyRNN, self).__init__()
        self.hidden_size = hidden_size
        self.in2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2output = nn.Linear(input_size + hidden_size, output_size)
    
    def forward(self, x, hidden_state):
        combined = torch.cat((x, hidden_state), 1)
        hidden = torch.sigmoid(self.in2hidden(combined))
        output = self.in2output(combined)
        return output, hidden
    
    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))


We call init_hidden() at the start of every new batch. For easier training and learning, I decided to use kaiming_uniform_() to initialize these hidden states.

We can now build our model and start training it.

In [36]:
hidden_size = 256
learning_rate = 0.001

model = MyRNN(num_letters, hidden_size, num_langs)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

I realized that training this model is very unstable, and as you can see the loss jumps up and down quite a bit. Nonetheless, I didn’t want to cook my 13-inch MacBook Pro so I decided to stop at two epochs.

In [37]:
num_epochs = 2
print_interval = 3000

for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )


Epoch [1/2], Step [3000/18063], Loss: 0.0230
Epoch [1/2], Step [6000/18063], Loss: 1.3776
Epoch [1/2], Step [9000/18063], Loss: 0.0006
Epoch [1/2], Step [12000/18063], Loss: 3.2339
Epoch [1/2], Step [15000/18063], Loss: 0.4707
Epoch [1/2], Step [18000/18063], Loss: 3.4659
Epoch [2/2], Step [3000/18063], Loss: 0.1831
Epoch [2/2], Step [6000/18063], Loss: 0.1333
Epoch [2/2], Step [9000/18063], Loss: 1.9100
Epoch [2/2], Step [12000/18063], Loss: 0.0001
Epoch [2/2], Step [15000/18063], Loss: 0.0237
Epoch [2/2], Step [18000/18063], Loss: 3.1870


Now we can test our model. We could look at other metrics, but accuracy is by far the simplest, so let’s go with that.

In [38]:
num_correct = 0
num_samples = len(test_dataset)

model.eval()

with torch.no_grad():
    for name, label in test_dataset:
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 71.5994%


The model records a 72 percent accuracy rate. This is very bad, but given how simple the models is and the fact that we only trained the model for two epochs, we can lay back and indulge in momentary happiness knowing that the simple RNN model was at least able to learn something.

Let’s see how well our model does with some concrete examples. Below is a function that accepts a string as input and outputs a decoded prediction.

In [39]:
label2lang = {label.item(): lang for lang, label in lang2label.items()}

def myrnn_predict(name):
    model.eval()
    tensor_name = name2tensor(name)
    with torch.no_grad():
        hidden_state = model.init_hidden()
        for char in tensor_name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
    model.train()    
    return label2lang[pred.item()]

In [45]:
myrnn_predict("hasan")

'Arabic'