In [None]:
# CLASSIFYING NAMES WITH A CHARACTER-LEVEL RNN
# modified approach
# https://jaketae.github.io/study/pytorch-rnn/

# March 2023
# Download the dataset:
# https://download.pytorch.org/tutorial/data.zip
# copy to your datapath

In [1]:
import os, sys
import urllib.request

from google.colab import drive
drive.mount('/content/drive')
#change this based on your setup
root = '/content/drive/My Drive/Colab/ML/'
sys.path.append(root +'code/')
datapath =  root + 'data/'

Mounted at /content/drive


In [2]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob

#check to see if the data is downloaded
print(datapath)
for name in glob.glob(datapath + 'names/*'):
    print(name)

/content/drive/My Drive/Colab/ML/data/
/content/drive/My Drive/Colab/ML/data/names/Korean.txt
/content/drive/My Drive/Colab/ML/data/names/German.txt
/content/drive/My Drive/Colab/ML/data/names/Portuguese.txt
/content/drive/My Drive/Colab/ML/data/names/Dutch.txt
/content/drive/My Drive/Colab/ML/data/names/Polish.txt
/content/drive/My Drive/Colab/ML/data/names/Russian.txt
/content/drive/My Drive/Colab/ML/data/names/Greek.txt
/content/drive/My Drive/Colab/ML/data/names/Arabic.txt
/content/drive/My Drive/Colab/ML/data/names/Spanish.txt
/content/drive/My Drive/Colab/ML/data/names/Irish.txt
/content/drive/My Drive/Colab/ML/data/names/Czech.txt
/content/drive/My Drive/Colab/ML/data/names/Italian.txt
/content/drive/My Drive/Colab/ML/data/names/Japanese.txt
/content/drive/My Drive/Colab/ML/data/names/Chinese.txt
/content/drive/My Drive/Colab/ML/data/names/English.txt
/content/drive/My Drive/Colab/ML/data/names/Scottish.txt
/content/drive/My Drive/Colab/ML/data/names/French.txt
/content/drive/My

In [5]:
import random
from string import ascii_letters
import torch
from torch import nn
import torch.nn.functional as F

somenumber = 42
_ = torch.manual_seed(somenumber)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
datapath_names = datapath + 'names/'
print(datapath_names)

lang2label = {file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
  for i, file_name in enumerate(os.listdir(datapath_names))}

lang2label

/content/drive/My Drive/Colab/ML/data/names/


{'Korean': tensor([0]),
 'German': tensor([1]),
 'Portuguese': tensor([2]),
 'Dutch': tensor([3]),
 'Polish': tensor([4]),
 'Russian': tensor([5]),
 'Greek': tensor([6]),
 'Arabic': tensor([7]),
 'Spanish': tensor([8]),
 'Irish': tensor([9]),
 'Czech': tensor([10]),
 'Italian': tensor([11]),
 'Japanese': tensor([12]),
 'Chinese': tensor([13]),
 'English': tensor([14]),
 'Scottish': tensor([15]),
 'French': tensor([16]),
 'Vietnamese': tensor([17])}

In [7]:
num_langs = len(lang2label)


In [8]:
char2idx = {letter: i for i, letter in enumerate(ascii_letters + " .,:;-'")}
num_letters = len(char2idx); num_letters

59

In [9]:
def name2tensor(name):
    tensor = torch.zeros(len(name), 1, num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]] = 1
    return tensor

In [10]:
#test: convert a simple string to a tensor
name2tensor("abc")

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]])

In [14]:
from unidecode import *

tensor_names = []
target_langs = []

for file in os.listdir(datapath_names):
    with open(os.path.join(datapath_names, file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                tensor_names.append(name2tensor(name))
                target_langs.append(lang2label[lang])
            except KeyError:
                pass

In [15]:
from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    range(len(target_langs)), 
    test_size=0.1, 
    shuffle=True, 
    stratify=target_langs
)

train_dataset = [
    (tensor_names[i], target_langs[i])
    for i in train_idx
]

test_dataset = [
    (tensor_names[i], target_langs[i])
    for i in test_idx
]

  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)


In [16]:
print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")

Train: 18063
Test: 2007


In [17]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.in2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2output = nn.Linear(input_size + hidden_size, output_size)
    
    def forward(self, x, hidden_state):
        combined = torch.cat((x, hidden_state), 1)
        hidden = torch.sigmoid(self.in2hidden(combined))
        output = self.in2output(combined)
        return output, hidden
    
    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))

In [19]:
hidden_size = 256
learning_rate = 0.001

model_s = SimpleRNN(num_letters, hidden_size, num_langs)
criterion_s = nn.CrossEntropyLoss()
optimizer_s = torch.optim.Adam(model_s.parameters(), lr=learning_rate)

In [21]:
num_epochs = 2
print_interval = 3000

for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        hidden_state = model_s.init_hidden()
        for char in name:
            output, hidden_state = model_s(char, hidden_state)
        loss = criterion_s(output, label)

        optimizer_s.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model_s.parameters(), 1)
        optimizer_s.step()
        
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )

Epoch [1/2], Step [3000/18063], Loss: 2.7406
Epoch [1/2], Step [6000/18063], Loss: 0.0204
Epoch [1/2], Step [9000/18063], Loss: 0.0080
Epoch [1/2], Step [12000/18063], Loss: 3.3750
Epoch [1/2], Step [15000/18063], Loss: 1.7620
Epoch [1/2], Step [18000/18063], Loss: 0.0000
Epoch [2/2], Step [3000/18063], Loss: 0.0093
Epoch [2/2], Step [6000/18063], Loss: 0.3503
Epoch [2/2], Step [9000/18063], Loss: 0.5274
Epoch [2/2], Step [12000/18063], Loss: 0.9098
Epoch [2/2], Step [15000/18063], Loss: 0.0007
Epoch [2/2], Step [18000/18063], Loss: 4.0069


In [22]:
num_correct = 0
num_samples = len(test_dataset)

model_s.eval()

with torch.no_grad():
    for name, label in test_dataset:
        hidden_state = model_s.init_hidden()
        for char in name:
            output, hidden_state = model_s(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 71.0513%


In [23]:
label2lang = {label.item(): lang for lang, label in lang2label.items()}

def SimpleRNN_predict(name):
    model_s.eval()
    tensor_name = name2tensor(name)
    with torch.no_grad():
        hidden_state = model_s.init_hidden()
        for char in tensor_name:
            output, hidden_state = model_s(char, hidden_state)
        _, pred = torch.max(output, dim=1)
    model_s.train()    
    return (label2lang[pred.item()])

In [24]:
SimpleRNN_predict("Qin")

'Chinese'

In [25]:
SimpleRNN_predict("Quentin")

'Russian'

In [26]:
SimpleRNN_predict("Slaveya")

'Russian'

In [27]:
SimpleRNN_predict("Fernando")

'Italian'

Compare to  a different model. Try GRU

In [28]:
class GRUModel(nn.Module):
    def __init__(self, num_layers, hidden_size):
        super(GRUModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size=num_letters, 
            hidden_size=hidden_size, 
            num_layers=num_layers,
        )
        self.fc = nn.Linear(hidden_size, num_langs)
    
    def forward(self, x):
        hidden_state = self.init_hidden()
        output, hidden_state = self.gru(x, hidden_state)
        output = self.fc(output[-1])
        return output
    
    def init_hidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size).to(device)

In [30]:
model_g = GRUModel(num_layers=2, hidden_size=hidden_size)
optimizer_g = torch.optim.Adam(model_g.parameters(), lr=learning_rate)
criterion_g = nn.CrossEntropyLoss()

In [31]:
for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        output = model_g(name)
        loss = criterion_g(output, label)

        optimizer_g.zero_grad()
        loss.backward()
        optimizer_g.step()
         
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )

Epoch [1/2], Step [3000/18063], Loss: 1.9539
Epoch [1/2], Step [6000/18063], Loss: 3.2937
Epoch [1/2], Step [9000/18063], Loss: 0.4654
Epoch [1/2], Step [12000/18063], Loss: 0.0004
Epoch [1/2], Step [15000/18063], Loss: 1.4867
Epoch [1/2], Step [18000/18063], Loss: 0.0001
Epoch [2/2], Step [3000/18063], Loss: 0.0028
Epoch [2/2], Step [6000/18063], Loss: 0.0009
Epoch [2/2], Step [9000/18063], Loss: 1.5605
Epoch [2/2], Step [12000/18063], Loss: 2.3188
Epoch [2/2], Step [15000/18063], Loss: 0.0001
Epoch [2/2], Step [18000/18063], Loss: 1.0600


In [32]:
num_correct = 0

model_g.eval()

with torch.no_grad():
    for name, label in test_dataset:
        output = model_g(name)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 80.8670%


In [33]:
def GRU_RNN_predict(name):
    model_g.eval()
    tensor_name = name2tensor(name)
    with torch.no_grad():
        output = model_g(tensor_name)
        _, pred = torch.max(output, dim=1)
    model_g.train()
    return label2lang[pred.item()]

In [35]:
GRU_RNN_predict("Jake")


'English'

In [36]:
GRU_RNN_predict("Qin")

'Chinese'

In [37]:
GRU_RNN_predict("Fernando")


'Italian'

In [38]:
GRU_RNN_predict("Quentin")

'English'