In [1]:
# Copyright (c) 2023 Sophie Katz
#
# This file is part of Sophie's ML Monorepo.
#
# Sophie's ML Monorepo is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later version.
#
# Sophie's ML Monorepo is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with Sophie's
# ML Monorepo. If not, see <https://www.gnu.org/licenses/>.

# PyTorch RNN Tutorial - Name Classification Using A Recurrent Neural Net

Tutorial URL: https://www.youtube.com/watch?v=WEV61GmmPrk

Valid as of: 2023.05.28

In [22]:
from ml.core.download import download_http
from ml.core.extract import extract_archive
from ml.core.repo_paths import get_dir_artifacts_data_raw, get_dir_artifacts_data_intermediate
import matplotlib.pyplot as plt
import os
import pathlib
import torch as T
import torch.nn as nn
import unicodedata

In [3]:
path_dir_artifacts_data_raw = get_dir_artifacts_data_raw("pytorch_rnn", create=True)
path_dir_artifacts_data_intermediate = get_dir_artifacts_data_intermediate("pytorch_rnn", create=True)

path_data = path_dir_artifacts_data_raw / "data.zip"

download_http("https://download.pytorch.org/tutorial/data.zip", path_data)

extract_archive(path_data, path_dir_artifacts_data_intermediate)

Downloading 'https://download.pytorch.org/tutorial/data.zip' to c:\Users\sophi\Code\ml\artifacts\data\pytorch_rnn\raw\data.zip...
  File already downloaded.
Extracting c:\Users\sophi\Code\ml\artifacts\data\pytorch_rnn\raw\data.zip to c:\Users\sophi\Code\ml\artifacts\data\pytorch_rnn\intermediate...


100%|██████████| 9.71M/9.71M [00:00<00:00, 2.40GB/s]

  Extraction complete.





In [27]:
def convert_to_ascii(text):
    # Convert special characters to their ascii equivalents
    return "".join([c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn"])

def load_data(path):
    dir_names = pathlib.Path(path, "data", "names")
    result = []

    for list_filename in os.listdir(dir_names):
        country = list_filename.split(".")[0]

        with open(dir_names / list_filename, "r", encoding="utf-8") as file:
            for name in file.readlines():
                result.append((country, convert_to_ascii(name.strip())))

    return result

def get_country_name_mapping(data):
    return {key: value for key, value in enumerate(set([country for country, _ in data]))}

def get_alphabet(data):
    return set([c for _, name in data for c in name])

data = load_data(path_dir_artifacts_data_intermediate)

print(f"Data: {data[:15]}")

country_name_mapping = get_country_name_mapping(data)

print(f"Countries: {country_name_mapping}")

assert len(country_name_mapping) == 18

alphabet = get_alphabet(data)

print(f"Alphabet: {alphabet} (len: {len(alphabet)})")

Data: [('Arabic', 'Khoury'), ('Arabic', 'Nahas'), ('Arabic', 'Daher'), ('Arabic', 'Gerges'), ('Arabic', 'Nazari'), ('Arabic', 'Maalouf'), ('Arabic', 'Gerges'), ('Arabic', 'Naifeh'), ('Arabic', 'Guirguis'), ('Arabic', 'Baba'), ('Arabic', 'Sabbagh'), ('Arabic', 'Attia'), ('Arabic', 'Tahan'), ('Arabic', 'Haddad'), ('Arabic', 'Aswad')]
Countries: {0: 'Polish', 1: 'Chinese', 2: 'Japanese', 3: 'Dutch', 4: 'German', 5: 'Scottish', 6: 'French', 7: 'English', 8: 'Italian', 9: 'Russian', 10: 'Portuguese', 11: 'Vietnamese', 12: 'Korean', 13: 'Irish', 14: 'Czech', 15: 'Spanish', 16: 'Greek', 17: 'Arabic'}
Alphabet: {'J', 'D', 'm', 'Y', 'E', 'v', 'V', 'e', 'y', 'F', 'Z', 'r', 'M', 'A', 'Q', 'x', 'I', 'C', 'K', 'a', 'g', 'O', 'z', 'B', '/', 'G', 'ł', 'R', 'i', 'L', '\xa0', 't', 'n', 'c', 'o', "'", 'u', 'W', 'd', 'P', 's', 'q', 'f', 'p', ':', ' ', 'X', 'w', 'T', 'h', '-', '1', 'k', 'U', 'j', ',', 'N', 'S', 'b', 'ß', 'l', 'H'} (len: 62)


In [26]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.input_to_hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.input_to_output = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input, hidden):
        combined = T.cat((input, hidden), 1)
        hidden = self.input_to_hidden(combined)
        output = self.input_to_output(combined)
        output = self.softmax(output)

        return output, hidden

    def create_hidden_initial(self):
        return T.zeros(1, self.hidden_size)

hidden_size = 128

rnn = RNN(len(alphabet), hidden_size, len(country_name_mapping))

hidden = rnn.create_hidden_initial()