### Imports

In [40]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import torch
import torch.nn as nn

### Loading Datasets

In [3]:
def findFiles(path):
    """Return the list of filesystem paths matching the glob pattern ``path``.

    The order of the returned paths is whatever ``glob.glob`` produces; an
    empty list is returned when nothing matches.
    """
    matches = glob.glob(path)
    return matches

In [4]:
# Sanity check: list every name file matched by the glob pattern.
print(findFiles('datasets/data/names/*.txt'))

['datasets/data/names/Arabic.txt', 'datasets/data/names/French.txt', 'datasets/data/names/German.txt', 'datasets/data/names/Japanese.txt', 'datasets/data/names/Korean.txt', 'datasets/data/names/Russian.txt', 'datasets/data/names/English.txt', 'datasets/data/names/Irish.txt', 'datasets/data/names/Scottish.txt', 'datasets/data/names/Portuguese.txt', 'datasets/data/names/Greek.txt', 'datasets/data/names/Chinese.txt', 'datasets/data/names/Dutch.txt', 'datasets/data/names/Czech.txt', 'datasets/data/names/Polish.txt', 'datasets/data/names/Vietnamese.txt', 'datasets/data/names/Spanish.txt', 'datasets/data/names/Italian.txt']


In [5]:
# Alphabet kept by this notebook: ASCII letters plus space and the
# punctuation characters . , ; '
all_letters = string.ascii_letters + " .,;'"
# Alphabet size; used as the width of the one-hot letter tensors.
n_letters = len(all_letters)

In [6]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters present in ``all_letters``.

    The string is first NFD-normalised so that accented characters split into
    a base character followed by combining marks. Combining marks (Unicode
    category ``Mn``) are dropped, as is every remaining character that is not
    part of ``all_letters``.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        # Skip combining marks such as accents.
        if unicodedata.category(ch) == 'Mn':
            continue
        # Keep only characters from the notebook's alphabet.
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)


In [8]:
# Demo: accents are stripped and only characters from all_letters remain.
print(unicodeToAscii('Ślusàrski'))

Slusarski


In [9]:
# Containers filled while reading the name files:
#   category_lines maps a category (language) name to its list of names,
#   all_categories holds the category names in the order they were read.
category_lines = {}
all_categories = []

In [10]:
# Read a file and split into lines
def readLines(filename):
    """Read ``filename`` as UTF-8 and return its lines converted to ASCII.

    Leading and trailing whitespace of the whole file is stripped before the
    content is split on newlines; every resulting line is then passed through
    ``unicodeToAscii``.
    """
    # A context manager closes the file handle as soon as the content is
    # read, instead of leaving it open until garbage collection.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]


In [15]:
# Populate all_categories and category_lines from the name files.
for filename in findFiles('datasets/data/names/*.txt'):
    # The category name is the file name without directory and extension.
    category = os.path.splitext(os.path.basename(filename))[0]
    # Guard the append so that re-running this cell does not register the
    # same category twice, which would inflate n_categories.
    if category not in all_categories:
        all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

In [16]:
# Number of category names collected in all_categories.
n_categories = len(all_categories)

In [18]:
# Show the category names that were collected from the file names.
print ("Name Categories: ", all_categories)

Name Categories:  ['Arabic', 'French', 'German', 'Japanese', 'Korean', 'Russian', 'English', 'Irish', 'Scottish', 'Portuguese', 'Greek', 'Chinese', 'Dutch', 'Czech', 'Polish', 'Vietnamese', 'Spanish', 'Italian']


In [23]:
# Peek at the first 15 names stored for the 'Arabic' category.
print("Example Arabic Names: ", category_lines['Arabic'][:15])

Example Arabic Names:  ['Khoury', 'Nahas', 'Daher', 'Gerges', 'Nazari', 'Maalouf', 'Gerges', 'Naifeh', 'Guirguis', 'Baba', 'Sabbagh', 'Attia', 'Tahan', 'Haddad', 'Aswad']


### Converting Names to Tensors

In [25]:
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of ``letter`` within ``all_letters``.

    Uses ``str.find``, so the result is -1 when ``letter`` does not occur in
    ``all_letters``.
    """
    position = all_letters.find(letter)
    return position

In [26]:
# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    """Return a ``<1 x n_letters>`` one-hot tensor for ``letter``.

    The entry at the letter's index in ``all_letters`` is set to 1. A letter
    that is not found in ``all_letters`` yields an all-zero tensor.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex returns -1 for an unknown letter. Used directly as an
    # index, -1 would wrap around and mark the last alphabet entry, so the
    # assignment is skipped in that case.
    if index >= 0:
        tensor[0, index] = 1
    return tensor

In [29]:
# Demo: 'a' is the first character of all_letters, so column 0 is set.
print(letterToTensor('a'))

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])


In [33]:
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """Return a ``<len(line) x 1 x n_letters>`` tensor of one-hot letter rows.

    Row ``i`` has a 1 at the index of ``line[i]`` in ``all_letters``. A
    character that is not found in ``all_letters`` leaves its row all zeros.
    """
    # The middle dimension of size 1 is the batch dimension: PyTorch assumes
    # everything is in batches, and a batch size of 1 is used here.
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # letterToIndex returns -1 for an unknown character. Used directly as
        # an index, -1 would wrap around and mark the last alphabet entry, so
        # the assignment is skipped in that case.
        if index >= 0:
            tensor[li, 0, index] = 1
    return tensor

In [34]:
# Demo: two characters give a 2 x 1 x n_letters tensor, one one-hot row each.
print(lineToTensor('ab'))

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]]])


### Creating the RNN Network

In [58]:
# Random 1 x 3 tensor used to compare LogSoftmax along each dimension.
# NOTE: no random seed is set, so the sampled values differ between runs.
x = torch.randn(1, 3)

In [59]:
print(x)
# dim=1 normalises across the 3 entries of the single row, so the result
# holds the log-probabilities of those entries.
nn.LogSoftmax(dim=1)(x)

tensor([[-0.4182,  1.4131,  2.3059]])


tensor([[-3.1128, -1.2815, -0.3887]])

In [60]:
# dim=0 has size 1 for this 1 x 3 tensor, so each entry is normalised against
# itself only: the softmax is 1 everywhere and its log is 0.
nn.LogSoftmax(dim=0)(x)

tensor([[0., 0., 0.]])