In [132]:
import glob
import unicodedata
import string
import torch
import torch.nn as nn
import random

In [133]:
alphabet = string.ascii_lowercase
input_size = len(alphabet)
filenames=glob.glob('./data/names/*')
output_size = len(filenames)
hidden_size = 128


In [134]:
categories = []
names = {}
categoryLines = {}

for i,f in enumerate(filenames):
    cls = f.split('/')[-1].split('.')[0]
    categories.append(cls)
    cline = []
    with open(f, 'r') as dfile:
        for line in dfile:
            name = unicodedata.normalize('NFKD', line.rstrip('\n')).encode('ascii','ignore').decode('utf-8').lower()
            names[name] = cls
            cline.append(name)
    categoryLines[cls] = cline


In [135]:
categoryLines[categories[0]];

In [136]:
def letter2Index(letter): return alphabet.find(letter)
def index2Letter(i): return alphabet[i]
def letter2Tensor(letter):
    t = torch.zeros(1,input_size)
    idx = letter2Index(letter)
    t[idx] = 1
    return t
def cat2Index(cat): return categories.index(cat)
def cat2Tensor(cat): 
    t = torch.zeros(1, dtype=torch.long)
    t[0] = cat
    return t

In [137]:
x =cat2Tensor(5)
print(x.size(),x)

torch.Size([1]) tensor([ 5])


In [138]:
def randomChoice(l): return l[random.randint(0, len(l)-1)]    
def randomCategory(c): return randomChoice(c)
def randomWord(w):
    i = random.randint(0, len(w)-1)
    word = list(w)[i]
    cat =  names[word]
    cat_idx = cat2Tensor(cat2Index(cat))
    word_idx = word2Tensor(word)
    return i, word, word_idx, cat_idx

In [139]:
def word2Tensor(line):
    t = torch.zeros(len(line),1,input_size)
    for i,l in enumerate(line):
        t[i][0][letter2Index(l)] = 1
    return t
            

In [140]:
idx,word,w,cat = randomWord(names)
# print(idx,word,w,cat)


In [141]:
list(names)[0]

'veitch'

In [142]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size+hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size+hidden_size, output_size)
        self.softMax = nn.LogSoftmax(dim=1)
    
    def forward(self,input, hidden):
        combined = torch.cat((input, hidden),1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softMax(output)
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1,self.hidden_size)

In [143]:
def categoryFromOutput(output):
    val, idx = output.topk(1)
    return val, idx

In [144]:

rnn  = RNN(input_size, hidden_size, output_size)

In [145]:
in_put = letter2Tensor('a')
hidden = torch.zeros(1,hidden_size)
out_put, hidden = rnn(in_put,hidden)
print(out_put)

tensor([[-2.7975, -3.5700, -3.0174, -2.7378, -2.5893, -3.1056, -3.4039,
         -2.8023, -3.1005, -2.8919, -2.9581, -2.6795, -3.1690, -3.0042,
         -2.7648, -2.3966, -3.0337, -2.6785]])


In [146]:
out_put, hidden = rnn(in_put,hidden)
print(out_put)

tensor([[-2.6764, -3.4456, -2.9940, -2.7209, -2.5420, -3.2052, -3.5449,
         -2.9766, -2.8770, -2.8413, -3.1155, -2.9389, -2.9495, -2.8711,
         -2.7499, -2.3439, -3.0307, -2.8968]])


In [147]:
# idx,w,word,cat = randomWord(names)
# print(cat)
# cat_idx = cat2Index(cat)
# print(cat_idx)
# cat_idx = cat2Tensor(cat_idx)
# print(cat_idx)
# word_idx = word2Tensor(w)

In [148]:
criterion = nn.NLLLoss()
# criterion = nn.CrossEntropyLoss()

In [149]:
lr = 0.0001
optimizer = torch.optim.SGD(rnn.parameters(), lr=lr)
for i in range(1000):
    idx,w,word,y = randomWord(names)

    hidden = rnn.init_hidden()
    rnn.zero_grad()
    for l in word:
        output, hidden = rnn(l,hidden)
#         print(output)
    #print(output.size(), y, y.size())    
    loss = criterion(output,y)
    loss.backward()
#     for p in rnn.parameters():
#         p.data.add_(-lr, p.grad.data)

    optimizer.step()

#     print(loss.item())
print(loss.item())

2.678760290145874


In [152]:
def evaluate(word):
    hidden = rnn.init_hidden()
    for l in word:
        output, hidden = rnn(l,hidden)
#     print(output)        
    return output

In [154]:
total = 100
accuracy =0
for x in range(total):
    idx,name, word,y = randomWord(names)
    with torch.no_grad():
        out = evaluate(word)
        val,i = categoryFromOutput(out)
#         print(idx,name,categories[y],categories[i],y,val,out)
        print(name, categories[y], categories[i])
        if categories[y] == categories[i]:
            accuracy+=1

print("accuracy", accuracy/total)            

fujimaki Japanese Arabic
toichkin Russian Russian
kelso English English
munehin Russian Russian
bazhenov Russian Russian
hovanec Czech English
okeefe English Spanish
parratt English Vietnamese
busto Spanish Spanish
vilchek Russian English
ennos English Dutch
bellandini Italian Spanish
fellmen English Russian
de campo Italian English
broadley English Dutch
dubhshlaine Irish English
lupichev Russian Russian
granovsky Russian English
joltovsky Russian English
schlender German English
coulson English Russian
paisley English Dutch
kaufer German English
rallis Greek Dutch
dzhaldzhireev Russian Russian
winter German Arabic
tchekomasov Russian Russian
vasindin Russian Russian
alldritt English Russian
menendez Spanish Russian
shamsutdinov Russian Russian
soto Spanish Spanish
rosso English English
taverna Italian Russian
stevenson English Russian
alberici Italian Russian
awgustoff Russian Dutch
abano Spanish Dutch
haenraets Dutch Dutch
agli Italian Russian
tinker English Dutch
petru Czech Russia

In [68]:
a,b = categoryFromOutput(out)
print(out,a,b)

tensor([[-4.2089, -5.9110, -4.5130, -5.8196, -6.2518, -3.1251, -6.2346,
         -5.2451, -0.3598, -4.5567, -6.2746, -4.7919, -5.3836, -5.1650,
         -4.0600, -1.9801, -3.7620, -4.8171]]) tensor([[-0.3598]]) tensor([[ 8]])


In [515]:
y

tensor([ 15])