# utils

In [1]:
import io 
import os
import unicodedata
import string
import glob

import random
import torch

In [2]:
# alphabet small + capital letters + " .,;'"
ALL_LETTERS = string.ascii_letters + " .,;"
N_LETTERS =  len(ALL_LETTERS)

In [3]:
type(ALL_LETTERS)

str

In [4]:
ALL_LETTERS[7]

'h'

In [5]:
#turn unicpde sdtring into plain ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) !='Mn' and c in ALL_LETTERS)

In [6]:
z = io.open('French.txt', encoding='utf-8')
print(z)

<_io.TextIOWrapper name='French.txt' mode='r' encoding='utf-8'>


In [7]:
a = io.open('French.txt', encoding='utf-8').read()
print(a[0:50])

Abel
Abraham
Adam
Albert
Allard
Archambault
Armist


In [8]:
len(a)

2162

In [9]:
type(a)

str

In [10]:
b = a.strip()
print(b[0:50])

Abel
Abraham
Adam
Albert
Allard
Archambault
Armist


In [11]:
type(b)

str

In [12]:
c = b.split('\n')
c[0:5]

['Abel', 'Abraham', 'Adam', 'Albert', 'Allard']

In [13]:
type(c)

list

In [14]:
len(c)

277

In [21]:
def load_data():
    category_lines = {}
    all_categories = []
    
    def find_files(path):
        return glob.glob(path)
    
    #read file and split into liens
    def read_lines(filename):
        lines = io.open(filename, encoding='utf-8').read().strip().split('\n')
        return [unicode_to_ascii(line) for line in lines]
    
    for filename in find_files('M:\\DL\TEXT\Datas\surname_rnn_data/names/*.txt'):
        category = os.path.splitext(os.path.basename(filename))[0]
        all_categories.append(category)
        
        lines = read_lines(filename)
        category_lines[category] = lines
        
    return category_lines, all_categories


In [22]:
x , y = load_data()
print(y)

[]


In [18]:
print(x['French'])

KeyError: 'French'

## now convert to tensors

In [None]:
def letter_index(letter):
    return ALL_LETTERS.find(letter)

print(letter_index('x'))

for i in ALL_LETTERS[0:7]:
    print(i,letter_index(i))

In [None]:
# Just for demonstration, turn a letter into a (1 x n_letters) Tensor
def letter_to_tensor(letter):
    tensor = torch.zeros(1, N_LETTERS)
    tensor[0][letter_index(letter)]=1
    return tensor

itensor =  letter_to_tensor('i')
print(type(itensor))
print(itensor.shape)
print(itensor)

In [None]:
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors

def line_tensor(line):
    tensor = torch.zeros(len(line), 1, N_LETTERS)
    for i, letter in enumerate(line):
        tensor[i][0][letter_index(letter)]=1
    return tensor

wordtensor = line_tensor('succint')
print(wordtensor.shape)
print(wordtensor)
    


In [None]:
category_lines, all_categories = load_data()
print(all_categories.index('French'))


In [None]:
def random_train_example(category_lines, all_categories):
    def random_choice(a):
        random_idx = random.randint(0, len(a)-1)
        return a[random_idx]
    
    category = random_choice(all_categories)
    line = random_choice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype = torch.long)
    linetensor = line_tensor(line)
    return category, line, category_tensor, linetensor


In [None]:
category_lines, all_categories = load_data()
p,q,r,s = random_train_example(category_lines, all_categories)

print(p,q,r.shape,s.shape,type(r), type(s),r.dtype, s.dtype)

# model

In [None]:
ed = torch.tensor([[2]])
fg = torch.tensor([[3]])
ss = torch.cat((ed,fg),0) #default dimension
ds = torch.cat((ed,fg), 1)
print(ed.shape)
print(ds, ss)
print(ds.shape, ss.shape)

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# from utils import ALL_LETTERS, N_LETTERS
# from utils import letter_index, letter_to_tensor, line_tensor, random_train_example

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim = 1)
        
    def forward(self, input_tensor, hidden_tensor):
            
        combined = torch.cat((input_tensor, hidden_tensor), 1)

        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden, combined
        
    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)
        
    

In [None]:
N_LETTERS
category_lines, all_categories = load_data()
n_categories = len(all_categories)
n_hidden = 128
rnn = RNN(N_LETTERS,n_hidden,n_categories)

In [None]:
rnn.i2h

## for letter 

In [None]:
input_tensor = letter_to_tensor('A')
hidden_tensor = rnn.init_hidden()

output, next_hidden, combined = rnn(input_tensor, hidden_tensor)

In [None]:
print(input_tensor.shape)
print(hidden_tensor.shape)
print(combined.shape)

print(output.shape)
print(next_hidden.shape)


## for word

In [None]:
input_tensor = line_tensor('Albert')
hidden_tensor = rnn.init_hidden()

output, next_hidden, combined = rnn(input_tensor[0], hidden_tensor)

In [None]:
print(input_tensor.shape)
print(hidden_tensor.shape)
print(combined.shape)

print(output.shape)
print(next_hidden.shape)


In [None]:
def category_from_output(output):
    category_idx = torch.argmax(output).item()
    return all_categories[category_idx]

print(category_from_output(output))

In [None]:
criterion = nn.NLLLoss()
learning_rate = 0.005
optimizer = torch.optim.SGD(rnn.parameters(), lr = learning_rate)

In [None]:
def train(line_tensor, category_tensor):
    hidden = rnn.init_hidden()
    
    for i in range(line_tensor.shape[0]):
        output, hidden, combined = rnn(line_tensor[i],hidden)
        
    loss = criterion(output, category_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return output,  loss.item()



In [None]:
curretn_loss = 0
all_losses= []
plot_steps, print_steps = 1000,5000
n_iters =  100000
 
for i in range(n_iters):
    category, line, category_tensor, linetensor = random_train_example(category_lines, all_categories)
    
    output, loss =  train(line_tensor, category_tensor)
    current_loss +=loss
    
    if (i+1) % plot_steps ==0:
        all_losses.append(current_loss/plot_steps)
        current_loss = 0
        
    if (i+1) % print_steps == 0:
        guess = category_from_output(output)
        correct = 'correct' if guess ==  category else f'wrong ({category})'
        print(f'{i} {i/n_iters*100} {loss:.4f}{line}/{guess}{correct}')
        
plt.figure()
plt.plot(all_losses)
plt.show()
