# **Character-RNN**

In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn

import string

## Getting Data

In [2]:
# Load the names/origins table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('names-and-origins.csv')

In [3]:
# Preview the first ten rows of the raw table.
df.head(10)

Unnamed: 0,origin,name,normalized_name
0,English,Abbas,abbas
1,English,Abbey,abbey
2,English,Abbott,abbott
3,English,Abdi,abdi
4,English,Abel,abel
5,English,Abraham,abraham
6,English,Abrahams,abrahams
7,English,Abrams,abrams
8,English,Ackary,ackary
9,English,Ackroyd,ackroyd


In [4]:
# Number of rows (one per name) in the table.
len(df)

20074

In [5]:
# Convert origin to a pandas categorical so every distinct origin gets an integer code.
df['origin'] = df['origin'].astype('category')

In [6]:
# Store each row's categorical code in its own column; NamesDataset reads it as the label.
df['origin_code'] = df['origin'].cat.codes

In [7]:
# List the categories; a category's position in this index is its integer code.
df['origin'].cat.categories

Index(['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish',
       'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese'],
      dtype='object')

In [10]:
# Preview again to confirm the origin_code column is present.
df.head(10)

Unnamed: 0,origin,name,normalized_name,origin_code
0,English,Abbas,abbas,4
1,English,Abbey,abbey,4
2,English,Abbott,abbott,4
3,English,Abdi,abdi,4
4,English,Abel,abel,4
5,English,Abraham,abraham,4
6,English,Abrahams,abrahams,4
7,English,Abrams,abrams,4
8,English,Ackary,ackary,4
9,English,Ackroyd,ackroyd,4


## Tokenizer

In [11]:
class Tokenizer:
    """Character-level one-hot tokenizer over the lowercase ASCII alphabet.

    Each character maps to a fixed index (``'a'`` -> 0, ``'b'`` -> 1, ...) and
    is encoded as a one-hot vector of width ``num_tokens``.

    Args:
        num_tokens: Width of every one-hot vector. Only the first
            ``num_tokens`` lowercase letters are encodable, so the vocabulary
            can never index past the end of a one-hot vector.
    """

    def __init__(self, num_tokens=26):
        self.num_tokens = num_tokens
        # Tie the vocabulary to the one-hot width: a character that cannot be
        # represented is reported as unknown instead of raising IndexError
        # when its position is written into a too-short vector.
        self.tokens = list(string.ascii_lowercase)[:max(num_tokens, 0)]
        self.token_to_index = {ch: i for i, ch in enumerate(self.tokens)}
        self.index_to_token = {i: ch for i, ch in enumerate(self.tokens)}

    def tokenize(self, x):
        """One-hot encode a single character.

        Args:
            x: The character to encode.

        Returns:
            A 1-D ``torch.long`` tensor of length ``num_tokens`` with a single 1.

        Raises:
            ValueError: If ``x`` is not part of the vocabulary.
        """
        # Dict lookup is O(1); the previous membership test scanned the list.
        try:
            idx = self.token_to_index[x]
        except (KeyError, TypeError):
            raise ValueError(f"unknown token: {x!r}") from None

        one_hot = torch.zeros(self.num_tokens, dtype=torch.long)
        one_hot[idx] = 1
        return one_hot

    def get_char(self, x):
        """Decode a one-hot (or score) vector to the character at its argmax."""
        idx = torch.argmax(x).item()
        return self.index_to_token[idx]

    def tokenize_name(self, name):
        """Encode a string as a ``(len(name), num_tokens)`` one-hot matrix.

        The matrix uses torch's default floating dtype; row ``i`` is the
        one-hot encoding of ``name[i]``.

        Raises:
            ValueError: If any character is not part of the vocabulary.
        """
        vector = torch.zeros(size=(len(name), self.num_tokens))
        for i, ch in enumerate(name):
            vector[i] = self.tokenize(ch)

        return vector

    def get_name(self, vector):
        """Decode a one-hot matrix back into a string.

        Rows whose entries do not sum to exactly 1 (for example all-zero
        padding rows) are skipped.
        """
        name = ''
        for i in range(vector.size(0)):
            # Only genuine one-hot rows carry a character; argmax of an
            # all-zero padding row would otherwise decode to 'a'.
            if torch.sum(vector[i]).item() == 1:
                name += self.get_char(vector[i])

        return name

In [12]:
# Sanity check: a 7-letter name should encode to a (7, 26) one-hot matrix.
tk = Tokenizer()
tk.tokenize_name('shreyas').shape

torch.Size([7, 26])

## Custom Dataset

In [13]:
class NamesDataset:
    """Map-style dataset of one-hot encoded names padded to a fixed length.

    Args:
        df: DataFrame with a ``normalized_name`` column. For labelled data it
            must also have a categorical ``origin`` column and an integer
            ``origin_code`` column.
        max_length: Number of time steps every encoded name is zero-padded to.
            A name longer than this makes ``torch.zeros`` fail on a negative
            dimension in ``__getitem__``.
        is_test: When truthy, items are returned without a label and the
            label columns become optional.
    """

    def __init__(self, df, max_length=19, is_test = False):

        self.is_test = is_test
        self.df = df
        self.max_length = max_length
        self.tk = Tokenizer()

        self.names = list(self.df['normalized_name'])

        # An unlabelled test frame has no origin columns; only read them when
        # labels are required or actually available, so test mode does not
        # fail with a KeyError.
        has_labels = {'origin', 'origin_code'}.issubset(self.df.columns)
        if not self.is_test or has_labels:
            self.origins = list(self.df['origin'])
            self.categories = dict(enumerate(self.df['origin'].cat.categories))
            self.labels = list(self.df['origin_code'])
        else:
            self.origins = []
            self.categories = {}
            self.labels = []

    def __len__(self):
        """Number of names in the dataset."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return the padded encoding of name ``idx`` (plus its label unless in test mode).

        Returns:
            ``padded`` of shape ``(max_length, num_tokens)`` in test mode,
            otherwise the tuple ``(padded, label)`` where ``label`` is a
            scalar ``torch.long`` tensor.
        """
        name = self.tk.tokenize_name(self.names[idx])
        pad_zeros = torch.zeros(size=(self.max_length - name.size(0), self.tk.num_tokens))

        # torch.cat is the canonical name; torch.concat is a newer alias.
        padded = torch.cat([name, pad_zeros])

        # Truthiness check: falsy flags such as 0 or None mean "labelled",
        # which an identity comparison against False got wrong.
        if self.is_test:
            return padded

        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return (padded, label)

### Shuffle-split the DataFrame into train and validation sets

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# shuffled split reproducible across runs.
train, val = train_test_split(df, test_size=0.2, shuffle=True, random_state=1357)

# Drop the shuffled original row labels so each split is indexed 0..n-1.
# Reassigning (instead of inplace=True) keeps the cell free of in-place
# mutation of the frames returned by train_test_split.
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [17]:
# (rows, columns) of the training and validation splits.
train.shape, val.shape

((16059, 4), (4015, 4))

In [18]:
# Length of the longest normalized name across the whole table
# (training and validation rows alike).
max_length = max(len(name) for name in df['normalized_name'])
max_length

19

In [19]:
# Pad to the longest name measured from the data rather than relying on
# NamesDataset's hard-coded default, so the two cannot silently drift apart.
train_ds = NamesDataset(train, max_length=max_length)
val_ds = NamesDataset(val, max_length=max_length)

In [20]:
# Number of samples in each dataset; should match the row counts of the splits.
len(train_ds), len(val_ds)

(16059, 4015)

In [21]:
# Shape of the first padded name in each dataset: the first [0] picks the
# sample, the second [0] picks the tensor out of the (tensor, label) pair.
train_ds[0][0].shape, val_ds[0][0].shape

(torch.Size([19, 26]), torch.Size([19, 26]))

___


# Recurrent Neural Networks

![RNN diagram](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/Recurrent_neural_network_unfold.svg/640px-Recurrent_neural_network_unfold.svg.png)

[Image src: By fdeloche - Own work, CC BY-SA 4.0](https://commons.wikimedia.org/w/index.php?curid=60109157)

## Notes:

- In this notebook we'll use a many-to-one RNN: many inputs (the sequence of characters in a name) and one output (the predicted origin).
- The longest name in the list is 19 characters long.
- The total number of tokens (characters) is 26: the lowercase letters a–z.
- Each token is one-hot encoded as a vector of length 26, so a name of `n` characters has shape (n, 26) before padding.
- Each name is in the shape (19, 26) after padding with zeros.

[RNN pytorch docs](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)

For the RNN:

- N = batch_size
- L = sequence length
- D = 1 (unidirectional)
- Hin = input_size
- Hout = hidden_size

Shapes:

- input: `(N,L,Hin)` when batch_first=True
- hidden:  `(D∗num_layers,N,Hout)` -- initialized to zeros automatically by torch

In [60]:
class RNNModel(nn.Module):
    """Many-to-one RNN classifier over fixed-length one-hot character sequences.

    The RNN output at every time step is flattened into a single vector and
    passed through one linear layer, so inputs must have exactly ``seq_len``
    time steps.

    Args:
        input_size: Width of each time step's feature vector.
        seq_len: Number of time steps in every input sequence.
        hidden_size: Size of the RNN hidden state.
        num_classes: Number of output classes (logits per sample).
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, input_size=26, seq_len=19, hidden_size=64, num_classes=18, num_layers=1):

        super().__init__()
        self.input_size = input_size
        self.sequence_length = seq_len
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.num_layers = num_layers

        self.rnn = nn.RNN(self.input_size, self.hidden_size, self.num_layers, batch_first=True)

        # The classifier sees the hidden output of every time step, hence the
        # hidden_size * sequence_length input width.
        self.fc = nn.Linear(self.hidden_size * self.sequence_length, self.num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalized logits.
        """
        # The final hidden state is not needed; only per-step outputs are used.
        rnn_out, _ = self.rnn(x)

        # (batch, seq_len, hidden_size) -> (batch, seq_len * hidden_size)
        flat = torch.flatten(rnn_out, 1)
        return self.fc(flat)
        

In [61]:
# Smoke test: push one random (batch=1, seq_len=19, input_size=26) tensor
# through an untrained model; the result is one row of 18 class logits.
m = RNNModel()
x = torch.rand((1,19,26))
x = m(x)  # note: x is rebound from the input to the model output
x

tensor([[ 0.0026,  0.0304,  0.2558,  0.2011, -0.0301,  0.0762, -0.1369, -0.0292,
          0.1412,  0.1963,  0.3258, -0.2663, -0.0191,  0.1027, -0.0392, -0.2966,
          0.1201, -0.0470]], grad_fn=<AddmmBackward0>)