# **Character-RNN**

In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn

import string

In [2]:
from sklearn.metrics import accuracy_score

## Getting Data

In [3]:
df = pd.read_csv('names-and-origins.csv')

In [4]:
df.head(10)

Unnamed: 0,origin,name,normalized_name
0,English,Abbas,abbas
1,English,Abbey,abbey
2,English,Abbott,abbott
3,English,Abdi,abdi
4,English,Abel,abel
5,English,Abraham,abraham
6,English,Abrahams,abrahams
7,English,Abrams,abrams
8,English,Ackary,ackary
9,English,Ackroyd,ackroyd


In [5]:
len(df)

20074

In [6]:
df['origin'] = df['origin'].astype('category')

In [7]:
df['origin_code'] = df['origin'].cat.codes

In [8]:
df['origin'].cat.categories

Index(['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish',
       'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese'],
      dtype='object')

In [9]:
df.head(10)

Unnamed: 0,origin,name,normalized_name,origin_code
0,English,Abbas,abbas,4
1,English,Abbey,abbey,4
2,English,Abbott,abbott,4
3,English,Abdi,abdi,4
4,English,Abel,abel,4
5,English,Abraham,abraham,4
6,English,Abrahams,abrahams,4
7,English,Abrams,abrams,4
8,English,Ackary,ackary,4
9,English,Ackroyd,ackroyd,4


## Tokenizer

In [10]:
class Tokenizer:
    
    def __init__(self, num_tokens=26):
        self.num_tokens = num_tokens
        self.tokens = list(string.ascii_lowercase)
        self.token_to_index = {ch:i for i,ch in enumerate(self.tokens)}
        self.index_to_token = {i:ch for i,ch in enumerate(self.tokens)}
        
    def tokenize(self, x):
        one_hot = torch.zeros(self.num_tokens, dtype=torch.long)
        if x not in self.tokens:
            raise Exception("unknown token")
        else:
            idx = self.token_to_index[x]
        one_hot[idx] = 1
        return one_hot
    
    def get_char(self,x):
        idx = torch.argmax(x).item()
        return self.index_to_token[idx]
    
    def tokenize_name(self, name):
        vector = torch.zeros(size=(len(name), self.num_tokens))
        for i,ch in enumerate(name):
            vector[i] = self.tokenize(ch)
            
        return vector
    
    def get_name(self,vector):
        name = ''
        for i in range(vector.size(0)):
            if torch.sum(vector[i]).item() == 1:
                name += self.get_char(vector[i])
            
        return name

In [11]:
tk = Tokenizer()
tk.tokenize_name('shreyas').shape

torch.Size([7, 26])

___

# Custom Dataset

In [12]:
class NamesDataset:
    
    def __init__(self, df, max_length=19, is_test = False):
        
        self.is_test = is_test
        self.df = df
        
#         self.df['origin'] = self.df['origin'].astype('category')
#         self.df['origin_code'] = self.df['origin'].cat.codes
        
        self.names = list(self.df['normalized_name'])
        self.origins = list(self.df['origin'])
        self.categories = {i:origin for i, origin in enumerate(self.df['origin'].cat.categories)}
        self.labels = list(self.df['origin_code'])
        
        self.max_length = max_length
        
        self.tk = Tokenizer()
        
    def __len__(self):
        return len(self.names)
    
    def __getitem__(self, idx):
        
        name = self.tk.tokenize_name(self.names[idx])
        pad_zeros = torch.zeros(size=(self.max_length-name.size(0), self.tk.num_tokens))
        
        padded = torch.concat([name,pad_zeros])
        
        if self.is_test is False:
            label = torch.tensor(self.labels[idx], dtype=torch.long)
            return (padded, label)
        
        return padded

### Shuffle-Split Dataframe into train and validation set

In [13]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(df, test_size=0.2, shuffle=True, random_state = 1357)
type(train), type(val)

train.reset_index(inplace=True, drop=True)
val.reset_index(inplace=True, drop=True)

In [14]:
train.shape, val.shape

((16059, 4), (4015, 4))

In [15]:
max_length = max(map(len, df['normalized_name'].values))
max_length

19

In [16]:
train_ds = NamesDataset(train)
val_ds = NamesDataset(val)

In [17]:
len(train_ds), len(val_ds)

(16059, 4015)

In [18]:
train_ds[0][0].shape, val_ds[0][0].shape

(torch.Size([19, 26]), torch.Size([19, 26]))

___


# Recurrent Neural Networks

![RNN diagram](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/Recurrent_neural_network_unfold.svg/640px-Recurrent_neural_network_unfold.svg.png)

[Image src: By fdeloche - Own work, CC BY-SA 4.0](https://commons.wikimedia.org/w/index.php?curid=60109157)

## Notes:

- In this we'll be using a many-to-one RNN. Many inputs i.e. the sequence of characters (name) and the output (origin)
- The longest name in the list is 19 characters long.
- Total number of tokens/characters are 26 - lowercase alphabets
- Each token has been one-hot encoded in the shape (1,26)
- Each name is in the shape (19, 26) after padding with zeros.

[RNN pytorch docs](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)

For the RNN:

- N = batch_size
- L = sequence length
- D = 1 (unidirectional)
- Hin = input_size
- Hout = hidden_size

Shapes:

- input: `(N,L,Hin)` when batch_first=True
- hidden:  `(D∗num_layers,N,Hout)` -- initialized to zeros automatically by torch

In [19]:
class RNNModel(nn.Module):
    
    def __init__(self, input_size=26, seq_len=19, hidden_size=64, num_classes=18):
        
        super(RNNModel, self).__init__()
        self.input_size = input_size 
        self.sequence_length = seq_len 
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.num_layers = 1
        
        
        self.rnn = nn.RNN(self.input_size, self.hidden_size, self.num_layers ,batch_first=True)
        
        self.fc = nn.Linear(self.hidden_size * self.sequence_length, self.num_classes)
        
    def forward(self, x):
        # X: shape: batch x seq_len x input_size
        batch_size = x.size(0)
        x, h_out = self.rnn(x)
        
        x = torch.flatten(x,1)
        x = self.fc(x)
        
        return x
        

In [19]:
m = RNNModel()
x = torch.rand((1,19,26))
x = m(x)
x

tensor([[-0.0639,  0.1338, -0.0672, -0.0165,  0.0578,  0.1744,  0.0541,  0.0883,
         -0.1580,  0.0591, -0.1514,  0.0150,  0.0386,  0.2637,  0.1453,  0.1607,
         -0.0078,  0.0740]], grad_fn=<AddmmBackward0>)

___

## DataLoaders

In [20]:
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=64, shuffle=True)

In [21]:
len(train_loader), len(val_loader)

(251, 63)

___

# Training

In [37]:
class Trainer:
    
    def __init__(self, model, loaders, config):
        self.model = model
        self.train_loader, self.val_loader = loaders
        self.config = config
        
        self.loss_fn = nn.CrossEntropyLoss()
        
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config['lr'])
        
        self.train_losses = []
        self.val_losses = []
        
        self.train_accs = []
        self.val_accs = []
        
    def train_one_epoch(self):
        
        running_loss = 0
        running_acc = 0
        
        for x,y in self.train_loader:
            
            self.optimizer.zero_grad()
            
            preds = self.model(x)
            
            loss = self.loss_fn(preds, y)
            
            loss.backward()
            self.optimizer.step()
            
            running_loss += loss.item()
            
            pred_labels = torch.argmax(preds,dim=1)
            running_acc += accuracy_score(pred_labels, y)
        
        train_loss = running_loss / len(self.train_loader)
        train_acc = running_acc / len(self.train_loader)
        self.train_losses.append(train_loss)
        self.train_accs.append(train_acc)
        
    
    @torch.no_grad()
    def valid_one_epoch(self):
        
        running_loss = 0
        running_acc = 0
        
        for x,y in self.val_loader:
            
            preds = self.model(x)
            loss = self.loss_fn(preds, y)
            
            running_loss += loss.item()
            pred_labels = torch.argmax(preds,dim=1)
            running_acc += accuracy_score(pred_labels, y)
            
        val_loss = running_loss / len(self.val_loader)
        val_acc = running_acc / len(self.val_loader)
        self.val_losses.append(val_loss)
        self.val_accs.append(val_acc)
        
    
    def fit(self):
        
        for epoch in range(self.config['epochs']):
            
            self.model.train()

            self.train_one_epoch()
            
            self.model.eval()
            
            self.valid_one_epoch()
            
        
            print(f"\n\n{'-'*7}EPOCH: {epoch+1}/{self.config['epochs']}{'-'*7}")
            print(f"Train Loss: {self.train_losses[-1]} | Validation Loss: {self.val_losses[-1]}")
            print(f"Train Accuracy: {self.train_accs[-1]} | Validation Accuracy: {self.val_accs[-1]}")
            
            
    
    def predict(self, name):
        
        self.tk = Tokenizer()
        name = self.tk.tokenize_name(name)
        pad_zeros = torch.zeros(size=(19-name.size(0), self.tk.num_tokens))
        
        padded = torch.concat([name,pad_zeros])
        x = torch.unsqueeze(padded, dim=0)
        
        x = model(x)
        return x
        

In [38]:
config = {
    'lr': 1e-3,
    'epochs': 20
}

model = RNNModel()
trainer = Trainer(model, (train_loader, val_loader), config)

In [39]:
trainer.fit()



-------EPOCH: 1/20-------
Train Loss: 1.536788992197865 | Validation Loss: 1.2916508354837932
Train Accuracy: 0.555899284219056 | Validation Accuracy: 0.6277123438027693


-------EPOCH: 2/20-------
Train Loss: 1.1643616989314318 | Validation Loss: 1.1008577545483906
Train Accuracy: 0.6548424944290634 | Validation Accuracy: 0.6617116261398177


-------EPOCH: 3/20-------
Train Loss: 1.0384506870550938 | Validation Loss: 1.024940640207321
Train Accuracy: 0.6868036329259234 | Validation Accuracy: 0.6855211499493415


-------EPOCH: 4/20-------
Train Loss: 0.9646633573737278 | Validation Loss: 0.9646554021608262
Train Accuracy: 0.7026670858937132 | Validation Accuracy: 0.704122340425532


-------EPOCH: 5/20-------
Train Loss: 0.9137719259319077 | Validation Loss: 0.9396944216319493
Train Accuracy: 0.7171662924572895 | Validation Accuracy: 0.7095786896318812


-------EPOCH: 6/20-------
Train Loss: 0.8807445021264582 | Validation Loss: 0.9204899933603075
Train Accuracy: 0.725331723951651 | V

In [58]:
name = 'yu'
pred = torch.argmax(trainer.predict(name), dim=1)
labels = list(df['origin'].cat.categories)
print(f"{name} : {labels[pred]}")

name = 'nobita'
pred = torch.argmax(trainer.predict(name), dim=1)
labels = list(df['origin'].cat.categories)
print(f"{name} : {labels[pred]}")

name = 'adam'
pred = torch.argmax(trainer.predict(name), dim=1)
labels = list(df['origin'].cat.categories)
print(f"{name} : {labels[pred]}")

yu : Korean
nobita : Japanese
adam : English
