# Generate name dataset

In [1]:
#default_exp mllib.charrnn

Here we will create the dataset which can help us to import names from the text file for name generation techniques.

In [2]:
#export
from torch.utils.data import Dataset
import os
from typing import List, Dict
from sklearn.preprocessing import LabelEncoder
import torch
import re

from torch.utils.data import DataLoader, random_split

In [5]:
#export
# Reserved tokens for things like padding and EOS symbols.
PAD = "<pad>"
EOS = "<EOS>"
BOS = "<BOS>"
RESERVED_TOKENS = [PAD, EOS, BOS]
NUM_RESERVED_TOKENS = len(RESERVED_TOKENS)
PAD_ID = RESERVED_TOKENS.index(PAD)  # Normally 0
EOS_ID = RESERVED_TOKENS.index(EOS)  # Normally 1
BOS_ID = RESERVED_TOKENS.index(BOS)  # Normally 2


class NamesDataset(Dataset):
    def __init__(self, charset, file_lists=List[str], length=10 ):
        self.samples = []
        self.charset = charset + '\0'
        self.length = length
        self.char_codec = LabelEncoder()
        
        for file in file_lists:
            self.read_file(file)
            
        self._init_dataset()
    
    def _init_dataset(self):
        self.char_codec.fit(list(self.charset))
    
    def to_one_hot(self, codec, values):
        value_idxs = codec.transform(values)
        return torch.eye(len(codec.classes_))[value_idxs]
    
    def one_hot_sample(self, *args):
        # get arguments to convert to one_hot
        t_name = self.to_one_hot(self.char_codec, list(args[0]))
        return t_name
        
    def read_file(self, file_path:str):
        print(file_path)
        with open(file_path,'r') as name_file:
            for name in name_file.read().splitlines()[1:]:
                filtered_name = re.sub(r'\W+', '', name)
                if len(filtered_name) < self.length:
                    filtered_name += '\0' * (self.length - len(filtered_name))
                else:
                    filtered_name = filtered_name[:self.length-1] + '\0'
                self.samples.append(filtered_name.upper())
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx:int)-> str:
        name = self.samples[idx]
        print(name)
        return self.one_hot_sample(name)
    
def pad_collate(batch):
    """ Pads input and target to the same length """
    
    names = batch
    names_pad = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=PAD_ID)
    

In [6]:
#dataset = TESNamesDataset(data_root, charset, length)
charset = set([ c.upper() for c in string.ascii_letters ])
charset = ''.join(sorted(charset))
ds = NamesDataset(file_lists=['./data/first_names.all.txt'], charset = charset + "-' ")
trainset, valset = random_split(ds, [131566, 32892])

train_loader = DataLoader(trainset, batch_size=10, shuffle=True, num_workers=0)
val_loader = DataLoader(valset, batch_size=10, shuffle=True, num_workers=0)

./data/first_names.all.txt


In [7]:
next(iter(train_loader)).shape

NAVITA    
BRINGHURS 
KUNTAKINT 
TERRALL   
RIGNALL   
DONALYNN  
SHAWANDRA 
SUKINA    
ALLEENA   
SAWAIRA   


torch.Size([10, 10, 30])

# Convert it into Fastai Dataset

In general, when we have pytorch dataset like above, we can easily convert it into FastAI dataset by using `Transform` class.

In [10]:
from fastai.text.all import *

In [11]:
file_lists=['./data/first_names.all.txt']

In [12]:
names_list = []
with open(file_lists[0],'r') as file:
    for name in file.read().splitlines()[1:]:
        filtered_name = re.sub(r'\W+', '', name)
        names_list.append(filtered_name.upper())

In [13]:
src_tfms = [lambda x: ['xxbos'] + list(x), Numericalize()]
len_tfms = [lambda x: torch.tensor(len(x)+1, dtype=torch.int32)]
#tgt_tfms = [lambda x: list(x)[1:]]

dsrc = Datasets(names_list, tfms=[src_tfms, len_tfms], splits=RandomSplitter(valid_pct=0.2)(names_list))

In [14]:
len(dsrc.train), len(dsrc.valid)

(131567, 32891)

In [15]:
dsrc.decode(dsrc[5])

((#6) ['xxbos','A','A','B','A','N'], tensor(6, dtype=torch.int32))

In [16]:
@ItemTransform
def after_item(obj):
    return (obj[0][:-1], obj[0][1:], obj[1])

def pad_input_chunk_new(samples, n_inp=2,**kwargs):
    "Pad `samples` by adding padding by chunks of size `seq_len`"
    
    max_len = max([len(s[n]) for s in samples for n in range(n_inp)])
    padeds = [[pad_chunk(s[n],pad_len=max_len,**kwargs) for n in range(n_inp) ] for s in samples]
    
    return [(*p, *s[n_inp:]) for p, s in zip(padeds, samples)]

In [17]:
dls = dsrc.dataloaders(after_item=after_item, before_batch=pad_input_chunk_new, bs=4, n_inp=2,)

In [18]:
b = dls.one_batch()

In [19]:
new_b = (b[0],b[1])

## Hyperparameters

Let's record all hyperparamters found above

In [21]:
vocab_size = len(dsrc.vocab)
PAD_ID = 1
BOS_ID = 2
embed_size = 30

# Pytorch Lightning

In [22]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.utils import rnn
from torch.utils.data import DataLoader, Dataset
from dotmap import DotMap
from typing import Dict

import collections
import math

import numpy as np
import torch


In [23]:
class EncoderRNN(pl.LightningModule):
    def __init__(self, hidden_size, embed_size, embed):
        super(EncoderRNN, self).__init__()
        self.embeds = embed
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        
    def forward(self, inp, lengths):

        emb = self.embeds(inp)
        output, hidden = self.rnn(emb)
        return output, hidden

In [24]:
class DecoderRNN(pl.LightningModule):
    def __init__(self, embed, embed_size, hidden_size, output_size, max_len):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_len = max_len
        
        self.embeds = embed
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first = True)
        self.out = nn.Linear(hidden_size, output_size)
        
    def forward_step(self, input, hidden, encoder_output):
        del encoder_output # todo : use it for attention
        
        emb = self.embeds(input)
        o , h = self.rnn(emb, hidden)
        
        out = self.out(o)
        return F.log_softmax(out, -1), h
    
    def forward(self, enc_h, enc_out, tgt = None):
        """ Uses teacher enforcing, relies on tgt starting with BOS"""
        decoder_input = tgt
        # uses decoder input as teacher enforcing
        
        if tgt is None:        # inference
            # during test time, we generate all the decoder values
            batch_size = enc_h.size(0) if enc_h is not None else 1
            decoder_input = torch.LongTensor(
                [batch_size* [BOS_ID]]).view(batch_size, 1).to(enc_h.device)
        
        decoder_hidden = enc_h
            
        decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden, enc_out)
            
        return decoder_output, decoder_hidden
            

In [25]:
def acc_cm(preds, labels, nb_clss):
    """Calculates all confusion matrix based metrics."""
    acc = (labels == preds).float().mean()

    cm = torch.zeros((nb_clss, nb_clss), device=labels.device)
    for label, pred in zip(labels, preds):
        cm[label.long(), pred.long()] += 1

    tp = cm.diagonal()[1:].sum()
    fp = cm[:, 1:].sum() - tp
    fn = cm[1:, :].sum() - tp
    return (acc, tp, fp, fn)


In [26]:
class Seq2SeqLightningModule(pl.LightningModule):
    def __init__(self, hp:Dict ):
        super().__init__()
        self.hparams= hp
        
        # share embedding layer by encoder or decoder
        self.embed = nn.Embedding(hp.vocab_size, hp.embedding_size, padding_idx = PAD_ID)
        
        self.encoder = EncoderRNN(hp.hidden_size, hp.embedding_size, self.embed)
        self.decoder = DecoderRNN(self.embed, hp.embedding_size, hp.hidden_size, hp.vocab_size, hp.max_len)
        
        self.criterion = nn.NLLLoss(ignore_index = PAD_ID)
        
        
    def forward(self, src, lengths, tgt=None):
        encoder_output, encoder_hidden = self.encoder(src, lengths)
        outputs,hidden = self.decoder(encoder_hidden, encoder_output, tgt)
        
        return outputs
    
    def training_step(self, batch, batch_idx):
        src, tgt, lengths = batch

        
        output = self.forward(src, lengths, tgt)
        loss = self.criterion(output.view(-1,output.shape[2]), tgt.view(-1))
        #loss = self.criterion(output.data, tgt.data)    # both are packed
        self.log('train_loss', loss)
        
        return {'loss': loss}
    
    def validation_step(self, batch, batch_idx):
        src, tgt, lengths = batch

        
        output = self.forward(src, lengths, tgt)
        loss = self.criterion(output.view(-1,output.shape[2]), tgt.view(-1))
        
         # metrics
        preds = torch.argmax(output.data, dim=-1)
        # preds = elementwise_apply(torch.argmax, output, -1)
        (acc, tp, fp, fn) = acc_cm(preds, tgt.data, vocab_size)
        
        return {
            'val_loss': loss,
            'val_acc': acc,
            'tp': tp,
            'fp': fp,
            'fn': fn
        }
        preds_pad, _ = rnn.pad_packed_sequence(
            rnn.PackedSequence(preds, output.batch_sizes),
            batch_first=True,
            padding_value=text_encoder.PAD_ID)
        tgts_pad, _ = rnn.pad_packed_sequence(tgt,
                                              batch_first=True,
                                              padding_value=text_encoder.PAD_ID)

        bleu = metrics.compute_bleu(tgts_pad.tolist(), preds_pad.tolist())
        return {
            'val_loss': loss,
            'val_acc': acc,
            'tp': tp,
            'fp': fp,
            'fn': fn,
            'bleu': bleu
        }

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        return optimizer

In [117]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PAD_ID = 1

class RNN(pl.LightningModule):
    def __init__(self, hp:Dict):
        super(RNN, self).__init__()
        self.hparams = hp
        
        self.num_layers = hp.num_layers
        self.hidden_size = hp.embedding_size
        self.output_size = hp.vocab_size
        self.input_size = hp.vocab_size
        
        self.embedding = nn.Embedding(self.input_size, self.output_size)
        self.rnn = nn.LSTM(input_size = self.input_size, hidden_size=self.hidden_size, num_layers = self.num_layers)
        self.decoder = nn.Linear(self.hidden_size, self.output_size)
        
        self.criterion = nn.NLLLoss(ignore_index = PAD_ID)

        
    def forward(self, input_seq, hidden_state):

        
        embedding  = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding.unsqueeze(0), hidden_state)
        output = self.decoder(output)
    
        return F.log_softmax(output, -1), (hidden_state[0].detach(), hidden_state[1].detach())
    
    def init_hidden(self, batch_size):
        h = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        c = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        
        return h,c
    
    def training_step(self, batch, batch_idx):
        src, tgt, lengths = batch

        hidden_state = self.init_hidden(src.shape[0])
        loss = 0
        chunk_len = src.shape[1]
        
        for j in range(chunk_len):
            output, hidden_state = self.forward(src[:,j],hidden_state)
            output = output.reshape(output.shape[1]*output.shape[0],-1)
            
            loss += self.criterion(output, tgt[:,j])
        
        loss = loss / chunk_len
        
        self.log('train_loss', loss)
        return {'loss': loss}
    
    
    def validation_step(self, batch, batch_idx):
        src, tgt, lengths = batch

        hidden_state = self.init_hidden(src.shape[0])
        loss = 0
        chunk_len = src.shape[1]
        
        for j in range(chunk_len):
            output, hidden_state = self.forward(src[:,j],hidden_state)
            output = output.reshape(output.shape[1]*output.shape[0],-1)
            
            loss += self.criterion(output, tgt[:,j])
        
        loss = loss / chunk_len
        
        self.log('val_loss', loss)
        return {'val_loss': loss}
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        return optimizer
    
    def generate(self, initial_char = 'A', predict_len = 15, temperature=0.85):
        hidden, cell = self.init_hidden(batch_size = 1)
        initial_input = self.char2tensor(initial_char)
        predicted_str = initial_char
        
        for p in range(len(inital_char)-1):
            _, (hidden, cell) = self.rnn(initial_input[p].view(1).to(device), hidden, cell)
            
        last_char = initial_input[-1]
        
        for p in range(predict_len):
            output, (hidden , cell) = self.rnn(last_char.view(1).to(device), hidden, cell)
            output_dist = output.data.view(-1).div(temperature).exp()
            top_char = torch.multinomial(output_dist, 1)[0]
            
            # convert back to string
            predicted_char = all_chars[top_char]
            predicted += predicted_char
            last_char  = top_char
            
        return predicted

# Training Script

In [125]:
from pytorch_lightning.loggers import TensorBoardLogger

In [126]:
logger = TensorBoardLogger("tb_logs", name="my_model")

In [127]:
hparams = DotMap({'vocab_size': vocab_size, 
          'embedding_size': embed_size,
          'hidden_size': embed_size,
            'max_len': 15,
            'num_layers':1,
            'lr': 0.02})

In [128]:
#model = Seq2SeqLightningModule(hparams)
model = RNN(hparams)

In [129]:
trainer = pl.Trainer(fast_dev_run=False, logger=logger)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores


In [136]:
trainer.fit(model, dls.train, dls.valid)


  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 10.8 K
1 | rnn       | LSTM      | 16.3 K
2 | decoder   | Linear    | 3.2 K 
3 | criterion | NLLLoss   | 0     
----------------------------------------
30.4 K    Trainable params
0         Non-trainable params
30.4 K    Total params
0.121     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

RuntimeError: Caught RuntimeError in DataLoader worker process 3.
Original Traceback (most recent call last):
  File "/Users/puneet/.virtualenvs/torch/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/Users/puneet/.virtualenvs/torch/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 34, in fetch
    data = next(self.dataset_iter)
  File "/Users/puneet/Projects/fastai/fastai/data/load.py", line 120, in create_batches
    yield from map(self.do_batch, self.chunkify(res))
  File "/Users/puneet/Projects/fastai/fastai/data/load.py", line 146, in do_batch
    def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
  File "/Users/puneet/Projects/fastai/fastai/data/load.py", line 145, in create_batch
    def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
  File "/Users/puneet/Projects/fastai/fastai/data/load.py", line 50, in fa_collate
    else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
  File "/Users/puneet/Projects/fastai/fastai/data/load.py", line 50, in <listcomp>
    else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
  File "/Users/puneet/Projects/fastai/fastai/data/load.py", line 49, in fa_collate
    return (default_collate(t) if isinstance(b, _collate_types)
  File "/Users/puneet/.virtualenvs/torch/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 53, in default_collate
    storage = elem.storage()._new_shared(numel)
  File "/Users/puneet/.virtualenvs/torch/lib/python3.9/site-packages/torch/storage.py", line 135, in _new_shared
    return cls._new_using_filename(size)
RuntimeError: Shared memory manager connection has timed out at ../torch/lib/libshm/core.cpp:99


In [135]:
%load_ext tensorboard
%tensorboard --logdir tb_logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 7318), started 0:00:03 ago. (Use '!kill 7318' to kill it.)