# Generating names dataset

Here we build the names dataset: a plain Python list of first names, read from a text file, stripped of non-word characters and converted to upper case.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import re
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [None]:
file_lists=['/notebooks/nlp_deeplearning/charmodel/data/first_names.all.txt']

# Build the corpus: one cleaned, upper-cased name per line of the input file.
names_list = []
# Decode explicitly as UTF-8: the corpus contains non-ASCII names, so relying on
# the platform's default encoding can fail or silently mis-decode them.
with open(file_lists[0], 'r', encoding='utf-8') as file:
    # The first line is skipped (presumably a header row — TODO confirm).
    for name in file.read().splitlines()[1:]:
        # Strip every run of non-word characters (spaces, hyphens, punctuation).
        filtered_name = re.sub(r'\W+', '', name)
        # A blank or punctuation-only line collapses to '' here; skip it so the
        # dataset never receives an empty name.
        if filtered_name:
            names_list.append(filtered_name.upper())

In [None]:
# Sanity check: show the first five cleaned names.
names_list[:5]

['AISHA', 'AISHAH', 'AJAY', 'AAISHA', 'AAISHAH']

## Load data

In [None]:
import sys
# Put the local Seq2Seq project directory first on the import path.
# NOTE(review): presumably this is where the `mllib` package imported in the
# next cell lives — confirm. The absolute path is specific to this machine.
sys.path.insert(0,'/notebooks/Projects/Seq2Seq')

In [None]:
from mllib.seq2seq.namegen import *
from dotmap import DotMap
from mllib.seq2seq.model import *
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers.neptune import NeptuneLogger
import pytorch_lightning as pl



In [None]:
# Wrap the cleaned names in the project's dataset object. Later cells read
# `dsrc.vocab` and call `dsrc.dataloaders(...)` on it.
# NOTE(review): `get_dataset` presumably comes from one of the star imports above — confirm.
dsrc = get_dataset(names_list)


# Modelling

In [None]:
# Hyper-parameters handed to the RNN constructor in the training cell.
hparams = DotMap({
    'vocab_size': len(dsrc.vocab),  # taken from the dataset vocabulary
    'embedding_size': 30,           # width of each character embedding
    'hidden_size': 300,             # LSTM hidden state size
    'max_len': 15,                  # presumably a cap on name length — TODO confirm
    'num_layers': 2,                # number of stacked LSTM layers
    'lr': 0.02,                     # learning rate
})


# Training

In [None]:

# Experiment loggers.
#
# The Neptune API token is a secret and must not be stored in the notebook.
# With api_key=None the logger reads the token from the NEPTUNE_API_TOKEN
# environment variable, so export that variable before starting the kernel.
neptune_logger = NeptuneLogger(
    api_key=None,
    project_name="puneetgirdhar.in/charnn")

# NOTE(review): this logger is created but the Trainer below is only given
# `neptune_logger`; pass both as a list if TensorBoard output is wanted.
tensorboard_logger = TensorBoardLogger("tb_logs", name="my_model")

NeptuneLogger will work in online mode


In [None]:
# Train/validation dataloaders with batch size 32, using the `after_item` and
# `pad_input_chunk_new` hooks.
dls = dsrc.dataloaders(
    after_item=after_item,
    before_batch=pad_input_chunk_new,
    bs=32,
    n_inp=2,
)

# The character lookup table and the vocab are handed to the model as plain
# strings (the "serializing option") rather than as live objects.
char_to_index = str(dict(dls.numericalize.o2i))
vocab_repr = str(dls.numericalize.vocab)
model = RNN(hparams, char2tensor=char_to_index, vocab=vocab_repr)

# Keep the three checkpoints with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    dirpath='./checkpoints',
    filename='{epoch}',
    monitor='val_loss',
    mode='min',
    save_top_k=3,
)

# Stop once validation loss has not improved for 5 validation rounds.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# NOTE(review): auto_lr_find points at 'learning_rate' while hparams defines
# 'lr' — confirm which attribute RNN actually exposes.
trainer = pl.Trainer(
    fast_dev_run=False,
    logger=neptune_logger,
    auto_lr_find='learning_rate',
    gpus=1,
    callbacks=[early_stopping, checkpoint_callback],
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [None]:
# Train on the training dataloader and validate on the validation dataloader.
trainer.fit(model, dls.train, dls.valid)

https://app.neptune.ai/puneetgirdhar.in/charnn/e/CHAR-33



  | Name      | Type             | Params
-----------------------------------------------
0 | dropout   | Dropout          | 0     
1 | embedding | Embedding        | 3.1 K 
2 | rnn       | LSTM             | 1.1 M 
3 | decoder   | Linear           | 31.3 K
4 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.621     Total estimated model params size (MB)


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

{'A': 'AÎहÜÍxxrepxxfakeĢĲहYPशFसÐ', 'B': 'BxxfakeĚŻĖVxxfldŐĂŅĞSČÔÃxxunk', 'R': 'RPŻĲWxxbosZŤÏÔRJYयCF', 'KAR': 'KARxxrepSŁĖZÁÖŽVĂÏxxupÜरŇ', 'TE': 'TEĲČÊŽĖŐÊVAĪÈÐश', 'CHRI': 'CHRIJŤŪOxxrepHŞŤAĂबÜxxunkÌÚ'}


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AĚYAH', 'B': 'BBE', 'R': 'REMIE', 'KAR': 'KARLYN', 'TE': 'TEYON', 'CHRI': 'CHRILKEB'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AरN', 'B': 'BEAGAN', 'R': 'RÁTZ', 'KAR': 'KARTO', 'TE': 'TEF', 'CHRI': 'CHRINER'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AHER', 'B': 'BAXI', 'R': 'RCPEE', 'KAR': 'KARYA', 'TE': 'TEDA', 'CHRI': 'CHRISTOFER'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'ADHIR', 'B': 'BAÞĢ', 'R': 'RŢA', 'KAR': 'KAR', 'TE': 'TEGEEN', 'CHRI': 'CHRISEPHER'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AÁS', 'B': 'BYDEEN', 'R': 'RČH', 'KAR': 'KARI', 'TE': 'TEĖ', 'CHRI': 'CHRISHUA'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'ALJT', 'B': 'BYTLAUGH', 'R': 'RŻ', 'KAR': 'KARINE', 'TE': 'TESSOPHE', 'CHRI': 'CHRISPOTH'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AMIRA', 'B': 'BOX', 'R': 'RVÂET', 'KAR': 'KARRAH', 'TE': 'TE', 'CHRI': 'CHRISTINNA'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AMBER', 'B': 'BRITTA', 'R': 'RŽ', 'KAR': 'KARO', 'TE': 'TELLI', 'CHRI': 'CHRISTOFORO'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AZIN', 'B': 'BERCIO', 'R': 'RŞIM', 'KAR': 'KAR', 'TE': 'TEISHA', 'CHRI': 'CHRISTOPHAROUS'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'ABRĪELLE', 'B': 'BUSH', 'R': 'RŢĞÂURG', 'KAR': 'KARTIM', 'TE': 'TEBORY', 'CHRI': 'CHRISTOPHER'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AÕ', 'B': 'BOÐLR', 'R': 'RĀQAR', 'KAR': 'KARYS', 'TE': 'TE', 'CHRI': 'CHRISTIANA'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AĢIGASIALO', 'B': 'BĢEWRAN', 'R': 'RŢĞERG', 'KAR': 'KARBJÖRG', 'TE': 'TE', 'CHRI': 'CHRISPUNDER'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AYSHU', 'B': 'BEK', 'R': 'RNTĞÐUR', 'KAR': 'KARLEYSA', 'TE': 'TEFACHO', 'CHRI': 'CHRISTIANA'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AŞ', 'B': 'BYGALL', 'R': 'RŢĞRÍÐUR', 'KAR': 'KARPA', 'TE': 'TE', 'CHRI': 'CHRISSE'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AमFRYD', 'B': 'BBÂKA', 'R': 'RÚDS', 'KAR': 'KARLO', 'TE': 'TESWARAN', 'CHRI': 'CHRISTIANJAMES'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'A', 'B': 'BJÖRN', 'R': 'RलD', 'KAR': 'KARSHAN', 'TE': 'TERIE', 'CHRI': 'CHRISTALIN'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AHN', 'B': 'BJÖRT', 'R': 'RÓD', 'KAR': 'KARLISHA', 'TE': 'TERMAN', 'CHRI': 'CHRISTOPHER'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AULAH', 'B': 'BUDUR', 'R': 'RĽ', 'KAR': 'KARLESA', 'TE': 'TEEW', 'CHRI': 'CHRISTION'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AHN', 'B': 'BJÖRG', 'R': 'RÚE', 'KAR': 'KARLOSARA', 'TE': 'TELAN', 'CHRI': 'CHRISTYN'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AŪD', 'B': 'BYLEE', 'R': 'RÚDŢY', 'KAR': 'KARIM', 'TE': 'TEKO', 'CHRI': 'CHRISTOPHERJOSE'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'ANA', 'B': 'BYE', 'R': 'RKŞWELLER', 'KAR': 'KARYA', 'TE': 'TEFORD', 'CHRI': 'CHRISTOPHER'}



1

# Evaluation

Now we can load the first-name model and generate some names from a given prefix.

In [None]:
# Obtain the first-name model used for generation.
# NOTE(review): presumably this loads trained weights from a checkpoint — confirm in the project module.
md = get_first_name_model()

In [None]:
# Move the model to the GPU (requires a CUDA device); the cell output is the module summary.
md.cuda()

RNN(
  (dropout): Dropout(p=0.2, inplace=False)
  (embedding): Embedding(104, 30, scale_grad_by_freq=True)
  (rnn): LSTM(30, 300, num_layers=2, batch_first=True, dropout=0.2)
  (decoder): Linear(in_features=300, out_features=104, bias=True)
  (criterion): CrossEntropyLoss()
)

In [None]:
# Ask the model to complete the prefix "CHRIS" into a full name.
md.generate("CHRIS")

'CHRISTIE'