# Generating names dataset

Here we build the names dataset, which is simply a list of names.

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [None]:
# Source file(s) with one name per line. Only the first entry is read below.
file_lists = ['/notebooks/nlp_deeplearning/charmodel/data/first_names.all.txt']

names_list = []
# Decode explicitly as UTF-8: the names contain non-ASCII characters, and the
# platform default encoding is not guaranteed to handle them.
with open(file_lists[0], 'r', encoding='utf-8') as file:
    # The first line is skipped (presumably a header row -- confirm against the file).
    for name in file.read().splitlines()[1:]:
        # Strip everything that is not a word character (spaces, punctuation, ...).
        filtered_name = re.sub(r'\W+', '', name)
        # Blank lines or punctuation-only lines collapse to '' and are not names.
        if filtered_name:
            names_list.append(filtered_name.upper())

In [None]:
# Preview the first five cleaned names as a quick sanity check.
names_list[:5]

['AISHA', 'AISHAH', 'AJAY', 'AAISHA', 'AAISHAH']

## Load data

In [None]:
from mllib.seq2seq.namegen import *
from dotmap import DotMap
from mllib.seq2seq.model import *
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers.neptune import NeptuneLogger
import pytorch_lightning as pl



In [None]:
# Build the dataset object from the cleaned names. `get_dataset` is not imported
# by name, so it presumably comes from one of the star imports above -- confirm.
dsrc = get_dataset(names_list)


# Modelling

In [None]:
# Hyperparameters handed to the model, wrapped in a DotMap so they can be read
# as attributes. The vocabulary size is taken from the dataset built above; the
# remaining values are fixed by hand.
hparams = DotMap(dict(
    vocab_size=len(dsrc.vocab),
    embedding_size=30,
    hidden_size=300,
    max_len=15,
    num_layers=2,
    lr=0.02,
))


# Training

In [None]:

# Read the Neptune API token from the environment instead of embedding it in
# the notebook: anything written here is visible to every reader of the file.
# NOTE(review): the token that was previously hard-coded in this cell should be
# treated as leaked and revoked in the Neptune account settings.
neptune_api_token = os.environ.get("NEPTUNE_API_TOKEN")
if not neptune_api_token:
    raise RuntimeError(
        "NEPTUNE_API_TOKEN is not set; export it before creating the Neptune logger."
    )

neptune_logger = NeptuneLogger(
    api_key=neptune_api_token,
    project_name="puneetgirdhar.in/charnn")

# TensorBoard logger configured with save directory "tb_logs" and name "my_model".
# NOTE(review): the Trainer in this notebook is only given neptune_logger --
# confirm whether this logger is still needed.
tensorboard_logger = TensorBoardLogger("tb_logs", name="my_model")

NeptuneLogger will work in online mode


In [None]:
# Dataloaders over the dataset, with batch size 32 and the first two elements of
# each item treated as inputs (n_inp=2). The per-item and per-batch hooks are
# not defined in this notebook; presumably they come from the star imports.
dls = dsrc.dataloaders(
    bs=32,
    n_inp=2,
    after_item=after_item,
    before_batch=pad_input_chunk_new,
)

# Make sure the serialising option is used to instantiate the model: the
# char-to-index mapping and the vocabulary are both passed as strings.
model = RNN(
    hparams,
    char2tensor=str(dict(dls.numericalize.o2i)),
    vocab=str(dls.numericalize.vocab),
)

# Stop training once val_loss has not improved for 5 validation rounds.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Keep the three checkpoints with the lowest val_loss under ./checkpoints.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=3,
    dirpath='./checkpoints',
    filename='{epoch}',
)

# NOTE(review): auto_lr_find points at 'learning_rate' while hparams defines
# 'lr', and no trainer.tune() call is visible in this notebook -- confirm the
# learning-rate finder has the intended effect.
trainer = pl.Trainer(
    gpus=1,
    logger=neptune_logger,
    fast_dev_run=False,
    auto_lr_find='learning_rate',
    callbacks=[early_stopping, checkpoint_callback],
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [None]:
# Train the model, passing dls.train and dls.valid as the training and
# validation dataloaders.
trainer.fit(model, dls.train, dls.valid)

https://app.neptune.ai/puneetgirdhar.in/charnn/e/CHAR-32



  | Name      | Type             | Params
-----------------------------------------------
0 | dropout   | Dropout          | 0     
1 | embedding | Embedding        | 3.1 K 
2 | rnn       | LSTM             | 1.1 M 
3 | decoder   | Linear           | 31.3 K
4 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.621     Total estimated model params size (MB)


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

{'A': 'AxxmajगÆÞŪÁदxxupDZŽÕBVग', 'B': 'BÊExxfakeÌशदनVAxxfakeÑदxxfakeKxxup', 'R': 'RxxfakeĪ', 'KAR': 'KARŞxxfakeĞLÈYÑÅनCEŘĻÈŪ', 'TE': 'TExxfakeलxxfakeमŻÝŐÏxxfakeगxxfakeSQOF', 'CHRI': 'CHRIÁŅŪलĢŻxxmajŽतĀĽयÂÓĪ'}


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AQUINDE', 'B': 'BÜALLAR', 'R': 'RÄAJAY', 'KAR': 'KARITZA', 'TE': 'TEGHER', 'CHRI': 'CHRIDKAL'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AIKA', 'B': 'BURNEY', 'R': 'RÚBURT', 'KAR': 'KARYOS', 'TE': 'TENVIER', 'CHRI': 'CHRINCER'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AÂ', 'B': 'BHUNI', 'R': 'RENVRA', 'KAR': 'KARIO', 'TE': 'TEJATO', 'CHRI': 'CHRISTIANNAH'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AVENTING', 'B': 'BINKAI', 'R': 'RUTHARD', 'KAR': 'KARIA', 'TE': 'TESIA', 'CHRI': 'CHRISHED'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'ANKA', 'B': 'BUWN', 'R': 'REETTA', 'KAR': 'KARAY', 'TE': 'TENDRA', 'CHRI': 'CHRISTENA'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AL', 'B': 'BURG', 'R': 'REDA', 'KAR': 'KARI', 'TE': 'TEOFOE', 'CHRI': 'CHRISHAH'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AŘJUNAY', 'B': 'BUSHANAN', 'R': 'RIEYANNA', 'KAR': 'KARI', 'TE': 'TERNISLAV', 'CHRI': 'CHRISTYNE'}


Experiencing connection interruptions. Reestablishing communication with Neptune.
Experiencing connection interruptions. Reestablishing communication with Neptune.


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AMARIE', 'B': 'BURN', 'R': 'RxxrepZIJA', 'KAR': 'KARO', 'TE': 'TERLINA', 'CHRI': 'CHRISTINE'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'A', 'B': 'BUKK', 'R': 'RËÐÐUR', 'KAR': 'KARRA', 'TE': 'TERRYANN', 'CHRI': 'CHRISHANA'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AYIA', 'B': 'BORGAJAY', 'R': 'RĪJANA', 'KAR': 'KARSHA', 'TE': 'TEYA', 'CHRI': 'CHRISTON'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AŁ', 'B': 'BJÖLN', 'R': 'RÚNKA', 'KAR': 'KARAGH', 'TE': 'TERIUS', 'CHRI': 'CHRISALDO'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'AZAVI', 'B': 'BOOL', 'R': 'RxxfakeĚDA', 'KAR': 'KARA', 'TE': 'TEUS', 'CHRI': 'CHRISTHARD'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'ATCHEANE', 'B': 'BEUGH', 'R': 'RNÚACUS', 'KAR': 'KARPHYLEE', 'TE': 'TERESLUCH', 'CHRI': 'CHRISTRUPH'}


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

{'A': 'ATCHUP', 'B': 'BOBIM', 'R': 'RÐÐPLÍDOSLAW', 'KAR': 'KARELA', 'TE': 'TERRALL', 'CHRI': 'CHRISTOPHER'}


# Evaluation

Now we can generate some names at random.

In [None]:
# Generate a name starting from the prefix "A".
# NOTE(review): `gen_name` is not part of the standard pytorch_lightning Trainer
# API and its definition is not visible in this notebook -- confirm which object
# provides it (possibly the model rather than the trainer) on a fresh kernel.
trainer.gen_name("A")

'AÓÁजÖÞXxxupxxfakexxwrepHĻÚĻÌÝ'

In [None]:
# Run the project's `train_from_scratch` helper on the same names and
# hyperparameters. It is not defined in this notebook (presumably it comes from
# the star imports -- confirm); the value displayed for this cell is a
# pytorch_lightning Trainer.
train_from_scratch(names_list, hparams)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


<pytorch_lightning.trainer.trainer.Trainer at 0x7f4464aafdf0>