In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.insert(0,'../')

In [None]:
from mllib.nlp.datasets.cmudict import CMUDict
from datasets import load_dataset
from mllib.nlp.seq2seq import Seq2Seq

In [None]:
# Locate the dataset loading script through the module that `CMUDict` was
# imported from, instead of a machine-specific absolute path, so the notebook
# runs wherever `mllib` is importable.
ds = load_dataset(sys.modules[CMUDict.__module__].__file__)

Reusing dataset cmu_dict (/root/.cache/huggingface/datasets/cmu_dict/cmu3/1.0.0/a0e598136ef9603a0d6d97059f5e1d2cac789cfe3c0998cb2b4b7fd4198da504)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Fix the shuffle seed so the 80/20 split is reproducible across kernel
# restarts and machines.
train_test = ds['train'].train_test_split(test_size=0.2, seed=42)

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/cmu_dict/cmu3/1.0.0/a0e598136ef9603a0d6d97059f5e1d2cac789cfe3c0998cb2b4b7fd4198da504/cache-b2d95f15faaf4500.arrow and /root/.cache/huggingface/datasets/cmu_dict/cmu3/1.0.0/a0e598136ef9603a0d6d97059f5e1d2cac789cfe3c0998cb2b4b7fd4198da504/cache-5ed2c4bc219f1934.arrow


In [None]:
train_test['train'][0]

{'word': 'preempted',
 'word_length': 9,
 'phoneme': ['P', 'R', 'IY0', 'EH1', 'M', 'P', 'T', 'IH0', 'D']}

# Data Processing

The data is processed with the dataset's `map` method, which is inspired by TensorFlow's `tf.data.Dataset.map`.

In [None]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



In [None]:
train_test['train']['word'][0], train_test['train']['phoneme'][0] 

('preempted', ['P', 'R', 'IY0', 'EH1', 'M', 'P', 'T', 'IH0', 'D'])

In [None]:
# Build both vocabularies from the training split only.
# Each 'phoneme' entry is a list of phoneme symbols, so the tokens are phonemes.
phoneme_vocab = build_vocab_from_iterator(train_test['train']['phoneme'])
# Each 'word' entry is a plain string, so this is presumably a character-level
# vocabulary (single letters are looked up individually further below).
word_vocab = build_vocab_from_iterator(train_test['train']['word'])

108124lines [00:00, 533153.22lines/s]
108124lines [00:00, 556501.02lines/s]


In [None]:
word_vocab.lookup_indices(['a','b','c'])

[3, 18, 11]

# Data Collator

In [None]:
BATCH_SIZE = 8

In [None]:
import numpy as np

def process_single_example(word_tokens, phoneme, word_length):
    """Numericalize one (word, phoneme) pair.

    This is the per-example hook: preprocessing beyond the vocabulary lookup
    can be added here.

    Args:
        word_tokens: Tokens of the source word, looked up in ``word_vocab``.
        phoneme: Target phoneme tokens, looked up in ``phoneme_vocab``.
        word_length: Length of the source word; passed through unchanged.

    Returns:
        A ``(src, trg, word_length)`` tuple where ``src`` and ``trg`` are the
        index sequences returned by the respective ``lookup_indices`` calls.
    """
    source_ids = word_vocab.lookup_indices(word_tokens)
    target_ids = phoneme_vocab.lookup_indices(phoneme)
    return (source_ids, target_ids, word_length)

def collate_batch(batch):
    """Numericalize a batch of raw examples (used with ``map(batched=True)``).

    Args:
        batch: Mapping of column name to a list of values. Must provide the
            ``'word'``, ``'phoneme'`` and ``'word_length'`` columns, aligned
            by position.

    Returns:
        Dict with ``'src'``, ``'trg'`` and ``'src_len'`` lists, holding one
        entry per input example in the original order.
    """
    examples = zip(batch['word'], batch['phoneme'], batch['word_length'])
    processed = [process_single_example(*example) for example in examples]

    # Each processed item is a (src, trg, src_len) triple; split the triples
    # back into one list per output column.
    return {
        'src': [src for src, _, _ in processed],
        'trg': [trg for _, trg, _ in processed],
        'src_len': [src_len for _, _, src_len in processed],
    }

In [None]:
# Numericalize the splits in chunks of BATCH_SIZE examples, drop the raw text
# columns, and have the result return PyTorch tensors when indexed.
ds_processed = train_test.map(
    collate_batch,
    batched=True,
    batch_size=BATCH_SIZE,
    remove_columns=['word', 'word_length', 'phoneme'],
).with_format('pytorch', output_all_columns=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/cmu_dict/cmu3/1.0.0/a0e598136ef9603a0d6d97059f5e1d2cac789cfe3c0998cb2b4b7fd4198da504/cache-9251eda5a2ed1528.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/cmu_dict/cmu3/1.0.0/a0e598136ef9603a0d6d97059f5e1d2cac789cfe3c0998cb2b4b7fd4198da504/cache-2af5bbd04ed3259a.arrow


In [None]:
ds_processed['train'][2]

{'src': tensor([18,  8,  9,  9,  5,  7, 16]),
 'trg': tensor([15, 19,  6, 10, 25]),
 'src_len': tensor(7)}

In [None]:
from torch.utils.data import DataLoader

In [None]:
def pad_collate(batch, padding_value=0):
    """Collate variable-length examples into padded batch tensors.

    Args:
        batch: List of examples, each a dict with 1-D tensors under ``'src'``
            and ``'trg'`` and the source length under ``'src_len'``.
        padding_value: Value written to positions past the end of each
            sequence. Defaults to ``0`` (the previous hard-coded behaviour).
            Bind another value with ``functools.partial`` when passing this
            function as a ``collate_fn``.

    Returns:
        Dict with ``'src'`` and ``'trg'`` padded to ``(batch, max_len)``
        tensors (``batch_first=True``) and ``'src_len'`` as the list of
        per-example lengths, in batch order.
    """
    # NOTE(review): confirm that index 0 is the intended padding id for both
    # vocabularies; if not, pass the correct id via ``padding_value``.
    def pad(sequences):
        return torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=padding_value)

    return {
        'src': pad([example['src'] for example in batch]),
        'trg': pad([example['trg'] for example in batch]),
        'src_len': [example['src_len'] for example in batch],
    }


In [None]:
# Shuffled training loader; `pad_collate` pads each batch to a common length.
# NOTE(review): batch_size=32 here is independent of BATCH_SIZE, which only
# sets the chunk size of the `map` preprocessing step — confirm this is intended.
dls = DataLoader(ds_processed['train'], shuffle=True, collate_fn=pad_collate, batch_size=32)

In [None]:
#next(iter(dls))

In [None]:
from transformers import DataCollatorWithPadding, default_data_collator

In [None]:
import random
# checking
def decode_word(lst):
    """Turn a sequence of ``word_vocab`` indices back into a single string."""
    tokens = (word_vocab.itos[index] for index in lst)
    return ''.join(tokens)

def decode_phoneme(lst):
    """Turn ``phoneme_vocab`` indices back into a comma-separated string."""
    tokens = (phoneme_vocab.itos[index] for index in lst)
    return ','.join(tokens)

# Spot-check a few random training examples by decoding them back to text.
indices = random.sample(range(10, 1000), 5)

for idx in indices:
    # Fetch the row once. Indexing the column first
    # (``ds_processed['train']['src'][idx]``) rebuilds the entire column on
    # every access, i.e. three full passes over the data per iteration.
    example = ds_processed['train'][idx]
    src = decode_word(example['src'])
    trg = decode_phoneme(example['trg'])
    src_len = example['src_len']
    print(src, trg, src_len)

spano S,P,AA1,N,OW0 tensor(5)
crampton K,R,AE1,M,P,T,AH0,N tensor(8)
dimples D,IH1,M,P,AH0,L,Z tensor(7)
modality M,AH0,D,AE1,L,AH0,T,IY0 tensor(8)
receptionists R,IY0,S,EH1,P,SH,AH0,N,IH0,S,T,S tensor(13)


# Model building

In [None]:
import pytorch_lightning as pl

In [None]:
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger


In [None]:
# Log the learning rate at every optimizer step and write to ./tb_logs/my_model.
lr_monitor = LearningRateMonitor(logging_interval='step')
logger = TensorBoardLogger('tb_logs', name='my_model')
# Use one GPU when CUDA is available and fall back to CPU otherwise, so the
# notebook also runs on machines without a GPU.
trainer = pl.Trainer(
    callbacks=[lr_monitor],
    max_epochs=1,
    gpus=1 if torch.cuda.is_available() else 0,
    logger=[logger],
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [None]:
# The two vocabulary sizes are passed to Seq2Seq as its first two arguments.
input_vocab_size = len(word_vocab)
output_vocab_size  = len(phoneme_vocab)

# NOTE(review): `p` is presumably the dropout probability — confirm against Seq2Seq.
model = Seq2Seq(input_vocab_size, output_vocab_size,p=0.1)

In [None]:
batch = next(iter(dls))

In [None]:
# Pass the dataloader positionally: the `train_dataloader` keyword was renamed
# to `train_dataloaders` in later PyTorch Lightning releases, whereas the
# positional form works with both.
trainer.fit(model, dls)


  | Name    | Type             | Params
---------------------------------------------
0 | _loss   | CrossEntropyLoss | 0     
1 | encoder | Encoder          | 14.0 K
2 | decoder | Decoder          | 17.3 K
---------------------------------------------
31.4 K    Trainable params
0         Non-trainable params
31.4 K    Total params
0.125     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(222)[0;36mtraining_step[0;34m()[0m
[0;32m    220 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    221 [0;31m        [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 222 [0;31m        [0msrc_seq[0m[0;34m,[0m [0mtrg_seq[0m[0;34m,[0m [0msrc_lengths[0m [0;34m=[0m [0mbatch[0m[0;34m[[0m[0;34m'src'[0m[0;34m][0m[0;34m,[0m[0mbatch[0m[0;34m[[0m[0;34m'trg'[0m[0;34m][0m[0;34m,[0m [0mbatch[0m[0;34m[[0m[0;34m'src_len'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    223 [0;31m        [0msrc_seq[0m [0;34m=[0m [0msrc_seq[0m[0;34m.[0m[0mtranspose[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    224 [0;31m        [0mtrg_seq[0m [0;34m=[0m [0mtrg_seq[0m[0;34m.[0m[0mtranspose[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0;36m1[0m[0;3

ipdb>  n


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(223)[0;36mtraining_step[0;34m()[0m
[0;32m    221 [0;31m        [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    222 [0;31m        [0msrc_seq[0m[0;34m,[0m [0mtrg_seq[0m[0;34m,[0m [0msrc_lengths[0m [0;34m=[0m [0mbatch[0m[0;34m[[0m[0;34m'src'[0m[0;34m][0m[0;34m,[0m[0mbatch[0m[0;34m[[0m[0;34m'trg'[0m[0;34m][0m[0;34m,[0m [0mbatch[0m[0;34m[[0m[0;34m'src_len'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 223 [0;31m        [0msrc_seq[0m [0;34m=[0m [0msrc_seq[0m[0;34m.[0m[0mtranspose[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    224 [0;31m        [0mtrg_seq[0m [0;34m=[0m [0mtrg_seq[0m[0;34m.[0m[0mtranspose[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    225 [0;31m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(224)[0;36mtraining_step[0;34m()[0m
[0;32m    222 [0;31m        [0msrc_seq[0m[0;34m,[0m [0mtrg_seq[0m[0;34m,[0m [0msrc_lengths[0m [0;34m=[0m [0mbatch[0m[0;34m[[0m[0;34m'src'[0m[0;34m][0m[0;34m,[0m[0mbatch[0m[0;34m[[0m[0;34m'trg'[0m[0;34m][0m[0;34m,[0m [0mbatch[0m[0;34m[[0m[0;34m'src_len'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    223 [0;31m        [0msrc_seq[0m [0;34m=[0m [0msrc_seq[0m[0;34m.[0m[0mtranspose[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 224 [0;31m        [0mtrg_seq[0m [0;34m=[0m [0mtrg_seq[0m[0;34m.[0m[0mtranspose[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    225 [0;31m[0;34m[0m[0m
[0m[0;32m    226 [0;31m        [0moutput[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0msrc_seq[0m[0;34m,[0

ipdb>  


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(226)[0;36mtraining_step[0;34m()[0m
[0;32m    224 [0;31m        [0mtrg_seq[0m [0;34m=[0m [0mtrg_seq[0m[0;34m.[0m[0mtranspose[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    225 [0;31m[0;34m[0m[0m
[0m[0;32m--> 226 [0;31m        [0moutput[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0msrc_seq[0m[0;34m,[0m [0msrc_lengths[0m[0;34m,[0m [0mtrg_seq[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    227 [0;31m[0;34m[0m[0m
[0m[0;32m    228 [0;31m        [0;31m# do not know if this is a problem, loss will be computed with sos token[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  b self.forward


Breakpoint 9 at /notebooks/dlnotebooks/mllib/nlp/seq2seq.py:160


ipdb>  c


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(167)[0;36mforward[0;34m()[0m
[0;32m    165 [0;31m[0;34m[0m[0m
[0m[0;32m    166 [0;31m[0;34m[0m[0m
[0m[0;32m--> 167 [0;31m        [0mbatch_size[0m [0;34m=[0m [0msource[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    168 [0;31m        [0mtarget_len[0m [0;34m=[0m [0mtarget[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(168)[0;36mforward[0;34m()[0m
[0;32m    166 [0;31m[0;34m[0m[0m
[0m[0;32m    167 [0;31m        [0mbatch_size[0m [0;34m=[0m [0msource[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 168 [0;31m        [0mtarget_len[0m [0;34m=[0m [0mtarget[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m[0;34m[0m[0m
[0m[0;32m    170 [0;31m        [0mtarget_vocab_size[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0moutput_dim[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(170)[0;36mforward[0;34m()[0m
[0;32m    168 [0;31m        [0mtarget_len[0m [0;34m=[0m [0mtarget[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m[0;34m[0m[0m
[0m[0;32m--> 170 [0;31m        [0mtarget_vocab_size[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0moutput_dim[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    171 [0;31m[0;34m[0m[0m
[0m[0;32m    172 [0;31m        [0moutputs[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0mzeros[0m[0;34m([0m[0mtarget_len[0m[0;34m,[0m [0mbatch_size[0m[0;34m,[0m [0mtarget_vocab_size[0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(172)[0;36mforward[0;34m()[0m
[0;32m    170 [0;31m        [0mtarget_vocab_size[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0moutput_dim[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    171 [0;31m[0;34m[0m[0m
[0m[0;32m--> 172 [0;31m        [0moutputs[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0mzeros[0m[0;34m([0m[0mtarget_len[0m[0;34m,[0m [0mbatch_size[0m[0;34m,[0m [0mtarget_vocab_size[0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    173 [0;31m[0;34m[0m[0m
[0m[0;32m    174 [0;31m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(176)[0;36mforward[0;34m()[0m
[0;32m    174 [0;31m[0;34m[0m[0m
[0m[0;32m    175 [0;31m[0;34m[0m[0m
[0m[0;32m--> 176 [0;31m        [0mhidden[0m[0;34m,[0m [0mcell[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mencoder[0m[0;34m([0m[0msource[0m[0;34m,[0m [0msource_len[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    177 [0;31m[0;34m[0m[0m
[0m[0;32m    178 [0;31m        [0;31m# mask = [batch_size, src len][0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(181)[0;36mforward[0;34m()[0m
[0;32m    179 [0;31m        [0;31m# without sos token at the beginning and eos token at the end[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    180 [0;31m[0;34m[0m[0m
[0m[0;32m--> 181 [0;31m        [0mx[0m [0;34m=[0m [0mtarget[0m[0;34m[[0m[0;36m0[0m[0;34m,[0m[0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    182 [0;31m[0;34m[0m[0m
[0m[0;32m    183 [0;31m        [0;32mfor[0m [0mt[0m [0;32min[0m [0mrange[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0mtarget_len[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(183)[0;36mforward[0;34m()[0m
[0;32m    181 [0;31m        [0mx[0m [0;34m=[0m [0mtarget[0m[0;34m[[0m[0;36m0[0m[0;34m,[0m[0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    182 [0;31m[0;34m[0m[0m
[0m[0;32m--> 183 [0;31m        [0;32mfor[0m [0mt[0m [0;32min[0m [0mrange[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0mtarget_len[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    184 [0;31m            [0moutput[0m[0;34m,[0m [0mhidden[0m[0;34m,[0m [0mcell[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mdecoder[0m[0;34m([0m[0mx[0m[0;34m,[0m [0mhidden[0m[0;34m,[0m [0mcell[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    185 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(184)[0;36mforward[0;34m()[0m
[0;32m    182 [0;31m[0;34m[0m[0m
[0m[0;32m    183 [0;31m        [0;32mfor[0m [0mt[0m [0;32min[0m [0mrange[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0mtarget_len[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 184 [0;31m            [0moutput[0m[0;34m,[0m [0mhidden[0m[0;34m,[0m [0mcell[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mdecoder[0m[0;34m([0m[0mx[0m[0;34m,[0m [0mhidden[0m[0;34m,[0m [0mcell[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    185 [0;31m[0;34m[0m[0m
[0m[0;32m    186 [0;31m            [0moutputs[0m[0;34m[[0m[0mt[0m[0;34m][0m [0;34m=[0m [0moutput[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


TypeError: linear(): argument 'input' (position 1) must be Tensor, not tuple
> [0;32m/notebooks/dlnotebooks/mllib/nlp/seq2seq.py[0m(184)[0;36mforward[0;34m()[0m
[0;32m    182 [0;31m[0;34m[0m[0m
[0m[0;32m    183 [0;31m        [0;32mfor[0m [0mt[0m [0;32min[0m [0mrange[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0mtarget_len[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 184 [0;31m            [0moutput[0m[0;34m,[0m [0mhidden[0m[0;34m,[0m [0mcell[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mdecoder[0m[0;34m([0m[0mx[0m[0;34m,[0m [0mhidden[0m[0;34m,[0m [0mcell[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    185 [0;31m[0;34m[0m[0m
[0m[0;32m    186 [0;31m            [0moutputs[0m[0;34m[[0m[0mt[0m[0;34m][0m [0;34m=[0m [0moutput[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  exit()


BdbQuit: 

In [None]:
!nvidia-smi

Fri Jan 21 23:27:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 4000     Off  | 00000000:00:05.0 Off |                  N/A |
| 30%   46C    P0    29W / 125W |   1044MiB /  7982MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces