In [2]:
# Fetch only the master branch of the CharNMT repository into ./CharNMT.
!git clone --single-branch --branch master https://github.com/DivyaRathod3D/CharNMT.git

Cloning into 'CharNMT'...
remote: Enumerating objects: 258, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 258 (delta 0), reused 6 (delta 0), pack-reused 250[K
Receiving objects: 100% (258/258), 716.33 MiB | 37.92 MiB/s, done.
Resolving deltas: 100% (125/125), done.
Updating files: 100% (81/81), done.


In [2]:
cd CharNMT

/kaggle/working/CharNMT


In [3]:
# from IPython.display import FileLink

In [4]:
# FileLink(r"CharNMT/checkpoint1/transformer_hi_en_char_wmt")

In [5]:
# !mv CharNMT/checkpoint CharNMT/checkpoint1

In [6]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [7]:
class CharDataset(Dataset):
    '''
    Character-level dataset over aligned (source, target) sentence pairs.

    Item ``idx`` is a pair of ``torch.long`` tensors built from ``x[idx]`` and
    ``y[idx]``:

    * source: the characters of ``x[idx]`` followed by ``<eos>``, then padded with
      ``<pad>`` (or truncated, keeping a final ``<eos>``) to exactly
      ``sequence_len`` entries;
    * target: ``<sos>`` followed by ``y[idx]`` encoded the same way, i.e.
      ``sequence_len + 1`` entries.

    The vocabulary is ``<pad>`` (index 0), ``<sos>`` (1), ``<eos>`` (2) followed by
    every distinct character of ``x`` and ``y`` in sorted order.
    '''

    def __init__(self, x, y, sequence_len, encoder=None):
        '''
        Args:
            x: sequence of source sentences (strings).
            y: sequence of target sentences, aligned one-to-one with ``x``.
            sequence_len: fixed length every encoded sentence is padded or
                truncated to (the target gets one extra leading ``<sos>``).
            encoder: stored on the instance unchanged; not used by this class.

        Raises:
            ValueError: if ``x`` and ``y`` do not contain the same number of
                sentences.
        '''
        from itertools import chain

        # __len__ and __getitem__ both rely on x and y being aligned.
        if len(x) != len(y):
            raise ValueError(
                'x and y must have the same number of sentences, got %d and %d'
                % (len(x), len(y))
            )

        # Collect the character set and total size in a single pass instead of
        # materialising one huge concatenated string of the whole corpus.
        charset = set()
        data_size = 0
        for sentence in chain(x, y):
            charset.update(sentence)
            data_size += len(sentence)

        # Special tokens come first so that <pad> is index 0.
        chars = ['<pad>', '<sos>', '<eos>'] + sorted(charset)
        vocab_size = len(chars)

        print('data has %d characters, %d unique chars, %d sentences.' % (data_size, len(chars), len(x)))
        print('sentence length nine_nine_percentile: %d' % (sequence_len))

        self.x, self.y = x, y
        self.ch2i = {ch: i for i, ch in enumerate(chars)}
        self.i2ch = {i: ch for i, ch in enumerate(chars)}
        self.vocab_size = vocab_size
        self.sequence_len = sequence_len
        self.encoder = encoder

    def __len__(self):
        '''Number of sentence pairs.'''
        return len(self.x)

    def __getitem__(self, idx):
        '''Return the encoded ``(source, target)`` tensor pair for pair ``idx``.'''
        eos = self.ch2i['<eos>']

        indx = self.padding([self.ch2i[ch] for ch in self.x[idx]] + [eos])
        # The target carries a leading <sos>, so it is one entry longer than the source.
        indy = [self.ch2i['<sos>']] + self.padding([self.ch2i[ch] for ch in self.y[idx]] + [eos])

        x = torch.tensor(indx, dtype=torch.long)
        y = torch.tensor(indy, dtype=torch.long)

        return x, y

    def padding(self, string):
        '''
        Fit a list of token indices to exactly ``sequence_len`` entries.

        Shorter lists are right-padded with the ``<pad>`` index; lists of
        ``sequence_len`` or more entries are cut to ``sequence_len - 1`` and
        closed with ``<eos>``.
        '''
        if len(string) < self.sequence_len:
            return string + [self.ch2i['<pad>']] * (self.sequence_len - len(string))
        return string[:self.sequence_len - 1] + [self.ch2i['<eos>']]

In [8]:
# Passed as max_length to pre_processing and as sequence_len to CharDataset.
sequence_len = 128
# Passed as min_length to pre_processing.
min_len = 0

In [9]:
# Process the raw parallel corpus and cache the cleaned sentence lists on disk.
import os

from utils.pre_processing import *
from utils.utils import *
en, ml = list(), list()
# paths = ["data/pib/","data/hinden/","data/indic/"]
paths = ["data/WAT/ml-en/"]
for path in paths:
    # Join directory and file name explicitly: plain string concatenation
    # silently produced "data/WAT/ml-entrain.en" when the directory entry had
    # no trailing slash. The `with` blocks close each file once it is read.
    with open(os.path.join(path, "train.en"), encoding='utf-8') as en_file:
        x = en_file.read().split("\n")
    with open(os.path.join(path, "train.ml"), encoding='utf-8') as ml_file:
        y = ml_file.read().split("\n")
    # Filter sentence pairs using the min_len / sequence_len length bounds.
    x, y = pre_processing(x, y, min_length=min_len, max_length=sequence_len)
    en += x
    ml += y

path = "data/cleaned/"
# Make sure the cache directory exists before writing into it.
os.makedirs(path, exist_ok=True)
pickle(path + "en", en)
pickle(path + "ml", ml)


342265
Some last sentences
of the garden | बगीचे का
in the centre of the garden | बगीचे के बीच में
on the elevated area situated in the centre of the garden | उद्यान के केन्द्र में स्थित ऊंचे स्थान पर
in the pond which is on the elevated area situated in the centre of the garden | उस तालाब में जो उद्यान के केन्द्र में स्थित ऊंचाई पर स्थित है


True

In [10]:
# Load saved data
from utils.utils import *
from utils.pre_processing import *

path = "data/cleaned/"
en, ml = pickle(path + "en"), pickle(path + "ml")
# Re-apply the length bounds to the cached sentences.
en, ml = pre_processing(en, ml, min_length=min_len, max_length=sequence_len)
# Show the last four sentence pairs, final pair first.
for offset in range(1, 5):
    print(en[-offset], '|', ml[-offset])

293652
Some last sentences
of the garden | बगीचे का
in the centre of the garden | बगीचे के बीच में
on the elevated area situated in the centre of the garden | उद्यान के केन्द्र में स्थित ऊंचे स्थान पर
in the pond which is on the elevated area situated in the centre of the garden | उस तालाब में जो उद्यान के केन्द्र में स्थित ऊंचाई पर स्थित है
of the garden | बगीचे का
in the centre of the garden | बगीचे के बीच में
on the elevated area situated in the centre of the garden | उद्यान के केन्द्र में स्थित ऊंचे स्थान पर
in the pond which is on the elevated area situated in the centre of the garden | उस तालाब में जो उद्यान के केन्द्र में स्थित ऊंचाई पर स्थित है


In [11]:
# Malayalam sentences are the model input (x); English sentences are the target (y).
dataset = CharDataset(x=ml, y=en, sequence_len=sequence_len)

# Preview the first encoded pair and the character-to-index table.
first_pair = next(iter(dataset))
print('sample tensors ', first_pair)
print("vocab: ", dataset.ch2i)

data has 20894274 characters, 162 unique chars, 293652 sentences.
sentence length nine_nine_percentile: 128
sample tensors  (tensor([121, 117, 127, 113, 129,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]), tensor([ 1, 37, 53, 46, 63, 46, 46, 47, 54,  2,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  

In [12]:
from model.encode_decode_transformer import Transformer, TransformerConfig
from utils.trainer import Trainer, TrainerConfig
# Training-loop settings.
tconfig = TrainerConfig(
    max_epochs=5,
    batch_size=16,
    learning_rate=6e-4,
    grad_norm_clip=1.0,
    device='cuda',
    lr_decay=True,
    warmup_tokens=5000,
    ckpt_n_print_iter=4000,
    ckpt_path='checkpoint/transformer_hi_en_char_wmt',
)

# Model settings; vocabulary size and sequence length come from the dataset,
# and the device is shared with the trainer.
mconfig = TransformerConfig(
    vocab_size=dataset.vocab_size,
    sequence_len=dataset.sequence_len,
    embed_dim=256,
    n_block=8,
    n_head=8,
    device=tconfig.device,
)

In [13]:
# Instantiate the encoder-decoder Transformer from the model configuration.
model = Transformer(mconfig)

In [14]:
sentences = ["മദ്യപിച്ചു വാഹനമോടിക്കുക.. , അമിത വേഗം പോരാത്തതിന് പണക്കാരനും.. ഇത് മറ്റൊരു ആകസ്‌മികമായ സംഭവമാണെന്ന് തോന്നുന്നുണ്ടോ?",
           "ജപ്പാന്‍റെ മഞ്ചൂരിയന്‍ അധിനിവേശം.. സര്‍വരാജ്യ സംഖ്യം തെറ്റായാണ് വിധിച്ചിരിക്കുന്നത്... അതുപോലെ തന്നെ ഭൂമിയിലെ സംസ്കാരം നിലവിലുള്ള എല്ലാ രാജ്യങ്ങളും.",
           "ഒരു 100 ഹോട്ടലുകളിൽ നിന്നും ഭക്ഷണം കഴിച്ചാലും... ..15 വർഷം ഞാൻ കഴിച്ച ഭക്ഷണത്തിന്റെ രുചി മറക്കില്ല.",
           "നിങ്ങള്‍ ഭയപ്പെടുന്നുണ്ടോ? അതായത് നിങ്ങളീ നാടകം കളിക്കുന്നത്... നിങ്ങളൊരു പരാജിതനായ സൂപ്പര്‍ഹീറോ ആണെന്നത് മറയ്ക്കാനാണെന്ന് ജനങ്ങള്‍ പറയുമെന്ന്?",
           "അല്ല, അല്ല എന്ന്‍ തന്നെ. അതേ ഞാനെന്തിന് ഇവിടെ വന്നു എന്ന്‍ ഒരു പിടിയുമില്ല."
            ]
# The raw sample sentences are handed to the trainer as test_dataset.
# NOTE(review): the translations printed during training presumably come from these — confirm in utils.trainer.
trainer = Trainer(model, dataset, tconfig, test_dataset=sentences, collate=None)

In [15]:
# Load pre-trained weights from the checkpoint path configured on tconfig.
# NOTE(review): utils.utils.pickle is called with only a path here, presumably
# reading a pickled state dict — unpickling can execute arbitrary code, so load
# only checkpoints from a trusted source.
from utils.utils import pickle
model.load_state_dict(pickle(tconfig.ckpt_path)) # load


<All keys matched successfully>

In [16]:
# Run training with the settings held in tconfig.
trainer.train()

epoch: 1 | train loss: 900.02839  | lr: 1.920000e-06:   0%|          | 0/18353 [00:00<?, ?it/s]

['I would like to meet him.', "It's the same thing that I need it.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this plan.']


epoch: 1 | train loss: 0.33862  | lr: 5.402125e-04:  22%|██▏       | 3999/18353 [06:15<21:03, 11.36it/s]  

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.31074  | lr: 3.689517e-04:  44%|████▎     | 8000/18353 [12:29<15:15, 11.31it/s]  

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.29991  | lr: 1.655665e-04:  65%|██████▌   | 11999/18353 [18:43<09:18, 11.37it/s] 

['And I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.34074  | lr: 6.000000e-05:  87%|████████▋ | 16003/18353 [25:02<23:15,  1.68it/s]  

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.31457  | lr: 6.000000e-05: 100%|██████████| 18353/18353 [28:32<00:00, 10.72it/s]
epoch: 2 | train loss: 0.31245  | lr: 6.000000e-05:   0%|          | 3/18353 [00:04<6:19:27,  1.24s/it] 

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 2 | train loss: 0.31873  | lr: 6.998531e-05:  22%|██▏       | 4002/18353 [06:07<2:54:28,  1.37it/s]

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 2 | train loss: 0.29956  | lr: 2.470311e-04:  44%|████▎     | 8000/18353 [12:05<15:09, 11.38it/s]  

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 2 | train loss: 0.31483  | lr: 4.489129e-04:  65%|██████▌   | 12003/18353 [18:23<1:07:18,  1.57it/s]

["I'd like to meet him.", "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 2 | train loss: 0.38715  | lr: 5.813135e-04:  87%|████████▋ | 16002/18353 [24:25<30:14,  1.30it/s]  

["I'd like to meet it.", "It's the only thing I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 2 | train loss: 0.35707  | lr: 5.995559e-04: 100%|██████████| 18353/18353 [27:56<00:00, 10.95it/s]
epoch: 3 | train loss: 0.35231  | lr: 5.995473e-04:   0%|          | 3/18353 [00:04<6:00:40,  1.18s/it] 

["I'd like to meet him.", "It's the same thing that I need it.", "Don't worry about.", 'He borrowed me two books.', 'I agree with this plan.']


epoch: 3 | train loss: 0.36041  | lr: 5.191985e-04:  22%|██▏       | 4003/18353 [06:06<2:12:01,  1.81it/s]

['And I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this planning.']


epoch: 3 | train loss: 0.33413  | lr: 3.367254e-04:  44%|████▎     | 8003/18353 [12:09<1:39:34,  1.73it/s]

['I would like to meet him.', "It's the only thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 3 | train loss: 0.30996  | lr: 1.371860e-04:  65%|██████▌   | 12002/18353 [18:11<1:00:50,  1.74it/s]

['And I would like to meet him.', "It's the only thing I need.", "Don't worry about.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 3 | train loss: 0.30096  | lr: 6.000000e-05:  87%|████████▋ | 16003/18353 [24:13<21:46,  1.80it/s]  

['And I would like to meet him.', "It's the only thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 3 | train loss: 0.30311  | lr: 6.000000e-05: 100%|██████████| 18353/18353 [27:43<00:00, 11.03it/s]
epoch: 4 | train loss: 0.28722  | lr: 6.000000e-05:   0%|          | 0/18353 [00:00<?, ?it/s]

['And I would like to meet him.', "It's the only thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 4 | train loss: 0.28429  | lr: 9.219139e-05:  22%|██▏       | 3999/18353 [06:12<24:06,  9.92it/s]

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 4 | train loss: 0.28613  | lr: 2.795239e-04:  44%|████▎     | 8002/18353 [12:29<2:21:55,  1.22it/s]

["I'd like to meet him.", "It's the same thing that I need.", "Don't worry my worry.", 'He borrowed me two books.', 'I agree with this plan.']


epoch: 4 | train loss: 0.31592  | lr: 4.763208e-04:  65%|██████▌   | 12003/18353 [18:33<1:01:37,  1.72it/s]

["I'd like to meet it.", "It's the same thing that I need.", "Don't worry me.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 4 | train loss: 0.34735  | lr: 5.909702e-04:  87%|████████▋ | 16002/18353 [24:36<34:24,  1.14it/s]  

['And I would like to meet him.', 'This is the only thing that I need.', "Don't do my worries.", 'He borrowed me two books.', 'I agree with this plan.']


epoch: 4 | train loss: 0.37629  | lr: 5.960109e-04: 100%|██████████| 18353/18353 [28:06<00:00, 10.88it/s]
epoch: 5 | train loss: 0.36838  | lr: 5.959854e-04:   0%|          | 3/18353 [00:04<6:23:12,  1.25s/it] 

['I would like to meet him.', "It's the only thing that I need.", 'Do not worry my worry.', 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 5 | train loss: 0.35437  | lr: 4.956530e-04:  22%|██▏       | 4003/18353 [06:07<2:14:33,  1.78it/s]

['And I would like to meet him.', "It's the same thing that I need.", "Don't worry my worries.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 5 | train loss: 0.31592  | lr: 3.041662e-04:  44%|████▎     | 8002/18353 [12:09<1:55:18,  1.50it/s]

["And I'd like to meet him.", "It's the same thing that I need.", "Don't worry my thinking.", 'He borrowed me two books.', 'I agree with this plan.']


epoch: 5 | train loss: 0.31949  | lr: 1.107384e-04:  65%|██████▌   | 12003/18353 [18:12<53:38,  1.97it/s]  

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this plan.']


epoch: 5 | train loss: 0.29227  | lr: 6.000000e-05:  87%|████████▋ | 16002/18353 [24:19<32:08,  1.22it/s]

['I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this plan.']


epoch: 5 | train loss: 0.33712  | lr: 6.000000e-05: 100%|██████████| 18353/18353 [27:53<00:00, 10.97it/s]


In [17]:
samples = ["എവിടെയാണ് തുന്നൽക്കാരൻ കളികൾ വീണ്ടും കൂടു കൂട്ടുന്നത് മീവൽ പക്ഷികൾ മഴയ്ക്കു വേണ്ടി കേഴുന്നത് ?",
           "നിങ്ങള്‍ എനിക്ക് വേണ്ടി പണിയെടുക്കാന്‍ തയ്യാറല്ലെങ്കില്‍, അധികവും പുറത്ത് പോകണം, അപ്പൊ ഇപ്പോള്‍ തന്നെ ഉപേക്ഷിക്കണം.",
           "നി നുണ പറയുന്നതാണെന്ന് ഞാൻ കണ്ടെത്തിയാൽ, നിന്റെയീ രോമമില്ലാത്ത കിടുക്കമാണി .. ...ഞാൻ ചെത്തിക്കളയും ഇതിന്റെ ഉപയോഗം നിനക്കറിയുന്നതിന് മുൻപ്.",
           "പക്ഷേ, ഈ മേഘങ്ങളിലെ എല്ലാ ജലവും, പര്‍വ്വതത്തിന്‍റെ തെക്ക് വശത്ത്‌ തന്നെ പെയ്തു തീരും.",
           "ഏതു വര്‍ഷം എന്നെനിക്ക് കൃത്യമായി പറയാന്‍ സാധിക്കയില്ല... കാരണം അതിനെ കുറിച്ച് ഒന്നും ഞങ്ങള്‍ക്കറിയില്ല.",
          ]
# Translate the hand-picked samples and display the generated sentences.
result = model.generate_output(
    samples,
    dataset,
    top_k=5,
    print_process=True,
)
print(result)

100%|██████████| 6/6 [00:06<00:00,  1.13s/it]

['In the meantime, three people appear on a bike.', 'Why are you not answering?', 'These books are me.', 'Thank you for your help.', 'He was much more than a king.', 'i saw a wonderful dream today']





In [18]:
# benchmarking using bleu score
path = "data/WAT/ml-en/"
# Read the test split with context managers so both file handles are closed.
with open(path + "test.en.txt", encoding='utf-8') as ref_file:
    en = ref_file.read().split("\n")
# NOTE: `hi` holds the source-side test sentences read from test.ml.txt.
with open(path + "test.ml.txt", encoding='utf-8') as src_file:
    hi = src_file.read().split("\n")
# Filter sentence pairs using the min_len / sequence_len length bounds.
en, hi = pre_processing(en, hi, min_length=min_len, max_length=sequence_len)

# Generate one output per source sentence; `en` is kept as the reference set
# for the BLEU computation in the next cell.
result = model.generate_output(hi, dataset, top_k=5, print_process=True)

2507
Some last sentences
32 chartered planes have been booked to ferry the guests to and fro. | मेहमानों को लानेले जाने के लिए 32 चार्टर्ड विमानों की व्यवस्था की गई है।
250 VIPs have been invited to this royal party. | इस शाही पार्टी के लिए 250 वीवीआईपी लोगों को भी आमंत्रित किया गया है।
It is noteworthy that both Nita and Isha are professional, classical dancers. | गौरतलब है कि नीता और ईशा दोनों ही प्रोफेशनल क्लासिकल डांसर है।
It is being said that Nita Ambani and her daughter Isha may also perform in this. | बताया जा रहा है कि नीता अंबानी और उनकी बेटी ईशा भी इसमें अपनी प्रस्तुती दे सकती है।


100%|██████████| 1632/1632 [27:33<00:00,  1.01s/it]


In [19]:
from utils.utils import *
# Score the generated sentences against the reference sentences in `en`.
score, references, candidates = bleu_score(en, result)
print(score)

17.323974017161312
