In [3]:
!git clone --single-branch --branch master https://github.com/DivyaRathod3D/CharNMT.git

Cloning into 'CharNMT'...
remote: Enumerating objects: 221, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 221 (delta 24), reused 61 (delta 24), pack-reused 160[K
Receiving objects: 100% (221/221), 555.26 MiB | 36.37 MiB/s, done.
Resolving deltas: 100% (105/105), done.
Updating files: 100% (75/75), done.


In [4]:
cd CharNMT

/kaggle/working/CharNMT


In [5]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [6]:
class CharDataset(Dataset):
    '''
    Character-level parallel corpus for sequence-to-sequence training.

    Each item is a pair ``(source, target)`` of ``torch.long`` index tensors
    built from ``x[idx]`` and ``y[idx]``:

    * source: the sentence's characters followed by <eos>, then padded with
      <pad> (or clipped) to exactly ``sequence_len`` entries;
    * target: <sos> followed by the sentence encoded the same way, i.e.
      ``sequence_len + 1`` entries.

    The vocabulary is <pad>, <sos>, <eos> (indices 0, 1, 2) followed by every
    distinct character occurring in ``x`` or ``y``, in sorted order.

    Args:
        x: sequence of source sentences (strings).
        y: sequence of target sentences (strings), parallel to ``x``.
        sequence_len: fixed encoded length (>= 1) used for padding/clipping.
        encoder: stored on the instance as ``self.encoder``; not used by this class.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length or ``sequence_len < 1``.
    '''

    PAD, SOS, EOS = '<pad>', '<sos>', '<eos>'

    def __init__(self, x, y, sequence_len, encoder=None):
        if len(x) != len(y):
            raise ValueError(
                'x and y must be parallel: got %d source and %d target sentences' % (len(x), len(y))
            )
        if sequence_len < 1:
            raise ValueError('sequence_len must be >= 1, got %r' % (sequence_len,))

        # Collect the character inventory sentence by sentence; this gives the
        # same set and character count as joining the whole corpus into one
        # string, without materialising that (potentially huge) string.
        charset = set()
        data_size = 0
        for corpus in (x, y):
            for sentence in corpus:
                charset.update(sentence)
                data_size += len(sentence)

        # Special tokens first so that <pad> is always index 0.
        chars = [self.PAD, self.SOS, self.EOS] + sorted(charset)
        vocab_size = len(chars)

        print('data has %d characters, %d unique chars, %d sentences.' % (data_size, len(chars), len(x)))
        # The value reported here is simply the configured sequence_len.
        print('sentence length nine_nine_percentile: %d' % (sequence_len))

        self.x, self.y = x, y
        self.ch2i = {ch: i for i, ch in enumerate(chars)}
        self.i2ch = {i: ch for i, ch in enumerate(chars)}
        self.vocab_size = vocab_size
        self.sequence_len = sequence_len
        self.encoder = encoder

    def __len__(self):
        # x and y are parallel, so either length is the number of pairs.
        return len(self.x)

    def __getitem__(self, idx):
        '''Return the (source, target) index tensors for sentence pair ``idx``.'''
        sos, eos = self.ch2i[self.SOS], self.ch2i[self.EOS]

        indx = self.padding([self.ch2i[ch] for ch in self.x[idx]] + [eos])
        # <sos> is prepended after padding, so the target is one entry longer.
        indy = [sos] + self.padding([self.ch2i[ch] for ch in self.y[idx]] + [eos])

        x = torch.tensor(indx, dtype=torch.long)
        y = torch.tensor(indy, dtype=torch.long)

        return x, y

    def padding(self, string):
        '''
        Pad or clip a list of token indices to exactly ``sequence_len`` entries.

        Shorter lists are right-padded with the <pad> index; lists of length
        ``sequence_len`` or more keep their first ``sequence_len - 1`` entries
        and get <eos> as the final entry.
        '''
        if len(string) < self.sequence_len:
            pad = self.ch2i[self.PAD]
            return string + [pad] * (self.sequence_len - len(string))
        return string[:self.sequence_len - 1] + [self.ch2i[self.EOS]]

In [7]:
# Minimum sentence length passed to pre_processing as min_length.
min_len = 4
# Maximum sentence length for pre_processing and fixed encoded length for CharDataset.
sequence_len = 128

In [8]:
# process and save data
from utils.pre_processing import *
from utils.utils import *
# Parallel English / Hindi sentence lists accumulated over every corpus in `paths`.
en, hi = [], []
# paths = ["data/pib/","data/hinden/","data/indic/"]
paths = ["data/samanantar/"]
for path in paths:
    # Context managers close each corpus file as soon as it has been read.
    with open(path + "Eng.txt", encoding='utf-8') as eng_file:
        x = eng_file.read().split("\n")
    with open(path + "Hin.txt", encoding='utf-8') as hin_file:
        y = hin_file.read().split("\n")
    # Filter the pairs by length (min_len .. sequence_len characters).
    x, y = pre_processing(x, y, min_length=min_len, max_length=sequence_len)
    en += x
    hi += y

# Persist the cleaned lists with the project's `pickle` helper (path, object).
path = "data/cleaned/"
pickle(path + "en", en)
pickle(path + "hi", hi)

250000
Some last sentences
No casuality was reported. | गनीमत रही कि इससे कोई हादसा नहीं हुआ।
He was a sixtime parliamentarian. | वे छह बार विधायक रहे।
For example, Edersheim said that the Jews regarded Genesis 8 11 as Messianic. | मिसाल के लिए, इडरशाइम ने कहा कि यहूदी मानते हैं कि उत्पत्ति 8 11 में लिखी बात, मसीह के बारे में कही गयी है ।
Section 144 has been imposed in the entire district. | फैसले के मद्देनजर पूरे क्षेत्र में धारा 144 लागू की गई।


True

In [9]:
# Load saved data
from utils.utils import *
from utils.pre_processing import *

# Read back the sentence lists stored under data/cleaned/.
path = "data/cleaned/"
en = pickle(f"{path}en")
hi = pickle(f"{path}hi")
# Apply the length filter again to the reloaded pairs.
en, hi = pre_processing(en, hi, min_length=min_len, max_length=sequence_len)
# Print the four trailing pairs, last pair first.
for back in (1, 2, 3, 4):
    print(en[-back], '|', hi[-back])

178067
Some last sentences
No casuality was reported. | गनीमत रही कि इससे कोई हादसा नहीं हुआ।
He was a sixtime parliamentarian. | वे छह बार विधायक रहे।
For example, Edersheim said that the Jews regarded Genesis 8 11 as Messianic. | मिसाल के लिए, इडरशाइम ने कहा कि यहूदी मानते हैं कि उत्पत्ति 8 11 में लिखी बात, मसीह के बारे में कही गयी है ।
Section 144 has been imposed in the entire district. | फैसले के मद्देनजर पूरे क्षेत्र में धारा 144 लागू की गई।
No casuality was reported. | गनीमत रही कि इससे कोई हादसा नहीं हुआ।
He was a sixtime parliamentarian. | वे छह बार विधायक रहे।
For example, Edersheim said that the Jews regarded Genesis 8 11 as Messianic. | मिसाल के लिए, इडरशाइम ने कहा कि यहूदी मानते हैं कि उत्पत्ति 8 11 में लिखी बात, मसीह के बारे में कही गयी है ।
Section 144 has been imposed in the entire district. | फैसले के मद्देनजर पूरे क्षेत्र में धारा 144 लागू की गई।


In [10]:
# Hindi sentences are the source (x) and English sentences the target (y).
dataset = CharDataset(x=hi, y=en, sequence_len=sequence_len)

# Preview the first encoded pair and the character -> index vocabulary.
first_pair = dataset[0]
print('sample tensors ', first_pair)
print("vocab: ", dataset.ch2i)

data has 21421083 characters, 168 unique chars, 178065 sentences.
sentence length nine_nine_percentile: 128
sample tensors  (tensor([ 96, 137, 120, 114,  73, 101, 118,   3, 114, 136,  73,   3,  89, 129,
        101, 125, 140,  73,   3,  89, 127,   3, 114, 131, 118, 142, 115,   3,
        112, 124, 130, 104,   3, 124, 137,   6,   3,  89, 142, 115, 140,  73,
         89, 128,   3, 110, 142, 116,  96, 127, 104, 128, 115, 140,  73,   3,
         89, 129,   3, 123, 114, 132, 106, 142, 107, 128,   3,  89, 136,   3,
        114, 127, 114, 118, 136,   3, 114, 136,  73,   3,  79, 108,  89, 129,
          3, 123,  73,  90, 142, 115, 127,   3,  75, 108, 142, 115,   3,  96,
        129, 120,   3, 123, 114, 131, 124, 140,  73,   3, 123, 136,   3,  96,
        125, 142, 115, 127, 106, 127,   3, 124, 137, 155,   2,   0,   0,   0,
          0,   0]), tensor([ 1, 38, 53, 50,  3, 67, 46, 57, 66, 50,  3, 60, 51,  3, 54, 59, 64, 50,
        48, 65, 64,  3, 54, 59,  3, 65, 53, 50,  3, 47, 54, 60, 64, 61, 5

In [12]:
from model.encode_decode_transformer import Transformer, TransformerConfig
from utils.trainer import Trainer, TrainerConfig
# Training-loop settings (epochs, batch size, learning-rate schedule, checkpoint location).
tconfig = TrainerConfig(
    max_epochs=2,
    batch_size=16,
    learning_rate=6e-4,
    grad_norm_clip=1.0,
    device='cuda',
    lr_decay=True,
    warmup_tokens=5000,
    ckpt_n_print_iter=4000,
    ckpt_path='checkpoint/transformer_hi_en_char_smt',
)

# Model hyper-parameters; vocabulary size and sequence length come from the dataset,
# and the model is placed on the same device as the trainer.
mconfig = TransformerConfig(
    vocab_size=dataset.vocab_size,
    sequence_len=dataset.sequence_len,
    embed_dim=256,
    n_block=8,
    n_head=8,
    device=tconfig.device,
)

In [13]:
# Build the Transformer from the hyper-parameters collected in mconfig.
model = Transformer(mconfig)

In [14]:
# Hindi sentences handed to the Trainer as test_dataset for monitoring during training.
sentences = [
    "मैं उससे मिलना चाहूँगा।",
    "यह वही चीज़ है जिसकी मुझे ज़रूरत है।",
    "मेरी चिंता मत करो।",
    "उसने मुझे दो किताबें उधार दीं।",
    "मैं इस योजना से सहमत हूँ।",
]
trainer = Trainer(
    model,
    dataset,
    tconfig,
    test_dataset=sentences,
    collate=None,
)

In [15]:
# load pre-trained weights
from utils.utils import pickle
# Warm-start the model from the checkpoint stored at tconfig.ckpt_path.
# `pickle` is the project helper from utils.utils; called with only a path it
# returns the stored object (same usage as when reloading the cleaned data).
# NOTE(review): if the helper wraps Python's pickle module, only load
# checkpoints from trusted sources — confirm against utils/utils.py.
model.load_state_dict(pickle(tconfig.ckpt_path)) # load


<All keys matched successfully>

In [16]:
# Run training with the settings in tconfig. The progress bar reports epoch, train loss
# and learning rate; translations of the monitoring sentences are printed periodically
# (apparently every ckpt_n_print_iter iterations — confirm in utils/trainer.py).
trainer.train()

epoch: 1 | train loss: 900.03472  | lr: 1.920000e-06:   0%|          | 0/11129 [00:01<?, ?it/s]

['I want to meet him.', 'This is the same thing I need to.', 'Dont worry about me.', 'He gave me two books.', 'Let me agree with this plan.']


epoch: 1 | train loss: 0.54088  | lr: 4.436939e-04:  36%|███▌      | 4000/11129 [06:16<10:29, 11.33it/s]  

['I would like to meet him.', 'This is the same thing that I need to.', 'Dont worry about me.', 'He gave me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.52889  | lr: 1.155236e-04:  72%|███████▏  | 7999/11129 [12:30<04:40, 11.16it/s]  

['I want to meet him.', 'This is the same thing that I need to.', 'Dont worry about me.', 'He gave me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.48579  | lr: 6.000000e-05: 100%|██████████| 11129/11129 [17:26<00:00, 10.64it/s] 
epoch: 2 | train loss: 0.48852  | lr: 6.000000e-05:   0%|          | 0/11129 [00:00<?, ?it/s]

['I want to meet him.', 'This is the same thing I need to.', 'Dont worry about me.', 'He gave me two books.', 'I agree with this scheme.']


epoch: 2 | train loss: 0.44373  | lr: 1.807673e-04:  36%|███▌      | 4000/11129 [06:13<10:44, 11.07it/s]

['I want to meet him.', 'This is the same thing that I need to.', 'Dont worry about me.', 'He gave me two books.', 'I agree with this scheme.']


epoch: 2 | train loss: 0.50993  | lr: 5.052873e-04:  72%|███████▏  | 8002/11129 [12:32<40:05,  1.30it/s]  

['I want to meet him.', 'Thats the same thing that I needed.', 'Dont worry about me.', 'He gave me two books.', 'I agree with this scheme.']


epoch: 2 | train loss: 0.55515  | lr: 5.987651e-04: 100%|██████████| 11129/11129 [17:13<00:00, 10.77it/s]


In [17]:
# Qualitative spot check: translate a handful of Hindi sentences with the trained model.
samples = [
    "इसी बीच एक बाइक पर तीन लोग आते दिखाई दिए।",
    "तुम जवाब क्यों नहीं दे रहे हो?",
    "ये किताबें मेरीं हैं।",
    "आप की मदद के लिए धन्यवाद.",
    "वह एक राजा से बहुत ज़्यादा था।",
    "मैंने आज एक अद्भुत सपना देखा",
]
result = model.generate_output(
    samples,
    dataset,
    top_k=5,
    print_process=True,
)
print(result)

100%|██████████| 6/6 [00:06<00:00,  1.06s/it]

['Meanwhile, three people were injured on the bike.', 'Why are you not answering?', 'These books are my books.', 'Thanks for your help.', 'She was more than one of the kings.', 'I have seen a wonderful dream today']





In [18]:
# benchmarking using bleu score
path = "data/test/samanantar/test2/"
# paths = ["data/test/samanantar/test1/", "data/test/samanantar/test2/"]
# for path in paths:
# NOTE: from here on `en` / `hi` hold the held-out test sentences, replacing the
# training lists bound to these names earlier in the notebook.
# Context managers close the test files as soon as they have been read.
with open(path+"test.en.txt", encoding='utf-8') as en_file:
    en = en_file.read().split("\n")
with open(path+"test.hi.txt", encoding='utf-8') as hi_file:
    hi = hi_file.read().split("\n")
# Apply the same length filter that was used for the training data.
en, hi = pre_processing(en, hi, min_length=min_len, max_length=sequence_len)

# Translate every remaining Hindi test sentence; `result` is scored in the next cell.
result = model.generate_output(hi, dataset, top_k=5, print_process=True)

2500
Some last sentences
HeShe cannot function within the association in any patron or advisory capacity nor be a member of a committee or council. | वह संघ के साथ संरक्षक या सलाहकार की भूमिका भी नहीं निभा सकता और किसी समिति या परिषद का सदस्य नहीं बन सकता।
The video has also gone viral on social media. | ये वीडियो सोशल मिडिया पर काफी वायरल भी हो गया है।
God let himself be found by them and continued to give them rest all around. | फिर परमेश्वर उनको मिला और उसने चारों ओर से उन्हें विश्राम दिया । 
His replacement will be England fast bowler Tom Curran. | इंग्लैंड के दाएं हाथ के तेज गेंदबाज टॉम कुरन उनका स्थान लेंगे।


100%|██████████| 1788/1788 [30:56<00:00,  1.04s/it]


In [19]:
from utils.utils import *
# BLEU of the generated translations (`result`) against the English test sentences (`en`).
# bleu_score is the project helper from utils.utils; besides the score it returns two more
# values, presumably the reference and candidate sentences it scored — verify in utils/utils.py.
score, references, candidates = bleu_score(en, result)
print(score)

16.528464183283624
