In [2]:
#!git clone --single-branch --branch master https://github.com/DivyaRathod3D/CharNMT.git

Cloning into 'CharNMT'...
remote: Enumerating objects: 233, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 233 (delta 28), reused 70 (delta 25), pack-reused 160[K
Receiving objects: 100% (233/233), 609.34 MiB | 38.73 MiB/s, done.
Resolving deltas: 100% (109/109), done.
Updating files: 100% (77/77), done.


In [6]:
#cd CharNMT

/kaggle/working/CharNMT


In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
class CharDataset(Dataset):
    '''
    Dataset is a iterable that returns input and target sentence. It adds <sos> at the begining, and <eos> at the end, 
    and filling in <pad> if sentence length is less than pre-defined value.
    '''
    
    def __init__(self, x, y, sequence_len, encoder=None):
        # data in the type of pairs of sentence
        data = ''.join(x+y)
        # from collections import Counter
        # vocab_size = 250

        # ct = Counter(data)
        # include = sorted(ct, key=ct.get, reverse=True)
        # if len(include)>vocab_size: include = include[:vocab_size]
        # rule = ''.join(include)
        chars = ['<pad>'] +['<sos>'] + ['<eos>'] + sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)

        print('data has %d characters, %d unique chars, %d sentences.' % (data_size, len(chars), len(x)))
        print('sentence length nine_nine_percentile: %d' % (sequence_len))
        
        self.x, self.y = x, y
        self.ch2i = {ch:i for i,ch in enumerate(chars)}
        self.i2ch = {i:ch for i,ch in enumerate(chars)}
        self.vocab_size = vocab_size
        self.sequence_len = sequence_len
        self.encoder=encoder
    
    def __len__(self):
        return len(self.x) # len x = y
    
    def __getitem__(self, idx):
        
        indx = self.padding([self.ch2i[ch] for ch in self.x[idx]] + [self.ch2i['<eos>']])
        indy = [self.ch2i['<sos>']] + self.padding([self.ch2i[ch] for ch in self.y[idx]] + [self.ch2i['<eos>']])

        x = torch.tensor(indx, dtype=torch.long)
        y = torch.tensor(indy, dtype=torch.long)

        return x,y
                                                                                                                               
    def padding(self, string):
        if len(string)<self.sequence_len:
            string =  string + [0]*(self.sequence_len - len(string))
        else:
            string = string[:self.sequence_len -1] + [self.ch2i['<eos>']]
                   
        return string

In [3]:
sequence_len = 128
min_len = 0

In [9]:
# process and save data
import sys
import os
path = os.path.abspath("utils")
sys.path.append(path)
import utils.pre_processingtelugu
import pickle

en2, tel = list(), list()
# paths = ["data/pib/","data/hinden/","data/indic/"]
paths = ["data\WAT\te-en"]
for path in paths:
    x = open(path + "train.en", encoding='utf-8').read().split("\n")
    y = open(path + "train.te", encoding='utf-8').read().split("\n")
    x,y = pre_processingtelugu(x, y, min_length=min_len, max_length=sequence_len) # remove sentence less than 4 characters
    en2 += x
    tel += y
    
path = "data/cleaned/"
pickle(path + "en-tel", en2)
pickle(path + "tel", tel)
# nine_nine_percentile = int(np.percentile([len(sen) for sen in vi],99))

ModuleNotFoundError: No module named 'utils.pre_processingtelugu'

In [10]:
# Load saved data
from utils.utils import *
from utils.pre_processing import *

path = "data/cleaned/"
en = pickle(path+"en-tel")
hi = pickle(path+"tel")
en,hi = pre_processing(en-tel,tel, min_length=min_len, max_length=sequence_len) # clip sentences
for i in range(-1,-5,-1):
    print(en[i],'|',hi[i])

ModuleNotFoundError: No module named 'utils.utils'

In [12]:
dataset = CharDataset(tel,en-tel, sequence_len=sequence_len)

print('sample tensors ', next(iter(dataset)))
print("vocab: ", dataset.ch2i)

data has 20894274 characters, 162 unique chars, 293652 sentences.
sentence length nine_nine_percentile: 128
sample tensors  (tensor([121, 117, 127, 113, 129,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]), tensor([ 1, 37, 53, 46, 63, 46, 46, 47, 54,  2,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  

In [13]:
from model.encode_decode_transformer import Transformer, TransformerConfig
from utils.trainer import Trainer, TrainerConfig
tconfig = TrainerConfig(max_epochs=1, batch_size=16, learning_rate=6e-4, grad_norm_clip=1.0, device='cuda',
                       lr_decay=True, warmup_tokens=5000, ckpt_n_print_iter=4000, ckpt_path='checkpoint/transformer_tel_en_char_wmt')

mconfig = TransformerConfig(vocab_size=dataset.vocab_size, sequence_len=dataset.sequence_len, embed_dim=256,
                           n_block=8, n_head=8, device=tconfig.device)

In [14]:
model = Transformer(mconfig)

In [15]:
sentences = ["మీరు నాకు ఉంటాయి కాలేదు ఎప్పుడూ తెలుసు.",
           "నేను మీ గాడిద కోసం ఏదో వచ్చింది!",
           "ఆమె నేను వచ్చి ఏమి పట్టణం వైపు వాసన చూడగలము.",
           "జుడిత్, మీరు ఒప్పుకుంటే, నేను మీరు ప్రేమలో వెర్రి కేవలం అని తెలుసు?",
           "మీరు పశుక్షేత్రంలో మీ అవకాశం తిరిగి వచ్చింది."
            ]
trainer = Trainer(model, dataset, tconfig, test_dataset=sentences, collate=None)

In [16]:
# # load pre-trained weights
from utils.utils import pickle
model.load_state_dict(pickle(tconfig.ckpt_path)) # load


<All keys matched successfully>

In [17]:
trainer.train()

epoch: 1 | train loss: 900.01603  | lr: 1.920000e-06:   0%|          | 0/18353 [00:00<?, ?it/s]

['And I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.35804  | lr: 5.402125e-04:  22%|██▏       | 4000/18353 [06:16<21:01, 11.37it/s]  

['I would like to meet him.', "It's the same thing that I need it.", "Don't worry about me.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.35084  | lr: 3.689517e-04:  44%|████▎     | 8000/18353 [12:31<15:13, 11.33it/s]   

["I'd like to meet it.", "It's the same thing that I need.", "Don't worry me.", 'He borrowed me two books.', 'I agree with this plan.']


epoch: 1 | train loss: 0.32126  | lr: 1.655665e-04:  65%|██████▌   | 11999/18353 [18:46<09:19, 11.37it/s] 

['And I would like to meet him.', "It's the same thing that I need.", "Don't worry my concern.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.33557  | lr: 6.000000e-05:  87%|████████▋ | 16003/18353 [25:04<20:40,  1.89it/s]  

['I would like to meet him.', "It's the same thing that I need.", "Don't worry me.", 'He borrowed me two books.', 'I agree with this scheme.']


epoch: 1 | train loss: 0.37875  | lr: 6.000000e-05: 100%|██████████| 18353/18353 [28:36<00:00, 10.69it/s]


In [18]:
samples = ["నేను చూడాలి ఎవరైనా కలిసే వెళ్ళి ఉంది.",
           "మరియు మీరు ఇప్పుడు మీ చేతిలో వాటిని ఆస్వాదించగల.",
           "నేనుదీన్నిమనిషి,పట్టణంలోకి మాన్ కబ్ మరియు స్త్రోల్ హక్కు",
           "ఈ కాఫీ టేబుల్ మీద ఆ వజ్రపుటుంగరం ."
          ]
result = model.generate_output(samples, dataset, top_k=3, print_process=True)
print(result)

100%|██████████| 6/6 [00:06<00:00,  1.03s/it]

['In the meantime, three people appeared on a bike.', 'Why are you not answering?', 'These books are me.', 'Thank you for your help.', 'He was much more than a king.', 'I saw a wonderful dream today']





In [19]:
# benchmarking using bleu score
path = "data\WAT\te-en"
en = open(path+"test.en.txt", encoding='utf-8').read().split("\n")
tel = open(path+"test.te.txt", encoding='utf-8').read().split("\n")
en, tel = pre_processing(en, tel, min_length=min_len, max_length=sequence_len) # remove sentence less than 4 characters

result = model.generate_output(tel, dataset, top_k=5, print_process=True)

2507
Some last sentences
32 chartered planes have been booked to ferry the guests to and fro. | मेहमानों को लानेले जाने के लिए 32 चार्टर्ड विमानों की व्यवस्था की गई है।
250 VIPs have been invited to this royal party. | इस शाही पार्टी के लिए 250 वीवीआईपी लोगों को भी आमंत्रित किया गया है।
It is noteworthy that both Nita and Isha are professional, classical dancers. | गौरतलब है कि नीता और ईशा दोनों ही प्रोफेशनल क्लासिकल डांसर है।
It is being said that Nita Ambani and her daughter Isha may also perform in this. | बताया जा रहा है कि नीता अंबानी और उनकी बेटी ईशा भी इसमें अपनी प्रस्तुती दे सकती है।


100%|██████████| 1602/1602 [27:20<00:00,  1.02s/it]


In [20]:
from utils.utils import *
score, references, candidates = bleu_score(en, result)
print(score)

15.588739145252545
