In [86]:
import math
import torchtext
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from collections import Counter
from torchtext.vocab import Vocab
import io
import time
import pandas as pd
import numpy as np
import pickle
import sentencepiece as spm
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
# Seed PyTorch's global RNG so repeated runs start from the same random state.
torch.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.cuda.get_device_name raises on hosts without CUDA, so only query it
# when a GPU was actually selected above.
print(torch.cuda.get_device_name(0) if device.type == 'cuda' else 'CPU')

NVIDIA GeForce RTX 3090 Ti


In [87]:
data_path = "./process_data/merge_data.txt"

In [88]:
def read_data(data_path):
    """Read a text file and return its lines as a list.

    Each returned line keeps its trailing newline. The file is decoded as
    UTF-8 explicitly: the corpus contains Bengali text, and relying on the
    platform default encoding can fail or mis-decode on non-UTF-8 locales.

    Args:
        data_path: Path of the text file to read.

    Returns:
        list[str]: The lines of the file, in order.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        return f.readlines()

In [89]:
data = read_data(data_path)

In [90]:
data[:10]

['যাও।\tGo.\n',
 'যান।\tGo.\n',
 'যা।\tGo.\n',
 'পালাও!\tRun!\n',
 'পালান!\tRun!\n',
 'কে?\tWho?\n',
 'আগুন!\tFire!\n',
 'বাঁচাও!\tHelp!\n',
 'বাঁচান!\tHelp!\n',
 'থামুন!\tStop!\n']

In [91]:
# Turn every "bn<TAB>en" line into a [bn, en] pair, removing newline
# characters from the English side.
data = [
    [fields[0], fields[1].replace("\n", "")]
    for fields in (line.split("\t") for line in data)
]

In [92]:
data[:10]

[['যাও।', 'Go.'],
 ['যান।', 'Go.'],
 ['যা।', 'Go.'],
 ['পালাও!', 'Run!'],
 ['পালান!', 'Run!'],
 ['কে?', 'Who?'],
 ['আগুন!', 'Fire!'],
 ['বাঁচাও!', 'Help!'],
 ['বাঁচান!', 'Help!'],
 ['থামুন!', 'Stop!']]

# Data Preprocessing

In [93]:
# Bengali digits and their ASCII counterparts, in matching order.
bn = list("০১২৩৪৫৬৭৮৯")
en = list("0123456789")

# Lookup table: Bengali digit -> ASCII digit.
bn_en = dict(zip(bn, en))

In [94]:
print(bn_en)

{'০': '0', '১': '1', '২': '2', '৩': '3', '৪': '4', '৫': '5', '৬': '6', '৭': '7', '৮': '8', '৯': '9'}


In [95]:
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility

import re

# bn_to_en_number = lambda x
bn_regex = r'[০-৯]+'
en_regex = r'[0-9]+'

def bn_number(bn_matches):
    """Return item 0 (the full matched text for a regex match) of each entry, in order."""
    return [found[0] for found in bn_matches]
def en_number(en_matches):
    """Return item 0 (the full matched text for a regex match) of each entry, in order."""
    return [found[0] for found in en_matches]

def convert_bn2en(bn_n):
    """Translate a string of Bengali digits to ASCII digits via the `bn_en` table.

    Raises KeyError if a character is not a key of `bn_en`.
    """
    return "".join(bn_en[digit] for digit in bn_n)

def check_lenght(bn2en_n, en_n):
    """Pair the English-side numbers with the (converted) Bengali-side numbers.

    Args:
        bn2en_n: Numbers found in the Bengali sentence, already in ASCII digits.
        en_n: Numbers found in the English sentence.

    Returns:
        (mapping, ok): `ok` is False (with an empty mapping) when the two
        sentences contain a different count of numbers. Otherwise `mapping`
        sends each English number to a Bengali one: position by position when
        the digit-count sequences agree, and against the reversed Bengali list
        when they do not.
    """
    bn_widths = [len(number) for number in bn2en_n]
    en_widths = [len(number) for number in en_n]
    if len(bn_widths) != len(en_widths):
        return {}, False
    # Differing digit-count patterns are treated as "numbers appear in the
    # opposite order", so the Bengali list is flipped before pairing.
    ordered_bn = bn2en_n if bn_widths == en_widths else bn2en_n[::-1]
    return dict(zip(en_n, ordered_bn)), True
    
    
def get_number_data(bn, en):
    """Extract the digit runs of a sentence pair and build the English->Bengali number map.

    Args:
        bn: Bengali sentence (scanned with `bn_regex`).
        en: English sentence (scanned with `en_regex`).

    Returns:
        (maping, status, en_n): the mapping and success flag produced by
        `check_lenght`, plus the list of digit runs found in `en`.
    """
    bn_n = bn_number(list(re.finditer(bn_regex, bn, re.UNICODE)))
    en_n = en_number(list(re.finditer(en_regex, en, re.UNICODE)))
    # Bengali digit runs are converted to ASCII so they can be compared/paired.
    bn2en_n = [convert_bn2en(number) for number in bn_n]
    maping, status = check_lenght(bn2en_n, en_n)
    return maping, status, en_n

def get_process_data(i):
    """Rewrite the numbers in the English sentence so they agree with the Bengali one.

    Args:
        i: A `[bn_sentence, en_sentence]` pair.

    Returns:
        list: `[bn_sentence, en_sentence]`, where each digit run of the English
        sentence has been replaced by its mapped value when `get_number_data`
        reports a usable mapping; otherwise the English sentence is unchanged.
    """
    maping, status, _ = get_number_data((i[0]), (i[1]))
    en_string = i[1]
    if status:
        # Replace in a single regex pass over whole digit runs. Chained
        # str.replace calls would also hit substrings of longer numbers and
        # re-replace the output of earlier replacements (e.g. "1 2" with
        # {"1": "2", "2": "3"} would end up as "3 3" instead of "2 3").
        en_string = re.sub(
            en_regex,
            lambda found: maping.get(found.group(0), found.group(0)),
            en_string,
        )
    return [i[0], en_string]

In [96]:
data = list(map(get_process_data, data))

In [97]:
data[-10:]

[['বাকেরগঞ্জ জেলা নামটি ১৭৯৭ থেকে ১৯৯৩ সালপর্যন্ত ছিল',
  'The name Bakerganj district was from 1797 to 1993 '],
 ['জেলা সদর বরিশালের নামে বিভাগের নামকরণ করা হয়',
  'The division was named after the district headquarters Barisal '],
 ['বিবিধ  বাকেরগঞ্জ উপজেলার প্রায় ৮০ভাগের পেশাই চাষাবাদ',
  'Miscellaneous occupations of about 80 per cent of Bakerganj upazila '],
 ['এই উপজেলার প্রায় ৮০ ভাগই ইসলাম ধর্ম অনুসারি',
  'About 80 percent of this upazila is Islamic '],
 ['বাকি ২০ভাগ হিন্দু এবং খ্রীষ্টান',
  'The remaining 20 percent are Hindus and Christians '],
 ['এই উপজেলায় ১টি সরকারি কলেজ রয়েছে',
  'There is 1 government college in this upazila '],
 ['বাকেরগঞ্জ সরকারি কলেজ', 'Bakerganj Government College '],
 ['তালু  মুখগহ্বর  তালু মুখগহ্বরের ছাদ',
  'The palate is the roof of the palate '],
 ['২০০৮ ২০০৮ গ্রেগরীয় বর্ষপঞ্জীর একটি অধিবর্ষ',
  '2006 is a leap year in the Gregorian calendar '],
 ['১৯০০ ১৯০০ গ্রেগরীয় বর্ষপঞ্জীর একটি সাধারণ বছর',
  '1900 is a typical year of the 1900 Grego

In [98]:
# Bengali (source) side of every sentence pair.
# NOTE(review): val_bn is sliced from the last 20,000 rows of the same `data`
# that trainbn is built from, so every validation sentence is also a training
# sentence and the validation loss will look optimistic. A disjoint split is
# advisable, but trainbn is also written to disk further down, so check those
# cells before changing it.
trainbn = [i[0] for i in data]
val_bn = [i[0] for i in data[-20000:]]
len(trainbn)

195775

In [99]:
val_bn[:10]

['গ্রান্দের ধারা গ্রান্দের ধারা হলো গণিতের একটি বিচ্যুত প্রকৃতির ধারা',
 'ধারাটি হচ্ছে ১   ১   ১   ১      ',
 'অসীম প্রকৃতির এই ধারটি নিয়ে গণিতবিদদের মধ্যে বিতর্ক রয়েছে',
 'কারও মতে  এর মান হবে শূন্য  ০ ',
 'অন্যদের মতে এই ধারার যোগফল এক  ১ ',
 'এই ধারাই গ্র্যান্দের ধারা হিসেবে পরিচিত',
 'তিনি বেশ কয়েকটি দর্শকনন্দিত চলচ্চিত্র নির্মাণ করেছেন',
 'তিনি ১০০মিটার হিটে অষ্টম হয়ে প্রতিযোগিতা শেষ করেন',
 'তিনি ১২ ৬০সেকেন্ড সময়ে দৌড় শেষ করেন',
 'চলচ্চিত্রটি পরিচালনা করেছেন বিখ্যাত পরিচালক বেলাল আহমেদ']

In [100]:
# English (target) side of every sentence pair.
# NOTE(review): val_en is the last 20,000 rows of the same `data` used for
# trainen, so the validation targets are contained in the training targets.
trainen = [i[1] for i in data]
val_en = [i[1] for i in data[-20000:]]
len(trainen)

195775

In [101]:
val_en[:10]

["Grand's style Grand's style is a deviant nature of mathematics ",
 'The clause is 1 1 1 1 ',
 'There is debate among mathematicians about this borrowing of infinite nature ',
 'According to some, its value will be zero 0 ',
 'According to others, the sum of this section is 1 ',
 'This section is known as the Grand section ',
 'He has made several acclaimed films ',
 'He finished eighth in the 100 meters ',
 'He finished the race in 12 60 seconds ',
 'The film is directed by renowned director Belal Ahmed ']

In [102]:
trainbn[-10:]

['বাকেরগঞ্জ জেলা নামটি ১৭৯৭ থেকে ১৯৯৩ সালপর্যন্ত ছিল',
 'জেলা সদর বরিশালের নামে বিভাগের নামকরণ করা হয়',
 'বিবিধ  বাকেরগঞ্জ উপজেলার প্রায় ৮০ভাগের পেশাই চাষাবাদ',
 'এই উপজেলার প্রায় ৮০ ভাগই ইসলাম ধর্ম অনুসারি',
 'বাকি ২০ভাগ হিন্দু এবং খ্রীষ্টান',
 'এই উপজেলায় ১টি সরকারি কলেজ রয়েছে',
 'বাকেরগঞ্জ সরকারি কলেজ',
 'তালু  মুখগহ্বর  তালু মুখগহ্বরের ছাদ',
 '২০০৮ ২০০৮ গ্রেগরীয় বর্ষপঞ্জীর একটি অধিবর্ষ',
 '১৯০০ ১৯০০ গ্রেগরীয় বর্ষপঞ্জীর একটি সাধারণ বছর']

In [103]:
trainen[-10:]

['The name Bakerganj district was from 1797 to 1993 ',
 'The division was named after the district headquarters Barisal ',
 'Miscellaneous occupations of about 80 per cent of Bakerganj upazila ',
 'About 80 percent of this upazila is Islamic ',
 'The remaining 20 percent are Hindus and Christians ',
 'There is 1 government college in this upazila ',
 'Bakerganj Government College ',
 'The palate is the roof of the palate ',
 '2006 is a leap year in the Gregorian calendar ',
 '1900 is a typical year of the 1900 Gregorian calendar ']

In [104]:
import os
process_data_path = "process_data"
# os.makedirs(process_data_path, exits_ok= True)
os.makedirs(process_data_path, exist_ok = True)

In [105]:
def write_txt_file(file_path, data, encoding="utf-8"):
    """Write one entry per line to a text file.

    Args:
        file_path: Destination path (overwritten if it exists).
        data: Iterable of strings; a list entry contributes its first element.
        encoding: Text encoding used for the file. It is now actually passed
            to `open`; previously it was accepted but ignored, so the file was
            written with the platform default encoding.
    """
    with open(file_path, 'w', encoding=encoding) as f:
        for key in data:
            if isinstance(key, list):
                key = key[0]
            f.write(key+"\n")

In [106]:
write_txt_file(os.path.join(process_data_path, "bn_data.txt"), trainbn)

In [107]:
write_txt_file(os.path.join(process_data_path,"en_data.txt"), trainen)

In [108]:
def merge_data_write_txt_file(file_path, bn_data, en_data, encoding="utf-8"):
    """Write parallel sentences as tab-separated "bn<TAB>en" lines.

    Args:
        file_path: Destination path (overwritten if it exists).
        bn_data: Iterable of Bengali sentences.
        en_data: Iterable of English sentences, aligned with `bn_data`;
            pairing stops at the shorter of the two.
        encoding: Text encoding used for the file. It is now actually passed
            to `open`; previously it was accepted but ignored.
    """
    with open(file_path, 'w', encoding=encoding) as f:
        for bn, en in zip(bn_data, en_data):
            f.write(bn+"\t"+en+"\n")

In [109]:
merge_data_write_txt_file(os.path.join(process_data_path, "merge_data.txt"), trainbn, trainen)

In [110]:
model_path = "model"
os.makedirs(model_path, exist_ok = True)

In [111]:
import sentencepiece as spm

def train_tokenizer(text_path="text.txt", model_prefix="model/bn_model", vocab_size=30000):
    """Train a SentencePiece model and load it back as a sanity check.

    Args:
        text_path: Plain-text training corpus, one sentence per line.
        model_prefix: Output prefix; the trainer writes `<model_prefix>.model`
            (and `<model_prefix>.vocab`).
        vocab_size: Target vocabulary size.

    Returns:
        spm.SentencePieceProcessor: Processor loaded from the model just trained.
    """
    spm.SentencePieceTrainer.train(f'--input={text_path} --model_prefix={model_prefix} --user_defined_symbols=<sep>,<cls> --vocab_size={vocab_size}')
    # Load the model written for *this* prefix. The previous code always
    # loaded "bn_model.model", so training the English tokenizer verified
    # (and depended on) the Bengali model instead.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(f'{model_prefix}.model')
    return tokenizer


In [112]:
bn_data_path = "process_data/bn_data.txt"
en_data_path = "process_data/en_data.txt"

In [113]:
train_tokenizer(
    text_path = bn_data_path,
    model_prefix = "model/bn_model",
    vocab_size = 50000
)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=process_data/bn_data.txt --model_prefix=model/bn_model --user_defined_symbols=<sep>,<cls> --vocab_size=50000
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: process_data/bn_data.txt
  input_format: 
  model_prefix: model/bn_model
  model_type: UNIGRAM
  vocab_size: 50000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <sep>
  user_defined_symbols: <cls>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_

In [114]:
train_tokenizer(
    text_path = en_data_path,
    model_prefix = "model/en_model",
    vocab_size = 30000
)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=process_data/en_data.txt --model_prefix=model/en_model --user_defined_symbols=<sep>,<cls> --vocab_size=30000
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: process_data/en_data.txt
  input_format: 
  model_prefix: model/en_model
  model_type: UNIGRAM
  vocab_size: 30000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <sep>
  user_defined_symbols: <cls>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_

In [115]:
bn_tokenizer = spm.SentencePieceProcessor(model_file='model/bn_model.model')
en_tokenizer = spm.SentencePieceProcessor(model_file='model/en_model.model')

In [116]:
en_tokenizer.encode("All residents aged 20 to 59 years who live in Japan must enroll in public pension system.")

[302,
 2625,
 164,
 48,
 349,
 15,
 2120,
 92,
 347,
 342,
 7,
 670,
 2010,
 4786,
 7,
 953,
 19800,
 292,
 39]

In [117]:
bn_tokenizer.encode("আমি আবার বিয়ে করেছি।")

[148, 200, 315, 4654, 50]

In [118]:
print(bn_tokenizer.encode_as_pieces('টমকে জিজ্ঞাসা করুন।'))
print(bn_tokenizer.encode_as_ids('টমকে জিজ্ঞাসা করুন।'))

['▁টমকে', '▁জিজ্ঞাসা', '▁করুন', '।']
[2429, 4620, 2745, 50]


In [119]:
from torchtext.vocab import vocab
def build_vocab(sentences, tokenizer):
    """Build a torchtext vocabulary from the SentencePiece pieces of `sentences`.

    Args:
        sentences: Iterable of strings; a list entry contributes its first element.
        tokenizer: Object whose `encode(text, out_type=str)` returns pieces.

    Returns:
        A torchtext `Vocab` with the specials `<unk>`, `<pad>`, `<bos>`,
        `<eos>` placed first. Unknown pieces map to `<unk>`.
    """
    counter = Counter()
    for sentence in sentences:
        if isinstance(sentence, list):
            sentence = sentence[0]
        counter.update(tokenizer.encode(sentence, out_type=str))
    built = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'], special_first=True)
    # Without a default index, looking up a piece that never occurred in the
    # training text raises; fall back to <unk> instead.
    built.set_default_index(built['<unk>'])
    return built

In [120]:
torchtext.__version__

'0.12.0'

In [121]:
bn_vocab = build_vocab(trainbn, bn_tokenizer)

In [122]:
en_vocab = build_vocab(trainen, en_tokenizer)

In [123]:
def data_process(bn, en):
    """Convert aligned Bengali/English sentences into pairs of index tensors.

    Each sentence is split into SentencePiece pieces with the matching
    tokenizer, and every piece is looked up in the matching vocabulary
    (`bn_tokenizer`/`bn_vocab` for `bn`, `en_tokenizer`/`en_vocab` for `en`).

    Returns:
        list of `(bn_tensor, en_tensor)` tuples of dtype `torch.long`.
    """
    def _to_ids(text, tokenizer, vocabulary):
        # One vocabulary index per SentencePiece piece.
        pieces = tokenizer.encode(text, out_type=str)
        return torch.tensor([vocabulary[piece] for piece in pieces], dtype=torch.long)

    return [
        (_to_ids(raw_bn, bn_tokenizer, bn_vocab), _to_ids(raw_en, en_tokenizer, en_vocab))
        for raw_bn, raw_en in zip(bn, en)
    ]
train_data = data_process(trainbn, trainen)
val_data = data_process(val_bn, val_en)

In [124]:
BATCH_SIZE = 150
PAD_IDX = bn_vocab['<pad>']
BOS_IDX = bn_vocab['<bos>']
EOS_IDX = bn_vocab['<eos>']

def generate_batch(data_batch):
    """Collate function: wrap each sequence in BOS/EOS and pad the batch.

    Args:
        data_batch: List of `(bn_tensor, en_tensor)` pairs.

    Returns:
        `(bn_batch, en_batch)` as produced by `pad_sequence` with
        `PAD_IDX` as the fill value.
    """
    bos = torch.tensor([BOS_IDX])
    eos = torch.tensor([EOS_IDX])

    def _wrap(sequence):
        # <bos> + tokens + <eos>
        return torch.cat([bos, sequence, eos], dim=0)

    bn_batch = pad_sequence([_wrap(bn_item) for bn_item, _ in data_batch], padding_value=PAD_IDX)
    en_batch = pad_sequence([_wrap(en_item) for _, en_item in data_batch], padding_value=PAD_IDX)
    return bn_batch, en_batch

train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,shuffle=True, collate_fn=generate_batch)
val_iter = DataLoader(val_data, batch_size=BATCH_SIZE,shuffle=True, collate_fn=generate_batch)

In [125]:
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)


class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer with token embeddings and a vocabulary projection.

    Args:
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        emb_size: Model/embedding width (`d_model`).
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
        dropout: Dropout probability. It is now applied to the encoder/decoder
            layers as well as the positional encoding; previously the layers
            silently kept their own default.
        nhead: Number of attention heads. When omitted, the module-level
            `NHEAD` constant is used, as before.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1,
                 nhead: int = None):
        super(Seq2SeqTransformer, self).__init__()
        if nhead is None:
            # Backward-compatible fallback to the notebook-level constant.
            nhead = NHEAD
        encoder_layer = TransformerEncoderLayer(
            d_model=emb_size,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout
            )
        self.transformer_encoder = TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers
            )
        decoder_layer = TransformerDecoderLayer(
            d_model=emb_size,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout
            )
        self.transformer_decoder = TransformerDecoder(
            decoder_layer,
            num_layers=num_decoder_layers
            )

        # Sub-module registration order is kept unchanged so parameter
        # iteration order (and therefore seeded initialisation) is unaffected.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Run encoder and decoder and return per-position vocabulary logits."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        # The fourth positional argument is the memory mask; none is used.
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed `src` and return the encoder output (no padding mask is applied)."""
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed `tgt` and return the decoder output given encoder `memory`."""
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [126]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    The table is stored in the `pos_embedding` buffer with shape
    `(maxlen, 1, emb_size)`: even feature columns hold sines, odd columns cosines.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super().__init__()
        # One frequency per (sin, cos) column pair.
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        positions = torch.arange(0, maxlen).unsqueeze(1)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # Extra middle axis lets the table broadcast over dimension 1 of the input.
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Add the first `token_embedding.size(0)` table rows to the input and apply dropout."""
        seq_len = token_embedding.size(0)
        with_position = token_embedding + self.pos_embedding[:seq_len, :]
        return self.dropout(with_position)

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by sqrt(emb_size)."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings of `tokens` (cast to long before lookup)."""
        scale = math.sqrt(self.emb_size)
        return self.embedding(tokens.long()) * scale

In [127]:
def generate_square_subsequent_mask(sz):
    """Return an additive causal mask of shape `(sz, sz)` on `device`.

    Entries on and below the diagonal are 0.0; entries above the diagonal
    (later positions) are -inf.
    """
    visible = torch.tril(torch.ones((sz, sz), device=device)) == 1
    mask = torch.zeros((sz, sz), dtype=torch.float, device=device)
    return mask.masked_fill(~visible, float('-inf'))

def create_mask(src, tgt):
    """Build the attention and padding masks for one batch.

    Args:
        src: Source index tensor; dimension 0 is used as the sequence length.
        tgt: Target index tensor; dimension 0 is used as the sequence length.

    Returns:
        `(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)` where
        `src_mask` is an all-False boolean square matrix, `tgt_mask` is the
        causal mask from `generate_square_subsequent_mask`, and the padding
        masks are True wherever the input equals `PAD_IDX`, with dimensions
        0 and 1 swapped.
    """
    src_seq_len, tgt_seq_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)

    src_padding_mask = src.eq(PAD_IDX).transpose(0, 1)
    tgt_padding_mask = tgt.eq(PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [128]:
from tqdm import tqdm
# Model hyper-parameters. Vocabulary sizes come from the torchtext vocabularies
# built earlier; NHEAD is read as a module-level global inside Seq2SeqTransformer.
SRC_VOCAB_SIZE = len(bn_vocab)
TGT_VOCAB_SIZE = len(en_vocab)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
# NOTE(review): BATCH_SIZE is also set (to the same value) in the cell that
# builds the DataLoaders; changing it here does not rebuild those loaders.
BATCH_SIZE = 150
NUM_ENCODER_LAYERS = 6
NUM_DECODER_LAYERS = 6
NUM_EPOCHS = 300


transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)

# Xavier-uniform initialisation for every parameter with more than one
# dimension (weight matrices and embeddings); 1-D parameters keep their defaults.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

# Padding positions are excluded from the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

def train_epoch(model, train_iter, optimizer):
    """Train `model` for one pass over `train_iter` and return the mean batch loss.

    Uses teacher forcing: the decoder input is the target without its last
    position, and the loss target is the target without its first position.
    """
    model.train()
    total_loss = 0
    for src, tgt in train_iter:
        src, tgt = src.to(device), tgt.to(device)

        # Shifted views of the target for teacher forcing.
        tgt_input, tgt_out = tgt[:-1, :], tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        # The source padding mask doubles as the memory key padding mask.
        logits = model(src, tgt_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(train_iter)


In [129]:
def evaluate(model, val_iter):
    """Return the mean batch loss of `model` over `val_iter` without updating it.

    The model is switched to eval mode and the forward passes run under
    `torch.no_grad()`, so no autograd graph is built during validation
    (same loss values, less memory and time).
    """
    model.eval()
    losses = 0
    with torch.no_grad():
        for src, tgt in val_iter:
            src = src.to(device)
            tgt = tgt.to(device)

            # Decoder input drops the last position; the loss target drops the first.
            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)
            tgt_out = tgt[1:,:]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()
    return losses / len(val_iter)

In [130]:
for epoch in range(1, NUM_EPOCHS+1):
    start_time = time.time()
    train_loss = train_epoch(transformer, train_iter, optimizer)
    val_loss = evaluate(transformer, val_iter)
    end_time = time.time()
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, val loss : {val_loss:.3f} "
          f"Epoch time = {(end_time - start_time):.3f}s"))

    # Checkpoint after every epoch so an interrupted run can be resumed.
    # Store the epoch that was actually completed (not the planned total),
    # otherwise a resumed run cannot tell where training stopped.
    torch.save({
      'epoch': epoch,
      'model_state_dict': transformer.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'loss': train_loss,
      }, 'model/model_checkpoint.tar')

Epoch: 1, Train loss: 5.902, val loss : 4.953 Epoch time = 92.427s
Epoch: 2, Train loss: 4.562, val loss : 4.101 Epoch time = 94.268s
Epoch: 3, Train loss: 3.878, val loss : 3.419 Epoch time = 96.235s
Epoch: 4, Train loss: 3.281, val loss : 2.775 Epoch time = 95.642s
Epoch: 5, Train loss: 2.791, val loss : 2.318 Epoch time = 95.767s
Epoch: 6, Train loss: 2.421, val loss : 1.959 Epoch time = 95.708s
Epoch: 7, Train loss: 2.140, val loss : 1.697 Epoch time = 94.254s
Epoch: 8, Train loss: 1.920, val loss : 1.497 Epoch time = 94.566s
Epoch: 9, Train loss: 1.744, val loss : 1.339 Epoch time = 92.112s
Epoch: 10, Train loss: 1.598, val loss : 1.190 Epoch time = 91.463s
Epoch: 11, Train loss: 1.474, val loss : 1.085 Epoch time = 91.308s
Epoch: 12, Train loss: 1.367, val loss : 0.992 Epoch time = 94.766s
Epoch: 13, Train loss: 1.277, val loss : 0.911 Epoch time = 95.710s
Epoch: 14, Train loss: 1.197, val loss : 0.825 Epoch time = 95.715s
Epoch: 15, Train loss: 1.123, val loss : 0.766 Epoch time

Epoch: 122, Train loss: 0.155, val loss : 0.062 Epoch time = 91.213s
Epoch: 123, Train loss: 0.155, val loss : 0.061 Epoch time = 91.315s
Epoch: 124, Train loss: 0.154, val loss : 0.061 Epoch time = 91.263s
Epoch: 125, Train loss: 0.154, val loss : 0.059 Epoch time = 91.551s
Epoch: 126, Train loss: 0.153, val loss : 0.063 Epoch time = 91.151s
Epoch: 127, Train loss: 0.155, val loss : 0.061 Epoch time = 91.224s
Epoch: 128, Train loss: 0.152, val loss : 0.063 Epoch time = 91.076s
Epoch: 129, Train loss: 0.153, val loss : 0.061 Epoch time = 91.271s
Epoch: 130, Train loss: 0.152, val loss : 0.057 Epoch time = 91.052s
Epoch: 131, Train loss: 0.150, val loss : 0.059 Epoch time = 90.998s
Epoch: 132, Train loss: 0.152, val loss : 0.060 Epoch time = 91.008s
Epoch: 133, Train loss: 0.150, val loss : 0.061 Epoch time = 91.007s
Epoch: 134, Train loss: 0.151, val loss : 0.062 Epoch time = 90.948s
Epoch: 135, Train loss: 0.150, val loss : 0.059 Epoch time = 91.086s
Epoch: 136, Train loss: 0.150, val

Epoch: 241, Train loss: 0.125, val loss : 0.053 Epoch time = 91.224s
Epoch: 242, Train loss: 0.125, val loss : 0.055 Epoch time = 91.206s
Epoch: 243, Train loss: 0.126, val loss : 0.056 Epoch time = 90.965s
Epoch: 244, Train loss: 0.125, val loss : 0.054 Epoch time = 91.225s
Epoch: 245, Train loss: 0.126, val loss : 0.053 Epoch time = 90.985s
Epoch: 246, Train loss: 0.126, val loss : 0.053 Epoch time = 91.125s
Epoch: 247, Train loss: 0.126, val loss : 0.052 Epoch time = 91.171s
Epoch: 248, Train loss: 0.125, val loss : 0.051 Epoch time = 91.303s
Epoch: 249, Train loss: 0.126, val loss : 0.052 Epoch time = 91.259s
Epoch: 250, Train loss: 0.125, val loss : 0.054 Epoch time = 91.279s
Epoch: 251, Train loss: 0.125, val loss : 0.053 Epoch time = 91.248s
Epoch: 252, Train loss: 0.125, val loss : 0.058 Epoch time = 91.271s
Epoch: 253, Train loss: 0.125, val loss : 0.055 Epoch time = 91.964s
Epoch: 254, Train loss: 0.125, val loss : 0.054 Epoch time = 91.298s
Epoch: 255, Train loss: 0.124, val

In [131]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [132]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate a target sequence by repeatedly taking the highest-scoring token.

    Args:
        model: Object providing `encode`, `decode` and `generator`.
        src: Source index tensor (moved to `device`).
        src_mask: Source attention mask (moved to `device`).
        max_len: Maximum length of the returned sequence, start symbol included.
        start_symbol: Index used as the first target token.

    Returns:
        Long tensor with one row per generated token, starting with
        `start_symbol`; generation stops early once `EOS_IDX` is produced.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # The encoder output does not change between steps, so compute and
        # place it once instead of inside the loop.
        memory = model.encode(src, src_mask).to(device)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
        for step in range(max_len-1):
            # Boolean causal mask: True marks positions that must not be attended.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Scores for the most recent position only.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim = 1)
            next_word = next_word.item()
            ys = torch.cat([ys,torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    """Translate one source sentence with greedy decoding.

    Args:
        model: Trained sequence-to-sequence model (put into eval mode here).
        src: Source sentence as a string.
        src_vocab: Source vocabulary exposing `get_stoi()`.
        tgt_vocab: Target vocabulary exposing `get_itos()`.
        src_tokenizer: Tokenizer whose `encode(text, out_type=str)` returns pieces.

    Returns:
        str: The decoded sentence, with `<bos>`/`<eos>` removed and
        SentencePiece "▁" markers turned back into spaces.
    """
    model.eval()
    # Build the lookup tables once; get_stoi()/get_itos() construct a fresh
    # container on every call, which was previously done per output token.
    stoi = src_vocab.get_stoi()
    unk_idx = stoi.get('<unk>')
    pieces = src_tokenizer.encode(src, out_type=str)
    # Pieces never seen when the vocabulary was built map to <unk> instead of
    # raising KeyError (a KeyError is still raised if the vocab has no <unk>).
    tokens = [BOS_IDX] + [stoi[tok] if (tok in stoi or unk_idx is None) else unk_idx
                          for tok in pieces] + [EOS_IDX]
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(num_tokens, 1) )
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    itos = tgt_vocab.get_itos()
    p_text = " ".join([itos[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
    # Drop the joining spaces, then use the "▁" word-boundary marker to
    # re-insert real spaces between words.
    pts = " ".join(p_text.replace(" ", "").split("▁"))
    return pts.strip()

In [153]:
# for i in data[:10]:
text = "বাকেরগঞ্জ জেলা নামটি ১৭৯৭ থেকে ১৯৯৩ সালপর্যন্ত ছিল"
pre = translate(transformer, text, bn_vocab, en_vocab, bn_tokenizer)
print(f"input : {text}")
print(f"prediction: {pre}")

input : বাকেরগঞ্জ জেলা নামটি ১৭৯৭ থেকে ১৯৯৩ সালপর্যন্ত ছিল
prediction: The name Bakerganj district was from 1797 to 1993


In [308]:
# itos = ja_vocab.itos()

In [134]:
import pickle
# Persist both vocabularies so inference can run without rebuilding them.
# `with` guarantees the files are closed even if pickling fails.
with open('model/bn_vocab.pkl', 'wb') as file:
    pickle.dump(bn_vocab, file)
with open('model/en_vocab.pkl', 'wb') as file:
    pickle.dump(en_vocab, file)

In [135]:
# save model + checkpoint to resume training later
# This final checkpoint is written to 'model/model_checkpoint.pt', which is the
# file the inference section loads. It runs after the training loop, so the
# recorded epoch is NUM_EPOCHS and 'loss' is the last training loss.
torch.save({
  'epoch': NUM_EPOCHS,
  'model_state_dict': transformer.state_dict(),
  'optimizer_state_dict': optimizer.state_dict(),
  'loss': train_loss,
  }, 'model/model_checkpoint.pt')

## Inference
This section is the inference script: it loads the trained SentencePiece tokenizer models, the pickled vocabularies, and the trained model checkpoint, then translates sample sentences.

In [136]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [137]:
bn_tokenizer = spm.SentencePieceProcessor(model_file='model/bn_model.model')
en_tokenizer = spm.SentencePieceProcessor(model_file='model/en_model.model')

In [138]:
# NOTE: pickle.load can execute arbitrary code from the file; only load
# vocabulary files that were produced by this notebook.
with open('model/bn_vocab.pkl', 'rb') as file:
    bn_vocal = pickle.load(file)

In [139]:
# NOTE: pickle.load can execute arbitrary code from the file; only load
# vocabulary files that were produced by this notebook.
with open('model/en_vocab.pkl', 'rb') as file:
    en_vocal = pickle.load(file)

In [140]:
PATH = "model/model_checkpoint.pt"

# Rebuild the architecture with the same hyper-parameters used for training,
# then restore the saved weights into it.
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)
model.to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can also be restored on a CPU-only machine.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()

Seq2SeqTransformer(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): D

In [141]:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate a target sequence by repeatedly taking the highest-scoring token.

    Args:
        model: Object providing `encode`, `decode` and `generator`.
        src: Source index tensor (moved to `device`).
        src_mask: Source attention mask (moved to `device`).
        max_len: Maximum length of the returned sequence, start symbol included.
        start_symbol: Index used as the first target token.

    Returns:
        Long tensor with one row per generated token, starting with
        `start_symbol`; generation stops early once `EOS_IDX` is produced.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # The encoder output does not change between steps, so compute and
        # place it once instead of inside the loop.
        memory = model.encode(src, src_mask).to(device)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
        for step in range(max_len-1):
            # Boolean causal mask: True marks positions that must not be attended.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Scores for the most recent position only.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim = 1)
            next_word = next_word.item()
            ys = torch.cat([ys,torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    """Translate one source sentence with greedy decoding.

    Args:
        model: Trained sequence-to-sequence model (put into eval mode here, so
            dropout is disabled regardless of the caller's state).
        src: Source sentence as a string.
        src_vocab: Source vocabulary exposing `get_stoi()`.
        tgt_vocab: Target vocabulary exposing `get_itos()`.
        src_tokenizer: Tokenizer whose `encode(text, out_type=str)` returns pieces.

    Returns:
        str: The decoded sentence, with `<bos>`/`<eos>` removed and
        SentencePiece "▁" markers turned back into spaces.
    """
    model.eval()
    # Build the lookup tables once; get_stoi()/get_itos() construct a fresh
    # container on every call, which was previously done per output token.
    stoi = src_vocab.get_stoi()
    unk_idx = stoi.get('<unk>')
    pieces = src_tokenizer.encode(src, out_type=str)
    # Pieces never seen when the vocabulary was built map to <unk> instead of
    # raising KeyError (a KeyError is still raised if the vocab has no <unk>).
    tokens = [BOS_IDX] + [stoi[tok] if (tok in stoi or unk_idx is None) else unk_idx
                          for tok in pieces] + [EOS_IDX]
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(num_tokens, 1) )
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    itos = tgt_vocab.get_itos()
    p_text = " ".join([itos[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
    # Drop the joining spaces, then use the "▁" word-boundary marker to
    # re-insert real spaces between words.
    pts = " ".join(p_text.replace(" ", "").split("▁"))
    return pts.strip()

In [142]:
# Translate the last ten sentence pairs with the restored model, using the
# vocabularies reloaded from disk (bn_vocal / en_vocal) so this cell exercises
# the saved artefacts rather than the in-memory training objects.
for i in data[-10:]:
    pre = translate(model, i[0], bn_vocal, en_vocal, bn_tokenizer)
    print(f"input : {i[0]}")
    print(f"Ground Truth : {i[1]}")
    print(f"prediction: {pre}")
    print("================================")

input : বাকেরগঞ্জ জেলা নামটি ১৭৯৭ থেকে ১৯৯৩ সালপর্যন্ত ছিল
Ground Truth : The name Bakerganj district was from 1797 to 1993 
prediction: The name Bakerganj district was from 1797 to 1993
input : জেলা সদর বরিশালের নামে বিভাগের নামকরণ করা হয়
Ground Truth : The division was named after the district headquarters Barisal 
prediction: The division was named after the district headquarters Barisal
input : বিবিধ  বাকেরগঞ্জ উপজেলার প্রায় ৮০ভাগের পেশাই চাষাবাদ
Ground Truth : Miscellaneous occupations of about 80 per cent of Bakerganj upazila 
prediction: Miscellaneous occupations of about 80 per cent of Bakerganj upazila
input : এই উপজেলার প্রায় ৮০ ভাগই ইসলাম ধর্ম অনুসারি
Ground Truth : About 80 percent of this upazila is Islamic 
prediction: About 80 percent of this upazila is Islamic
input : বাকি ২০ভাগ হিন্দু এবং খ্রীষ্টান
Ground Truth : The remaining 20 percent are Hindus and Christians 
prediction: The remaining 20 percent are Hindus and Christians
input : এই উপজেলায় ১টি সরকারি কলেজ রয়ে

## Reference
1. https://torchtutorialstaging.z5.web.core.windows.net/beginner/translation_transformer.html
2. https://arusl.medium.com/japanese-english-language-translation-with-transformer-using-pytorch-243738146806
