In [22]:
import math
import torchtext
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from collections import Counter
from torchtext.vocab import Vocab
import io
import time
import pandas as pd
import numpy as np
import pickle
import sentencepiece as spm
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
# Seed PyTorch's global RNG so repeated runs of the notebook are reproducible.
torch.manual_seed(0)
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Only ask CUDA for a device name when CUDA is actually in use:
# torch.cuda.get_device_name(0) raises on CPU-only machines, which would
# defeat the CPU fallback selected on the line above.
print(torch.cuda.get_device_name(0) if device.type == 'cuda' else 'CPU')

NVIDIA GeForce RTX 3090 Ti


In [23]:
data_path = "./process_data/merge_data.txt"

In [24]:
def read_data(data_path):
    """Read a text file and return its lines.

    Args:
        data_path: Path of the text file to read.

    Returns:
        list[str]: One entry per line of the file; each entry keeps its
        trailing newline character.
    """
    # The corpus holds Bengali text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding (which is not UTF-8 on every
    # system). Assumes the file on disk is UTF-8 encoded.
    with open(data_path, "r", encoding="utf-8") as f:
        return f.readlines()

In [25]:
data = read_data(data_path)

In [26]:
# Split every raw line on tabs, keep the first two columns (Bengali, English)
# and drop newline characters from the second column.
data = [
    [columns[0], columns[1].replace("\n", "")]
    for columns in (line.split("\t") for line in data)
]

# Data Preprocessing

In [27]:
# Bengali digits and their ASCII counterparts, aligned by position.
bn = list("০১২৩৪৫৬৭৮৯")
en = list("0123456789")

# Lookup table: Bengali digit -> ASCII digit.
bn_en = dict(zip(bn, en))

In [28]:
print(bn_en)

{'০': '0', '১': '1', '২': '2', '৩': '3', '৪': '4', '৫': '5', '৬': '6', '৭': '7', '৮': '8', '৯': '9'}


In [29]:
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility

import re

# bn_to_en_number = lambda x
# Matches a run of one or more consecutive Bengali digits (০-৯).
bn_regex = r'[০-৯]+'
# Matches a run of one or more consecutive ASCII digits (0-9).
en_regex = r'[0-9]+'

def bn_number(bn_matches):
    """Collect the matched text of each Bengali-number regex match.

    Args:
        bn_matches: Iterable of ``re.Match`` objects.

    Returns:
        list[str]: The full matched substring of every match, in input order.
    """
    matched_text = []
    for hit in bn_matches:
        matched_text.append(hit.group(0))
    return matched_text
def en_number(en_matches):
    """Collect the matched text of each English-number regex match.

    Args:
        en_matches: Iterable of ``re.Match`` objects.

    Returns:
        list[str]: The full matched substring of every match, in input order.
    """
    matched_text = []
    for hit in en_matches:
        matched_text.append(hit.group(0))
    return matched_text

def convert_bn2en(bn_n):
    """Transliterate a string of Bengali digits into ASCII digits.

    Every character of ``bn_n`` is looked up in the module-level ``bn_en``
    table, so a character missing from that table raises ``KeyError``.

    Args:
        bn_n: String made of Bengali digit characters.

    Returns:
        str: The same number written with ASCII digits.
    """
    ascii_digits = [bn_en[digit] for digit in bn_n]
    return "".join(ascii_digits)

def check_lenght(bn2en_n, en_n):
    """Pair each English number with a (converted) Bengali number.

    Args:
        bn2en_n: Numbers found in the Bengali sentence, already converted to
            ASCII digits.
        en_n: Numbers found in the English sentence.

    Returns:
        tuple[dict, bool]: ``({}, False)`` when the two sides contain a
        different amount of numbers. Otherwise ``(mapping, True)`` where
        ``mapping`` maps every English number to the Bengali number paired
        with it by position.
    """
    bn_digit_counts = [len(number) for number in bn2en_n]
    en_digit_counts = [len(number) for number in en_n]

    # A different amount of numbers on each side: no pairing is attempted.
    if len(bn_digit_counts) != len(en_digit_counts):
        return {}, False

    # When the per-number digit counts disagree, pair against the Bengali
    # numbers in reverse order instead of the original order.
    if bn_digit_counts == en_digit_counts:
        bn_candidates = bn2en_n
    else:
        bn_candidates = bn2en_n[::-1]

    return dict(zip(en_n, bn_candidates)), True
    
    
def get_number_data(bn, en):
    """Extract the numbers of a sentence pair and try to pair them up.

    Args:
        bn: Bengali sentence.
        en: English sentence.

    Returns:
        tuple: ``(mapping, status, en_numbers)`` where ``mapping`` and
        ``status`` come from ``check_lenght`` and ``en_numbers`` is the list
        of digit runs found in ``en``.
    """
    bn_numbers = [hit[0] for hit in re.finditer(bn_regex, bn, re.UNICODE)]
    en_n = [hit[0] for hit in re.finditer(en_regex, en, re.UNICODE)]
    # Rewrite the Bengali numbers with ASCII digits before pairing.
    converted = [convert_bn2en(number) for number in bn_numbers]
    maping, status = check_lenght(converted, en_n)
    return maping, status, en_n

def get_process_data(i):
    """Align the numbers of an English sentence with its Bengali source.

    Args:
        i: A ``[bengali_sentence, english_sentence]`` pair.

    Returns:
        list: ``[bengali_sentence, english_sentence]``. When the numbers of
        both sentences could be paired, every number in the English sentence
        is replaced by the Bengali number paired with it (written in ASCII
        digits); otherwise the English sentence is returned unchanged.
    """
    maping, status, _ = get_number_data(i[0], i[1])
    en_string = i[1]
    if status:
        # Substitute each whole number exactly once, in a single pass.
        # Chained str.replace calls also rewrote digits inside longer numbers
        # and re-rewrote the output of earlier replacements: with the mapping
        # {"12": "13", "1": "2"}, "12 and 1" became "23 and 2".
        en_string = re.sub(
            en_regex,
            lambda found: maping.get(found.group(0), found.group(0)),
            en_string,
        )
    return [i[0], en_string]

In [30]:
data = list(map(get_process_data, data))

In [31]:
data[-10:]

[['বাকেরগঞ্জ জেলা নামটি ১৭৯৭ থেকে ১৯৯৩ সালপর্যন্ত ছিল',
  'The name Bakerganj district was from 1797 to 1993 '],
 ['জেলা সদর বরিশালের নামে বিভাগের নামকরণ করা হয়',
  'The division was named after the district headquarters Barisal '],
 ['বিবিধ  বাকেরগঞ্জ উপজেলার প্রায় ৮০ভাগের পেশাই চাষাবাদ',
  'Miscellaneous occupations of about 80 per cent of Bakerganj upazila '],
 ['এই উপজেলার প্রায় ৮০ ভাগই ইসলাম ধর্ম অনুসারি',
  'About 80 percent of this upazila is Islamic '],
 ['বাকি ২০ভাগ হিন্দু এবং খ্রীষ্টান',
  'The remaining 20 percent are Hindus and Christians '],
 ['এই উপজেলায় ১টি সরকারি কলেজ রয়েছে',
  'There is 1 government college in this upazila '],
 ['বাকেরগঞ্জ সরকারি কলেজ', 'Bakerganj Government College '],
 ['তালু  মুখগহ্বর  তালু মুখগহ্বরের ছাদ',
  'The palate is the roof of the palate '],
 ['২০০৮ ২০০৮ গ্রেগরীয় বর্ষপঞ্জীর একটি অধিবর্ষ',
  '2006 is a leap year in the Gregorian calendar '],
 ['১৯০০ ১৯০০ গ্রেগরীয় বর্ষপঞ্জীর একটি সাধারণ বছর',
  '1900 is a typical year of the 1900 Grego

In [32]:
# English side of every sentence pair.
trainen = [pair[1] for pair in data]
# English side of the final 20000 pairs (these are also part of `trainen`).
val_en = trainen[-20000:]
len(trainen)

195775

In [33]:
val_en[:10]

["Grand's style Grand's style is a deviant nature of mathematics ",
 'The clause is 1 1 1 1 ',
 'There is debate among mathematicians about this borrowing of infinite nature ',
 'According to some, its value will be zero 0 ',
 'According to others, the sum of this section is 1 ',
 'This section is known as the Grand section ',
 'He has made several acclaimed films ',
 'He finished eighth in the 100 meters ',
 'He finished the race in 12 60 seconds ',
 'The film is directed by renowned director Belal Ahmed ']

In [34]:
trainen[-10:]

['The name Bakerganj district was from 1797 to 1993 ',
 'The division was named after the district headquarters Barisal ',
 'Miscellaneous occupations of about 80 per cent of Bakerganj upazila ',
 'About 80 percent of this upazila is Islamic ',
 'The remaining 20 percent are Hindus and Christians ',
 'There is 1 government college in this upazila ',
 'Bakerganj Government College ',
 'The palate is the roof of the palate ',
 '2006 is a leap year in the Gregorian calendar ',
 '1900 is a typical year of the 1900 Gregorian calendar ']

In [35]:
bn_tokenizer = spm.SentencePieceProcessor(model_file='model/bn_model.model')
en_tokenizer = spm.SentencePieceProcessor(model_file='model/en_model.model')

In [36]:
# en_tokenizer.encode("All residents aged 20 to 59 years who live in Japan must enroll in public pension system.")

In [37]:
# bn_tokenizer.encode("আমি আবার বিয়ে করেছি।")

In [38]:
# print(bn_tokenizer.encode_as_pieces('টমকে জিজ্ঞাসা করুন।'))
# print(bn_tokenizer.encode_as_ids('টমকে জিজ্ঞাসা করুন।'))

In [39]:
# from torchtext.vocab import vocab


In [40]:
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)


class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target logits.

    Token ids are embedded with ``TokenEmbedding``, combined with
    ``PositionalEncoding``, run through a ``TransformerEncoder`` /
    ``TransformerDecoder`` stack and projected onto the target vocabulary by
    a linear ``generator`` layer.

    The number of attention heads is read from the module-level ``NHEAD``
    constant when the model is constructed, so ``NHEAD`` must be defined
    before instantiation.

    Args:
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        emb_size: Embedding / model dimension.
        src_vocab_size: Number of entries in the source vocabulary.
        tgt_vocab_size: Number of entries in the target vocabulary.
        dim_feedforward: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability used by the positional encoding and by
            the encoder / decoder layers.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        # `dropout` is forwarded to the layers as well; previously it only
        # reached the positional encoding and the layers silently kept their
        # own default. The default (0.1) matches the layers' default, so
        # models built without an explicit `dropout` are unchanged.
        encoder_layer = TransformerEncoderLayer(
            d_model=emb_size,
            nhead=NHEAD,
            dim_feedforward=dim_feedforward,
            dropout=dropout
            )
        self.transformer_encoder = TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers
            )
        decoder_layer = TransformerDecoderLayer(
            d_model=emb_size,
            nhead=NHEAD,
            dim_feedforward=dim_feedforward,
            dropout=dropout
            )
        self.transformer_decoder = TransformerDecoder(
            decoder_layer,
            num_layers=num_decoder_layers
            )

        # Projects decoder states onto target-vocabulary logits.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        Args:
            src: Source token ids (sequence dimension first, as the layers
                are built without ``batch_first``).
            trg: Target token ids fed to the decoder.
            src_mask: Attention mask for the encoder self-attention.
            tgt_mask: Attention mask for the decoder self-attention.
            src_padding_mask: Key-padding mask for the source sequence.
            tgt_padding_mask: Key-padding mask for the target sequence.
            memory_key_padding_mask: Key-padding mask applied to the encoder
                output inside the decoder.

        Returns:
            Tensor: Output of ``generator`` applied to the decoder states.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        # The fourth positional argument is the memory mask; None leaves
        # attention over the encoder output unrestricted.
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and return the encoder output (the decoder's memory)."""
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and return decoder states (before ``generator``)."""
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [41]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then dropout.

    Args:
        emb_size: Embedding dimension of the inputs.
        dropout: Dropout probability applied after adding the encoding.
        maxlen: Number of positions for which the table is precomputed.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super().__init__()
        # One frequency per (sin, cos) column pair.
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        positions = torch.arange(0, maxlen).unsqueeze(1)
        angles = positions * den

        table = torch.zeros(maxlen, emb_size)
        table[:, 0::2] = torch.sin(angles)   # even columns
        table[:, 1::2] = torch.cos(angles)   # odd columns

        self.dropout = nn.Dropout(dropout)
        # Stored as (maxlen, 1, emb_size) so it broadcasts over dim 1 of the
        # input; a buffer follows the module across devices but is not trained.
        self.register_buffer('pos_embedding', table.unsqueeze(1))

    def forward(self, token_embedding: Tensor):
        """Add the encodings of the first ``token_embedding.size(0)`` positions."""
        seq_len = token_embedding.size(0)
        with_positions = token_embedding + self.pos_embedding[:seq_len]
        return self.dropout(with_positions)

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Dimension of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings of ``tokens`` (cast to int64 first)."""
        vectors = self.embedding(tokens.long())
        return vectors * math.sqrt(self.emb_size)

In [42]:
def generate_square_subsequent_mask(sz):
    """Build an additive causal attention mask of shape ``(sz, sz)``.

    Entry ``[i, j]`` is ``0.0`` when ``j <= i`` and ``-inf`` otherwise. The
    tensor is created on the module-level ``device``.
    """
    # True on and below the diagonal.
    allowed = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    additive = allowed.float()
    additive = additive.masked_fill(~allowed, float('-inf'))
    additive = additive.masked_fill(allowed, float(0.0))
    return additive

def create_mask(src, tgt):
    """Create the attention and padding masks for a source/target batch.

    Args:
        src: Source token ids; dim 0 is used as the sequence length.
        tgt: Target token ids; dim 0 is used as the sequence length.

    Returns:
        tuple: ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``.
        ``src_mask`` is an all-False boolean square mask, ``tgt_mask`` is the
        causal mask from ``generate_square_subsequent_mask``, and the padding
        masks are True where the token equals ``PAD_IDX``, with dims 0 and 1
        swapped relative to the inputs.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # Source attention is unrestricted: nothing is masked out.
    src_mask = torch.zeros((src_len, src_len), device=device).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_is_pad = src == PAD_IDX
    tgt_is_pad = tgt == PAD_IDX
    return src_mask, tgt_mask, src_is_pad.transpose(0, 1), tgt_is_pad.transpose(0, 1)

In [43]:
# Load the pickled Bengali (source) vocabulary.
# NOTE: pickle can execute arbitrary code while loading, so only load
# vocabulary files that come from a trusted source.
# The context manager closes the file even when unpickling raises.
with open('model/bn_vocab.pkl', 'rb') as file:
    bn_vocal = pickle.load(file)

In [44]:
# Load the pickled English (target) vocabulary.
# NOTE: pickle can execute arbitrary code while loading, so only load
# vocabulary files that come from a trusted source.
# The context manager closes the file even when unpickling raises.
with open('model/en_vocab.pkl', 'rb') as file:
    en_vocal = pickle.load(file)

In [45]:
from tqdm import tqdm


# Model hyper-parameters. They are passed to Seq2SeqTransformer below and have
# to match the values the checkpoint was trained with, otherwise
# load_state_dict cannot match the stored weights.
EMB_SIZE = 512
# Number of attention heads; read as a global inside Seq2SeqTransformer.
NHEAD = 8
# Hidden size of the feed-forward sub-layers.
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 6
NUM_DECODER_LAYERS = 6
# Not referenced by the code visible in this notebook (inference only).
NUM_EPOCHS = 300

# Vocabulary sizes: source is Bengali, target is English.
SRC_VOCAB_SIZE = len(bn_vocal)
TGT_VOCAB_SIZE = len(en_vocal)

# Special-token ids are looked up in the Bengali vocabulary only, yet
# greedy_decode/translate also use BOS_IDX and EOS_IDX on the English side.
# NOTE(review): assumes both vocabularies assign the same ids to
# <pad>/<bos>/<eos> - TODO confirm.
PAD_IDX = bn_vocal['<pad>']
BOS_IDX = bn_vocal['<bos>']
EOS_IDX = bn_vocal['<eos>']

# transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
#                                EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
#                                  FFN_HID_DIM)
# optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

## Inference
This section runs inference after loading the trained SentencePiece tokenizer models, the vocabularies, and the trained translation model.

In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [47]:
bn_tokenizer = spm.SentencePieceProcessor(model_file='model/bn_model.model')
en_tokenizer = spm.SentencePieceProcessor(model_file='model/en_model.model')

In [48]:
PATH = "model/model_checkpoint.pt"

# Rebuild the architecture with the same hyper-parameters, then restore the
# trained weights from the checkpoint.
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)
model.to(device)
# map_location places the stored tensors on the device selected above. Without
# it, a checkpoint holding CUDA tensors cannot be loaded on a CPU-only machine,
# even though `device` already falls back to the CPU.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# Switch to evaluation mode (disables dropout) for inference.
model.eval()

Seq2SeqTransformer(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): D

In [49]:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate a target sequence by picking the highest-scoring token each step.

    Args:
        model: Model exposing ``encode``, ``decode`` and ``generator``.
        src: Source token ids for one sentence; ``translate`` passes a
            ``(num_tokens, 1)`` long tensor.
        src_mask: Attention mask for the encoder.
        max_len: Maximum length of the generated sequence, start symbol
            included.
        start_symbol: Token id used to start decoding.

    Returns:
        Tensor: Long tensor of shape ``(generated_len, 1)`` that starts with
        ``start_symbol`` and ends with ``EOS_IDX`` when it was produced before
        ``max_len`` was reached.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    # Inference only: skip autograd bookkeeping for the encoder pass and for
    # every decoding step.
    with torch.no_grad():
        # The encoder output already lives on `device` and does not change
        # between steps, so it is computed once, outside the loop.
        memory = model.encode(src, src_mask)
        for _ in range(max_len-1):
            # Boolean causal mask: True marks positions that may not be attended.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Vocabulary scores for the most recent position only.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim = 1)
            next_word = next_word.item()
            ys = torch.cat([ys,torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    """Translate one source sentence with greedy decoding.

    The model is not switched to evaluation mode here; the caller is expected
    to have called ``model.eval()`` beforehand.

    Args:
        model: Trained translation model passed on to ``greedy_decode``.
        src: Source sentence as plain text.
        src_vocab: Source vocabulary exposing ``get_stoi()``.
        tgt_vocab: Target vocabulary exposing ``get_itos()``.
        src_tokenizer: Tokenizer whose ``encode(text, out_type=str)`` returns
            sub-word pieces.

    Returns:
        str: The predicted sentence with sub-word pieces merged into words.

    Raises:
        KeyError: If the tokenizer produces a piece that is missing from
            ``src_vocab``.
    """
    # Fetch the token->id mapping once instead of calling get_stoi() again
    # for every single piece of the sentence.
    stoi = src_vocab.get_stoi()
    pieces = src_tokenizer.encode(src, out_type=str)
    tokens = [BOS_IDX] + [stoi[tok] for tok in pieces] + [EOS_IDX]
    num_tokens = len(tokens)
    src = torch.LongTensor(tokens).reshape(num_tokens, 1)
    # All-False mask: the encoder may attend to every source position.
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    # Same for the id->token list: look it up once, not once per token.
    itos = tgt_vocab.get_itos()
    p_text = " ".join([itos[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
    # Pieces carry "▁" where a new word starts: drop the spaces inserted by the
    # join above, then turn every "▁" into a word boundary.
    pts = " ".join(p_text.replace(" ", "").split("▁"))
    return pts.strip()

In [50]:
# Translate the last ten sentence pairs and print source, reference and
# prediction side by side. (A hard-coded sample sentence used to be assigned
# on every iteration but was never translated or read, so it was removed.)
for i in data[-10:]:
    pre = translate(model, i[0], bn_vocal, en_vocal, bn_tokenizer)
    print(f"input : {i[0]}")
    print(f"Ground Truth : {i[1]}")
    print(f"prediction: {pre}")
    print("================================")

input : বাকেরগঞ্জ জেলা নামটি ১৭৯৭ থেকে ১৯৯৩ সালপর্যন্ত ছিল
Ground Truth : The name Bakerganj district was from 1797 to 1993 
prediction: The name Bakerganj district was from 1797 to 1993
input : জেলা সদর বরিশালের নামে বিভাগের নামকরণ করা হয়
Ground Truth : The division was named after the district headquarters Barisal 
prediction: The division was named after the district headquarters Barisal
input : বিবিধ  বাকেরগঞ্জ উপজেলার প্রায় ৮০ভাগের পেশাই চাষাবাদ
Ground Truth : Miscellaneous occupations of about 80 per cent of Bakerganj upazila 
prediction: Miscellaneous occupations of about 80 per cent of Bakerganj upazila
input : এই উপজেলার প্রায় ৮০ ভাগই ইসলাম ধর্ম অনুসারি
Ground Truth : About 80 percent of this upazila is Islamic 
prediction: About 80 percent of this upazila is Islamic
input : বাকি ২০ভাগ হিন্দু এবং খ্রীষ্টান
Ground Truth : The remaining 20 percent are Hindus and Christians 
prediction: The remaining 20 percent are Hindus and Christians
input : এই উপজেলায় ১টি সরকারি কলেজ রয়ে

In [76]:
# very short
from nltk.translate.bleu_score import sentence_bleu

def get_token(text):
    """Split ``text`` into sub-word pieces with the English tokenizer."""
    pieces = en_tokenizer.encode_as_pieces(text)
    return pieces
def get_blue_score(gt, pt):
    """Return the sentence-level BLEU score of ``pt`` against ``gt``.

    Args:
        gt: List of reference token lists.
        pt: Token list of the prediction.
    """
    return sentence_bleu(gt, pt)

In [None]:

def evaluation(text):
    """Translate one sentence pair and score the prediction with BLEU.

    Args:
        text: A ``[bengali_sentence, english_reference]`` pair.

    Returns:
        The sentence-level BLEU score of the model's translation of
        ``text[0]`` against the reference ``text[1]``, both tokenized with
        ``get_token``.
    """
    # The vocabularies are loaded under the names bn_vocal / en_vocal in this
    # notebook; the previous bn_vocab / en_vocab names were never defined and
    # raised NameError on the first call.
    pre = translate(model, text[0], bn_vocal, en_vocal, bn_tokenizer)
    gt = get_token(text[1])
    pt = get_token(pre)
    # sentence_bleu expects a list of references, hence the single-item list.
    return get_blue_score([gt], pt)
    
# Score every sentence pair in `data`, then report the mean over all pairs.
score = [evaluation(pair) for pair in data]
print("BLUE SCORE : ", sum(score)/len(score))

## Reference
1. https://torchtutorialstaging.z5.web.core.windows.net/beginner/translation_transformer.html
2. https://arusl.medium.com/japanese-english-language-translation-with-transformer-using-pytorch-243738146806
3. https://github.com/hyunwoongko/transformer
4. https://www.kaggle.com/datasets/ari994/banglaenglishtransliteration
