In [12]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
import models
import torch
import data_reader
import torch.nn as nn
import pandas as pd
from torchtext.vocab import build_vocab_from_iterator
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from string import punctuation
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
data_vie = data_reader.read_text("./eng_vie/vi_sents")
data_eng = data_reader.read_text("./eng_vie/en_sents")

sents_vie = data_reader.to_lines(data_vie)
sents_eng = data_reader.to_lines(data_eng)

In [14]:
dataset = pd.DataFrame({'sents_eng': sents_eng, 'sents_vie': sents_vie})

In [15]:
def clean_text(text, lowercase=False, remove_punc=False, remove_num=False, sos_token='<sos>', eos_token='<eos>'):
    if lowercase:
        text = text.lower()
    if remove_punc:
        text = ''.join([ch for ch in text if ch not in punctuation])
    if remove_num:
        text = ''.join([ch for ch in text if ch not in '1234567890'])
    text = [sos_token] + word_tokenize(text) + [eos_token]
    return text

In [16]:
dataset['clean_eng'] = dataset['sents_eng'].apply(lambda x: clean_text(x, lowercase=True, remove_punc=True, remove_num=False))
dataset['clean_vie'] = dataset['sents_vie'].apply(lambda x: clean_text(x, lowercase=True, remove_punc=True, remove_num=False))

In [17]:
unk_token = '<unk>'
pad_token = '<pad>'
sos_token = '<sos>'
eos_token = '<eos>'

In [18]:
specials = [unk_token, pad_token, sos_token, eos_token]

In [19]:
eng_vocab = build_vocab_from_iterator(dataset['clean_eng'], specials = specials)
vie_vocab = build_vocab_from_iterator(dataset['clean_vie'], specials = specials)

In [20]:
def text_to_number(text, vocab):
    return vocab.lookup_indices(text)

In [21]:
dataset['eng_nums'] = dataset['clean_eng'].apply(lambda x: text_to_number(x, eng_vocab))
dataset['vie_nums'] = dataset['clean_vie'].apply(lambda x: text_to_number(x, vie_vocab))

In [22]:
def train_test(dataset, test_size = 0.2):
    train, test = train_test_split(dataset, test_size=test_size, random_state = 42)
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    return train, test

In [30]:
train, test = train_test(dataset)

In [23]:
pad_index = eng_vocab[pad_token]
unk_index = eng_vocab[unk_token]

In [24]:
vie_vocab.set_default_index(unk_index)
eng_vocab.set_default_index(unk_index)

In [25]:
SRC_VOCAB_SIZE = len(eng_vocab)
TGT_VOCAB_SIZE = len(vie_vocab)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

In [26]:
best_model = models.EngToVieTranslation(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)



In [27]:
 best_model.load_state_dict(torch.load("best_translate_model.pth", map_location=device))
 best_model = best_model.to(device)

In [28]:
def translate(model, eng_sent, eng_vocab, vie_vocab, device, max_length=50):

    model.eval()

    eng_sent = eng_vocab.lookup_indices([sos_token] + word_tokenize(eng_sent.lower()) + [eos_token])
    eng_sent = torch.tensor(eng_sent).unsqueeze(1).to(device)
    
    answer = torch.tensor([[vie_vocab[sos_token]]]).to(device)
    
    
    with torch.no_grad():
        for _ in range(max_length):
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(eng_sent, answer)
            output = model(eng_sent, answer, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)
            
            output = output[-1].argmax().item()
            answer = torch.cat([answer, torch.tensor([[output]], device = device)], dim = 0)
            if output == eng_vocab[eos_token]:
                break
    return answer

In [31]:
eng_sents = test[0:10][['sents_eng', 'sents_vie']].values

In [33]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=device).type(torch.bool)

    src_padding_mask = (src == pad_index).transpose(0, 1)
    tgt_padding_mask = (tgt == pad_index).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [34]:
for eng_sent, vie_sent in eng_sents:
    translated = translate(best_model, eng_sent, eng_vocab, vie_vocab, device)
    translated = ' '.join([vie_vocab.get_itos()[i] for i in translated])
    print(f"Eng: {eng_sent}")
    print(f"Vie: {vie_sent}")
    print(f"Translated: {translated}")
    print("\n")



Eng: I lost my camera the other day.
Vie: tôi đã mất máy ảnh của tôi vào ngày khác.
Translated: <sos> tôi bị mất máy ảnh vào ngày khác <eos>


Eng: I'm not very good with children.
Vie: Tôi không tốt với trẻ em.
Translated: <sos> tôi rất giỏi vì con cái nhìn không phải bằng cách <eos>


Eng: Don't you worry about that?
Vie: bạn không lo lắng về điều đó?
Translated: <sos> bạn lo lắng về điều đó phải làm gì <eos>


Eng: He did his best to the end
Vie: anh ấy đã làm hết sức mình đến cuối cùng
Translated: <sos> anh ấy đã làm hết sức mình đến cuối cùng <eos>


Eng: Tom, I'm in trouble. I need you to come get me.
Vie: tom, tôi đang gặp rắc rối tôi cần bạn đến để có được tôi
Translated: <sos> tôi cần quay lại để khiến bạn ngắt tôi gặp rắc rối <eos>


Eng: I'm just going to run down to buy some tickets.
Vie: Tôi sẽ chạy xuống để mua vé.
Translated: <sos> tôi chỉ muốn mua một số vé để làm phiền <eos>


Eng: What more could I want?
Vie: tôi muốn gì hơn nữa
Translated: <sos> tôi có thể muốn gì hơ

In [35]:
from nltk.translate.bleu_score import corpus_bleu

In [36]:
# calculate BLEU score
def calculate_bleu_score(model, data, eng_vocab, vie_vocab, device):
    translated_corpus = []
    reference_corpus = []
    for eng_sent, vie_sent in data[['sents_eng', 'sents_vie']].values:
        translated = translate(model, eng_sent, eng_vocab, vie_vocab, device)
        translated = [vie_vocab.get_itos()[i] for i in translated]
        translated_corpus.append(translated[1:-1])
        reference_corpus.append([word_tokenize(vie_sent.lower())])
        
    return corpus_bleu(reference_corpus, translated_corpus)

In [37]:
calculate_bleu_score(best_model, test, eng_vocab, vie_vocab, device)



0.3393429640302753