In [None]:
# Fetch the project repository and make it the working directory, so that the
# package imports (src.*) and the relative paths (./data, ./ckpts) used in the
# cells below resolve.
!git clone https://github.com/quangster/machine-translation
%cd machine-translation

Cloning into 'machine-translation'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 22 (delta 1), reused 19 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (22/22), 5.98 MiB | 5.75 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/machine-translation


In [None]:
# Run the repository's setup script. Per the recorded output below, it downloads
# the train/dev/test parallel corpora (*.en / *.vi files) from Google Drive
# into ./data.
!python setup.py

Retrieving folder contents
Retrieving folder 1OEYSMb7DHvhpHDkyErFfepIjeyDOiDR4 dev
Processing file 1Koyp92dplbh_S_9UW8wvskbzHW6Gb5Zw dev.en
Processing file 1KVzIWM8IUIS_NdWpctOd_l3FIm901e6L dev.vi
Retrieving folder 1FkG-m-LSXaXCrau3yD8s9_f8Llda3KoF test
Processing file 18XurJYc9T8i4JKzGRknNIMzEBD5bLDex test.en
Processing file 1atCidgee403dxm8mAWIXq9mlfdcYSXc_ test.vi
Retrieving folder 1jrfK8TmZghXISDq7JI-LTZyItZRgnEn2 train
Processing file 1jR128Bdo7vyQc1OPBCE6zEz0gXF6eUDY train.en
Processing file 1hKt2ww1-zZHzXRPl_0ijxUxdWK57XKp1 train.vi
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1Koyp92dplbh_S_9UW8wvskbzHW6Gb5Zw
To: /content/machine-translation/data/dev/dev.en
100% 1.42M/1.42M [00:00<00:00, 124MB/s]
Downloading...
From: https://drive.google.com/uc?id=1KVzIWM8IUIS_NdWpctOd_l3FIm901e6L
To: /content/machine-translation/data/dev/dev.vi
100% 1.89M/1.89M [00:00<00:00, 48.6MB/s

In [7]:
from src.data import Vocabulary, EnTokenizer, ViTokenizer, MTDataset
from src.utils.data import read_corpus

from tqdm import tqdm

**Lưu ý khi tokenize:** vì các bộ vocab được build từ văn bản chữ thường nên trước khi tokenize cũng cần chuyển câu về chữ thường bằng hàm `.lower()`.

In [4]:
# Vietnamese tokenizer demo: the sample has irregular spacing and non-standard
# diacritic placement; it is lowercased before being tokenized.
vi_tokenizer = ViTokenizer()
raw_vi_sample = "   Ðảm baỏ chất lựơng phòng  , thí nghịêm       hoá học"
vi_tokenizer.tokenize(raw_vi_sample.lower())

['đảm', 'bảo', 'chất', 'lượng', 'phòng', ',', 'thí', 'nghiệm', 'hóa', 'học']

In [5]:
# English tokenizer demo: the sample is lowercased before being tokenized.
en_tokenizer = EnTokenizer()
raw_en_sample = "Hello,     world!, I'm a student."
en_tokenizer.tokenize(raw_en_sample.lower())

['hello', ',', 'world', '!', ',', 'i', "'m", 'a', 'student', '.']

In [6]:
# Load the pre-built vocabularies from the checkpoint directory and show a
# one-line summary of each (English first, then Vietnamese).
en_vocab = Vocabulary.load('./ckpts/en_vocab.json')
vi_vocab = Vocabulary.load('./ckpts/vi_vocab.json')
print(en_vocab, vi_vocab, sep="\n")

Vocabulary[language=english, size=34687]
Vocabulary[language=vietnamese, size=21681]


In [11]:
def _build_split(en_sents, vi_sents, en_tokenizer, vi_tokenizer, max_length):
    """Tokenize, index and wrap one parallel split in an ``MTDataset``.

    Both sides are lowercased before tokenization, then mapped to vocabulary
    indexes with ``<sos>``/``<eos>`` markers added. Reads the notebook-level
    ``en_vocab`` and ``vi_vocab``.
    """
    en_tokens = [en_tokenizer.tokenize(sent.lower()) for sent in tqdm(en_sents)]
    vi_tokens = [vi_tokenizer.tokenize(sent.lower()) for sent in tqdm(vi_sents)]

    return MTDataset(
        inputs=[en_vocab.words2indexes(sent, add_sos_eos=True) for sent in en_tokens],
        outputs=[vi_vocab.words2indexes(sent, add_sos_eos=True) for sent in vi_tokens],
        max_length=max_length,
        # NOTE(review): the English pad index is used for both sides; this
        # presumably relies on '<pad>' having the same index in vi_vocab -- confirm.
        padding_idx=en_vocab['<pad>'],
    )


def get_dataset(data_dir="./data", train_start=1300000, train_end=1400000, max_length=20):
    """Build the training and validation datasets (English -> Vietnamese).

    Args:
        data_dir: Directory passed to ``read_corpus`` for both splits.
        train_start: Start index of the slice of training sentence pairs to use
            (``None`` means from the beginning).
        train_end: End index (exclusive) of that slice (``None`` means to the end).
        max_length: Forwarded to ``MTDataset`` as ``max_length`` for both splits.

    Returns:
        A ``(train_dataset, val_dataset)`` tuple of ``MTDataset`` objects, built
        from the "train" and "dev" corpora respectively.

    The defaults reproduce the original hard-coded behaviour: training pairs
    1,300,000-1,400,000 from ``./data`` and ``max_length=20``.
    """
    vi_tokenizer = ViTokenizer()
    en_tokenizer = EnTokenizer()

    # Only a window of the training corpus is used; the dev split is used whole.
    train_en_sents, train_vi_sents = read_corpus(data_dir, "train")
    train_dataset = _build_split(
        train_en_sents[train_start:train_end],
        train_vi_sents[train_start:train_end],
        en_tokenizer,
        vi_tokenizer,
        max_length,
    )

    val_en_sents, val_vi_sents = read_corpus(data_dir, "dev")
    val_dataset = _build_split(
        val_en_sents, val_vi_sents, en_tokenizer, vi_tokenizer, max_length
    )

    return train_dataset, val_dataset

Quá trình build torch Dataset có lược bớt các câu dài hơn `max_length`, nên số mẫu trong Dataset sẽ ít hơn số câu đầu vào (inputs).

In [12]:
# Build both splits, then display how many examples each one holds.
train_dataset, val_dataset = get_dataset()
tuple(len(split) for split in (train_dataset, val_dataset))

100%|██████████| 100000/100000 [00:05<00:00, 18942.26it/s]
100%|██████████| 100000/100000 [00:06<00:00, 16639.19it/s]
100%|██████████| 18719/18719 [00:01<00:00, 15111.10it/s]
100%|██████████| 18719/18719 [00:01<00:00, 10613.20it/s]


(90919, 11668)

In [15]:
# double check
X, y = train_dataset[10]

print(en_vocab.indexes2words(X.numpy()))
print(vi_vocab.indexes2words(y.numpy()))

['<sos>', 'how', 'far', 'are', 'we', 'planning', 'on', 'taking', 'this', ',', 'dude', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<sos>', 'cậu', 'định', 'đóng', 'giả', 'đến', 'khi', 'nào', ',', 'anh', 'bạn', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [None]:
# your code goes here

