In [1]:
import os
import sys

# Make the notebook's parent directory importable (the `src` package imported in the
# next cell is presumably located there -- confirm against the repository layout).
#
# A notebook kernel defines no `__file__`. The previous code passed the *string*
# '__file__' to abspath(), which only worked because dirname() stripped that fake
# file name again, leaving the current working directory. Ask for it directly.
notebook_dir = os.getcwd()

# Normalise the path so the entry added to sys.path has no trailing '..' component.
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
from src.data import Vocabulary, MTDataset, ViTokenizer, EnTokenizer
from src.utils.data import read_corpus

from tqdm import tqdm

In [3]:
# Restore the English and Vietnamese vocabularies from their JSON checkpoints.
en_vocab, vi_vocab = (
    Vocabulary.load(checkpoint_path)
    for checkpoint_path in ('../ckpts/en_vocab.json', '../ckpts/vi_vocab.json')
)

In [4]:
# Echo both vocabulary objects; their repr reports the language and the vocabulary size.
en_vocab, vi_vocab

(Vocabulary[language=english, size=34687],
 Vocabulary[language=vietnamese, size=21681])

In [5]:
# Load the English and Vietnamese sides of the training split from ../data.
train_en_sents, train_vi_sents = read_corpus("../data", "train")

# Later cells pair the two lists purely by position (same slice, same order when
# building the dataset), so a length mismatch would silently misalign the pairs.
assert len(train_en_sents) == len(train_vi_sents), (
    "English and Vietnamese training corpora differ in length"
)

# Show the number of sentences on each side.
len(train_en_sents), len(train_vi_sents)

(2977999, 2977999)

In [6]:
# Restrict the experiment to a 100,000-sentence window of the training corpus.
SUBSET_START = 1300000
SUBSET_STOP = 1400000

# This cell rebinds the same names it reads. Running it a second time would slice the
# already-reduced lists and silently leave them empty, so fail loudly instead.
if len(train_en_sents) < SUBSET_STOP or len(train_vi_sents) < SUBSET_STOP:
    raise ValueError(
        "train_en_sents/train_vi_sents are shorter than SUBSET_STOP; "
        "re-run the corpus-loading cell before slicing again"
    )

train_en_sents = train_en_sents[SUBSET_START:SUBSET_STOP]
train_vi_sents = train_vi_sents[SUBSET_START:SUBSET_STOP]

In [None]:
# One tokenizer per language.
vi_tokenizer = ViTokenizer()
en_tokenizer = EnTokenizer()

# Replace every raw sentence with its token list; tqdm reports progress per corpus.
# NOTE(review): the lists are rebound under the same names, so this cell is presumably
# not safe to run twice without reloading the corpus -- confirm.
train_en_sents = list(map(en_tokenizer.tokenize, tqdm(train_en_sents)))
train_vi_sents = list(map(vi_tokenizer.tokenize, tqdm(train_vi_sents)))

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [00:05<00:00, 19538.57it/s]
100%|██████████| 100000/100000 [00:05<00:00, 17736.85it/s]


In [8]:
# Inspect one tokenized sentence pair (index 1): English first, Vietnamese second.
print(train_en_sents[1], train_vi_sents[1], sep='\n')

['When', 'did', 'you', 'guys', 'stop', 'dancing', '?']
['Các', 'anh', 'ngưng', 'nhảy', 'lúc', 'nào', 'vậy', '?']


In [9]:
# Round trip through the English vocabulary: tokens -> indexes, then indexes -> tokens.
# NOTE(review): in the recorded output the sentence-initial 'When' comes back as
# '<unk>' -- looks like a casing mismatch between tokenizer output and vocabulary; confirm.
print(
    en_vocab.words2indexes(train_en_sents[1]),
    en_vocab.indexes2words(en_vocab.words2indexes(train_en_sents[1])),
    sep='\n',
)

[3, 79, 10, 425, 298, 2964, 20]
['<unk>', 'did', 'you', 'guys', 'stop', 'dancing', '?']


In [10]:
# Round trip through the Vietnamese vocabulary: tokens -> indexes, then indexes -> tokens.
# NOTE(review): in the recorded output the sentence-initial 'Các' comes back as
# '<unk>' -- looks like a casing mismatch between tokenizer output and vocabulary; confirm.
print(
    vi_vocab.words2indexes(train_vi_sents[1]),
    vi_vocab.indexes2words(vi_vocab.words2indexes(train_vi_sents[1])),
    sep='\n',
)

[3, 23, 2033, 1044, 277, 89, 100, 24]
['<unk>', 'anh', 'ngưng', 'nhảy', 'lúc', 'nào', 'vậy', '?']


In [11]:
# Same Vietnamese round trip with add_sos_eos=True: the recorded output gains a leading
# index that decodes to '<sos>' and a trailing one that decodes to '<eos>'.
print(
    vi_vocab.words2indexes(train_vi_sents[1], add_sos_eos=True),
    vi_vocab.indexes2words(vi_vocab.words2indexes(train_vi_sents[1], add_sos_eos=True)),
    sep='\n',
)

[1, 3, 23, 2033, 1044, 277, 89, 100, 24, 2]
['<sos>', '<unk>', 'anh', 'ngưng', 'nhảy', 'lúc', 'nào', 'vậy', '?', '<eos>']


In [12]:
# MTDataset receives a single `padding_idx` (taken from the English vocabulary) while
# the targets are indexed with the Vietnamese vocabulary. That is only correct when
# both vocabularies place '<pad>' at the same index, so make the assumption explicit.
assert en_vocab['<pad>'] == vi_vocab['<pad>'], (
    "en_vocab and vi_vocab disagree on the '<pad>' index"
)

# Convert every tokenized sentence to indexes wrapped in <sos>/<eos> and hand both
# sides to the dataset, with max_length=20.
# NOTE(review): the recorded outputs of the round-trip cells show sentence-initial
# tokens ('When', 'Các') mapped to '<unk>' -- presumably a casing mismatch between the
# tokenizer output and the vocabulary; confirm before training on this data.
train_dataset = MTDataset(
    inputs=[en_vocab.words2indexes(sent, add_sos_eos=True) for sent in train_en_sents],
    outputs=[vi_vocab.words2indexes(sent, add_sos_eos=True) for sent in train_vi_sents],
    max_length=20,
    padding_idx=en_vocab['<pad>'],
)

In [13]:
import torch
from torch.utils.data import DataLoader

# Seed the shuffling so the batch sampled in the following cells (and the outputs
# recorded for it) can be reproduced after a kernel restart.
SHUFFLE_SEED = 42

# Batches of 32 examples, reshuffled each epoch by the seeded generator.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)

In [14]:
# Pull a single mini-batch from the loader and show the shape of both tensors.
X, y = next(iter(train_loader))
tuple(tensor.shape for tensor in (X, y))

(torch.Size([32, 20]), torch.Size([32, 20]))

In [16]:
# Decode the first example of the batch back to tokens: the source row with the
# English vocabulary, the target row with the Vietnamese one.
print(
    en_vocab.indexes2words(X[0].numpy()),
    vi_vocab.indexes2words(y[0].numpy()),
    sep='\n',
)

['<sos>', '<unk>', 'are', 'they', 'wasting', 'their', 'time', 'with', 'him', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<sos>', '<unk>', 'phí', 'thời', 'gian', 'với', 'hắn', 'làm', 'gì', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
