In [6]:
import os
import pandas as pd

from torchtext import data, datasets

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import torch
import time

In [8]:
# import spacy
# from spacy.lang.vi import Vietnamese
# from spacy.lang.zh import Chinese

# nlp = {
#     'en': spacy.load('en_core_web_sm'),
#     'vi': Vietnamese(),
#     'zh': Chinese()
# }

In [9]:
# Language codes and the DataFrame column each one is stored under; the two
# lists are matched by position (languages[0] -> names[0], and so on).
languages, names = ['vi', 'en'], ['src', 'tgt']

# NOTE(review): not referenced by any other cell visible in this notebook --
# presumably a leftover subsampling knob; confirm before relying on it.
num_examples = 100

In [556]:
basepath = './iwslt-vi-en'
filename = '{}.tok.{}'


def _read_sentences(path):
    """Return the lines of `path` as a list of strings, one entry per line.

    The file is read as plain UTF-8 text rather than through `pd.read_csv`,
    so double quotes, tabs, blank lines and NA-looking tokens are kept
    verbatim instead of being interpreted as CSV syntax. Trailing blank lines
    at the very end of the file are dropped.
    """
    with open(path, encoding='utf-8') as handle:
        lines = [line.rstrip('\n') for line in handle]
    while lines and lines[-1] == '':
        lines.pop()
    return lines


# NOTE(review): only the 'train' split is merged here, while a later cell also
# loads 'dev.tok.csv' -- presumably that file was produced by an earlier run
# with 'dev' in this list; confirm.
for t in ['train']:
    paths = [os.path.join(basepath, filename.format(t, lang)) for lang in languages]

    # One list of sentences per language, in the same order as `paths`.
    sentences = [_read_sentences(path) for path in paths]

    # The files are combined row by row, so they must have the same number of
    # lines; fail loudly instead of padding the shorter side with NaN.
    line_counts = [len(lines) for lines in sentences]
    if len(set(line_counts)) > 1:
        raise ValueError(
            'Parallel files differ in length: {}'.format(dict(zip(paths, line_counts))))

    # Combine the per-language sentence lists into one frame, one column each.
    df = pd.DataFrame(
        {names[idx]: lines for idx, lines in enumerate(sentences)},
        columns=names[:len(sentences)],
    )

    # Save as a tab-separated file with a header row and no index column.
    df.to_csv(os.path.join(basepath, filename.format(t, 'csv')), sep='\t', index=False)

In [542]:
# Scratch experiment: re-read each corpus file on its own, this time with ';'
# as the separator and quote-doubling disabled. Relies on `paths` left behind
# by the merge cell and expects exactly two files (one frame per name).
a, b = (
    pd.read_csv(
        path,
        names=[names[idx]],
        sep=";",
        doublequote=False,
        encoding='utf-8',
    )
    for idx, path in enumerate(paths)
)

In [555]:
# Scratch export: dump frame `a` to test.csv without index or header, ending
# every row with three newlines.
# NOTE(review): newer pandas releases spell this keyword `lineterminator`;
# confirm against the installed pandas version before changing it.
a.to_csv(
    'test.csv',
    sep='\t',
    index=False,
    header=None,
    line_terminator='\n\n\n',
)
# a

In [11]:
# Special symbols: begin-of-sentence, end-of-sentence and padding.
BOS_WORD, EOS_WORD, BLANK_WORD = '<s>', '</s>', "<blank>"

In [12]:
# v = 'Khi tôi còn nhỏ, Tôi nghĩ rằng BắcTriều Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài "Chúng ta chẳng có gì phải ghen tị."'
# v = '我 11 岁 那年    记得 得有 一天 早晨 醒来   听见 家里 有 愉悦 的 声音'

In [13]:
# def tokenize(language):
#     return lambda sentence: [tok.text for tok in nlp[language](sentence)]

In [14]:
# Source field: only the padding symbol is customised.
SRC = data.Field(
    pad_token=BLANK_WORD,
)

# Target field: same padding symbol, plus begin/end-of-sentence markers.
TGT = data.Field(
    pad_token=BLANK_WORD,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
)

In [17]:
# Names (relative to `basepath`) of the merged tab-separated files.
train_file, val_file = (filename.format(split, 'csv') for split in ('train', 'dev'))

# Load both splits; column 1 feeds the SRC field, column 2 the TGT field.
train, val = data.TabularDataset.splits(
    path=basepath,
    format='tsv',
    train=train_file,
    validation=val_file,
    fields=[('src', SRC), ('tgt', TGT)],
    # The first row is skipped -- presumably the src/tgt header row written
    # by the merge cell; confirm the dev file has one as well.
    skip_header=True,
)

In [18]:
# Frequency threshold handed to both vocabulary builders.
MIN_FREQ = 1

# Both vocabularies are built from the training split only.
for _field, _column in ((SRC, train.src), (TGT, train.tgt)):
    _field.build_vocab(_column, min_freq=MIN_FREQ)

In [19]:
# Report vocabulary sizes: target first, then source.
for _field in (TGT, SRC):
    print(len(_field.vocab))

54173
42150


In [20]:
class MyIterator(data.Iterator):
    """`data.Iterator` variant with a custom batching strategy.

    Training: examples are grouped into pools of ``100 * batch_size`` via
    `data.batch`, each pool is sorted with `self.sort_key`, cut into
    minibatches (honouring `self.batch_size_fn`), and the minibatches of a
    pool are passed through `self.random_shuffler` before being yielded.

    Evaluation: examples are batched in data order and every batch is sorted
    with `self.sort_key`.
    """

    def create_batches(self):
        """Set `self.batches` for the next pass over the data."""
        if not self.train:
            # Evaluation path: materialise every batch, each one sorted.
            self.batches = [
                sorted(chunk, key=self.sort_key)
                for chunk in data.batch(self.data(), self.batch_size,
                                        self.batch_size_fn)
            ]
            return

        def pooled_minibatches(examples, shuffle):
            """Lazily yield shuffled, sorted minibatches pool by pool."""
            pool_size = self.batch_size * 100
            for pool_chunk in data.batch(examples, pool_size):
                ordered = sorted(pool_chunk, key=self.sort_key)
                minibatches = list(
                    data.batch(ordered, self.batch_size, self.batch_size_fn))
                for minibatch in shuffle(minibatches):
                    yield minibatch

        # Training path: a generator, so batches are produced on demand.
        self.batches = pooled_minibatches(self.data(), self.random_shuffler)

In [21]:
# Alias for the iterator implementation to use; the commented-out line below
# is the stock torchtext alternative.
# NOTE(review): the active cells visible in this notebook construct
# `MyIterator` / `data.BucketIterator` directly, so this alias looks unused.
Iterator = MyIterator
# Iterator = data.Iterator

In [23]:
# Running per-batch maxima live in the module-level names `max_src_in_batch`
# and `max_tgt_in_batch`; they are (re)initialised whenever a batch starts.
def batch_size_fn(new, count, sofar):
    """Return the padded size of the batch once `new` has been added.

    Args:
        new: example exposing `src` and `tgt` sequences.
        count: number of examples in the batch including `new`; a value of 1
            marks the start of a fresh batch and resets the running maxima.
        sofar: unused.

    Returns:
        ``count`` times the longest sequence seen so far in the batch, taking
        the larger of the source-side and target-side figures.
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch, max_tgt_in_batch = 0, 0

    src_len = len(new.src)
    # +2 presumably accounts for the begin/end-of-sentence tokens added on
    # the target side -- confirm against the TGT field configuration.
    tgt_len = len(new.tgt) + 2

    if src_len > max_src_in_batch:
        max_src_in_batch = src_len
    if tgt_len > max_tgt_in_batch:
        max_tgt_in_batch = tgt_len

    return max(count * max_src_in_batch, count * max_tgt_in_batch)


def rebatch(pad_idx, batch):
    """Wrap a torchtext batch in the model's `Batch` type.

    The first two dimensions of `batch.src` and `batch.tgt` are swapped
    before they are handed to `Batch` together with `pad_idx`.
    """
    src_swapped = batch.src.transpose(0, 1)
    tgt_swapped = batch.tgt.transpose(0, 1)
    return Batch(src_swapped, tgt_swapped, pad_idx)

In [24]:
# BATCH_SIZE = 12000
# train_iter = Iterator(train, batch_size=BATCH_SIZE, device=None,
#                         repeat=False, sort_key=lambda x: (len(x.src), len(x.tgt)),
#                         batch_size_fn=batch_size_fn, train=True)


# batch = next(iter(train_iter))
# batch

In [25]:
# ' '.join(list(map(lambda x: TGT.vocab.itos[x], batch.tgt.transpose(0, 1)[-5])))

In [26]:
from model import make_model, LabelSmoothing, NoamOpt, Batch, SimpleLossCompute

In [85]:
# Index of the padding symbol in the target vocabulary. Looked up through
# BLANK_WORD -- the same constant the fields were configured with -- so the
# two cannot drift apart.
pad_idx = TGT.vocab.stoi[BLANK_WORD]

# Model sized by the two vocabularies; N=6 is forwarded to make_model
# (presumably the number of encoder/decoder layers -- confirm in model.py).
model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
# model.cuda()

# Label-smoothed criterion over the target vocabulary, told which index is
# padding.
criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
# criterion.cuda()

# Batch size passed to the iterators built in the following cells.
BATCH_SIZE = 8



In [492]:
def _length_key(example):
    """Sort key for an example: (source length, target length)."""
    return len(example.src), len(example.tgt)


# NOTE(review): the next cell assigns `train_iter` / `valid_iter` again, so
# when the notebook is run top to bottom these iterators are replaced.
train_iter, valid_iter = data.BucketIterator.splits(
    (train, val),
    sort_key=_length_key,
    batch_size=BATCH_SIZE,
)

In [443]:
# Options shared by the training and validation iterators.
# NOTE(review): `batch_size_fn` returns a padded token count, so BATCH_SIZE
# is presumably measured in tokens here rather than in sentences -- confirm
# that the configured value is intended for this iterator.
iterator_options = dict(
    batch_size=BATCH_SIZE,
    device=None,
    repeat=False,
    sort_key=lambda x: (len(x.src), len(x.tgt)),
    batch_size_fn=batch_size_fn,
)

train_iter = MyIterator(train, train=True, **iterator_options)

valid_iter = MyIterator(val, train=False, **iterator_options)

In [63]:
# e = train.examples[199]
# e.src, e.tgt

In [494]:
# Manual iterator over the training batches, used by the inspection cells below.
i = iter(train_iter)

In [495]:
# Pull one batch and display its summary.
# NOTE(review): `b` was also used for a DataFrame in an earlier scratch cell;
# this assignment replaces it.
b = next(i)
b


[torchtext.data.batch.Batch of size 8]
	[.src]:[torch.LongTensor of size 34x8]
	[.tgt]:[torch.LongTensor of size 30x8]

In [498]:
# Shape of the source tensor of the batch fetched above.
b.src.shape

torch.Size([34, 8])

In [28]:
def run_epoch(data_iter, model, loss_compute):
    """Run one pass over `data_iter` and return the average loss per token.

    Args:
        data_iter: iterable of batches exposing `src`, `tgt`, `src_mask`,
            `tgt_mask`, `tgt_y` and `ntokens`.
        model: callable invoked as ``model(src, tgt, src_mask, tgt_mask)``.
        loss_compute: callable invoked as
            ``loss_compute(out, batch.tgt_y, batch.ntokens)``; its return
            value is accumulated as the loss of that batch.

    Returns:
        The sum of the batch losses divided by the sum of `ntokens`.

    Raises:
        ZeroDivisionError: if `data_iter` yields nothing and the totals stay
            at integer zero.
    """
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0  # tokens seen since the last progress line
    for i, batch in enumerate(data_iter):
        # Call the model object itself instead of `.forward`; for a
        # torch.nn.Module this is the documented entry point and also runs
        # any registered hooks.
        out = model(batch.src, batch.tgt,
                    batch.src_mask, batch.tgt_mask)
        loss = loss_compute(out, batch.tgt_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            # Clamp so a clock that has not advanced cannot cause a division
            # by zero in the throughput figure.
            elapsed = max(time.time() - start, 1e-9)
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                    (i, loss / batch.ntokens, tokens / elapsed))
            # Restart the throughput window.
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

In [33]:
# Optimizer wrapper around Adam. The positional arguments are presumably
# (model size, scale factor, warm-up steps) -- confirm against model.NoamOpt.
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
                    torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

# NOTE(review): the last recorded run of this cell ended in a RuntimeError
# ("Index is supposed to be a vector"); the failing code is not in this
# notebook -- check the imported model module.
for epoch in range(1):
    # Training pass: the loss computation receives the optimizer.
    model.train()
    run_epoch((rebatch(pad_idx, b) for b in train_iter),
              model,
              SimpleLossCompute(model.generator, criterion, opt=model_opt))

    # Validation pass on the same `model` that was just trained, with no
    # optimizer attached. The previous code passed `model_par`, a name that
    # is not defined anywhere in this notebook.
    model.eval()
    loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                     model,
                     SimpleLossCompute(model.generator, criterion, opt=None))
    print(loss)

RuntimeError: invalid argument 3: Index is supposed to be a vector at /Users/soumith/code/builder/wheel/pytorch-src/aten/src/TH/generic/THTensorMath.cpp:569