In [1]:
import os
import sys
# Make ``paper_code/TRANSFORMER`` importable. It is looked up under the parent of
# the current working directory. ``os.path`` is used instead of splitting on '/'
# so the path is also built correctly where the separator is not '/'.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'paper_code', 'TRANSFORMER'))
# package load
import time
import torch
import numpy as np
import matplotlib.pyplot as plt

from models import Transformer
from train import cal_performance, get_pos
from torchtext.data import Field, Iterator
from torchtext import datasets
import spacy

In [2]:
import torch.optim as optim
from schoptim import ScheduledOptim
from lbs import LabelSmoothing
from utils import get_padding_mask
from trans_dataloader import TranslateDataset

In [6]:
# Load the German ('de') and English ('en') spaCy pipelines in that order.
spacy_de, spacy_en = (spacy.load(lang_code) for lang_code in ('de', 'en'))

def tokenize_de(text):
    """Tokenize ``text`` with ``spacy_de`` and return the token texts as a list."""
    tokens = []
    for token in spacy_de.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_en(text):
    """Tokenize ``text`` with ``spacy_en`` and return the token texts as a list."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Markers passed to TRG as its init / eos tokens.
SOS, EOS = '<s>', '</s>'

# Both fields lowercase and are batch-first; only the target field is given
# the SOS/EOS markers.
SRC = Field(tokenize=tokenize_de, lower=True, batch_first=True)
TRG = Field(
    tokenize=tokenize_en,
    init_token=SOS,
    eos_token=EOS,
    lower=True,
    batch_first=True,
)

# The predicate is true only for pairs where neither 'src' nor 'trg' is longer
# than MAX_LEN.
MAX_LEN = 50
train, valid, test = datasets.IWSLT.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
    root='../data/',
    filter_pred=lambda ex: not (
        len(vars(ex)['src']) > MAX_LEN or len(vars(ex)['trg']) > MAX_LEN
    ),
)

# Vocabularies are built from the training split only, with min_freq=MIN_FREQ.
MIN_FREQ = 2
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
TRG.build_vocab(train.trg, min_freq=MIN_FREQ)

In [4]:
# One iterator per split, each with a batch size of 16 and repeat disabled.
train_loader, valid_loader, test_loader = Iterator.splits(
    datasets=(train, valid, test),
    batch_sizes=(16,) * 3,
    repeat=False,
)

In [5]:
# Pull a single batch from the training iterator and stop; ``b`` keeps that batch.
for b in train_loader:
     break

In [6]:
# Report the length of each split, then the length of each iterator.
print('len_data - train: {} valid: {} test: {}'.format(*map(len, (train, valid, test))))
print('len_loader - train: {} valid: {} test: {}'.format(
    *map(len, (train_loader, valid_loader, test_loader))))

len_data - train: 196546 valid: 992 test: 1305
len_loader - train: 12285 valid: 62 test: 82


In [7]:
DEVICE = None  # handed to ``.to(DEVICE)`` when the model is built
# Parameters
N_LAYER = 6  # forwarded to Transformer as ``n_layer``
N_HEAD = 8  # forwarded to Transformer as ``n_head``
D_K = 64  # forwarded to Transformer as ``d_k``
D_V = 64  # forwarded to Transformer as ``d_v``
D_MODEL = D_K * N_HEAD  # 64 * 8 = 512
D_F = D_MODEL * 4  # 512 * 4 = 2048
SAVE_PATH = '../paper_code/TRANSFORMER/model/transformer.chkpt'  # checkpoint read by torch.load

In [8]:
# Build the Transformer from the hyper-parameters and the torchtext vocabularies.
# The decoder gets MAX_LEN + 2 positions — presumably room for the <s>/</s>
# markers that TRG adds (TODO confirm against models.Transformer).
# NOTE(review): pad_idx comes from the source vocabulary only; this assumes
# '<pad>' has the same index in TRG.vocab — verify.
model = Transformer(enc_vocab_len=len(SRC.vocab.stoi),
                    enc_max_seq_len=MAX_LEN, 
                    dec_vocab_len=len(TRG.vocab.stoi), 
                    dec_max_seq_len=MAX_LEN+2, 
                    n_layer=N_LAYER, 
                    n_head=N_HEAD, 
                    d_model=D_MODEL, 
                    d_k=D_K,
                    d_v=D_V,
                    d_f=D_F, 
                    pad_idx=SRC.vocab.stoi['<pad>'],
                    drop_rate=0.1, 
                    use_conv=False, 
                    return_attn=True, 
                    linear_weight_share=True, 
                    embed_weight_share=False).to(DEVICE)

In [9]:
# Restore the trained weights from SAVE_PATH. ``map_location='cpu'`` lets a
# checkpoint saved from a GPU run be read on a machine without that GPU;
# load_state_dict then copies the values into the model's own parameters.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(SAVE_PATH, map_location='cpu'))

In [10]:
def process_to_tensor(sent, vocab, unk='<unk>'):
    """Convert a tokenized sentence into a batch-of-one index tensor.

    Args:
        sent: iterable of tokens.
        vocab: mapping from token to integer index that supports ``get``
            (called here with ``SRC.vocab.stoi`` / ``TRG.vocab.stoi``).
        unk: token whose index stands in for tokens missing from ``vocab``.

    Returns:
        ``torch.LongTensor`` of shape ``(1, len(sent))``.
    """
    # The fallback index does not depend on the token, so look it up once.
    unk_idx = vocab.get(unk)
    indices = []
    for token in sent:
        idx = vocab.get(token)
        # Compare against None explicitly: index 0 is a valid vocabulary entry.
        indices.append(unk_idx if idx is None else idx)
    return torch.LongTensor([indices])

In [11]:
# Pick one test example at random (unseeded, so it differs between runs).
test_ex = np.random.choice(test.examples)
src_sent, trg_sent = test_ex.src, test_ex.trg
# Source side: token indices as a batch of one, plus their positions.
src = process_to_tensor(src_sent, vocab=SRC.vocab.stoi)
src_pos = get_pos(src)
# Target side starts as just the '<s>' token (the same string as SOS).
trg = process_to_tensor(['<s>'], vocab=TRG.vocab.stoi)

---

In [55]:
# Run the encoder on the source; ``memory`` is later passed to ``model.decoder``.
memory, enc_attns = model.encoder(src, src_pos)

In [67]:
# Positions for the current target prefix. Computed here so the cell also runs on
# a fresh kernel, where no earlier cell has defined ``trg_pos`` yet.
trg_pos = get_pos(trg)
# One decoder pass over the whole prefix, then project only the last position.
output, dec_attns, dec_enc_attns = model.decoder(trg, trg_pos, src, memory)
output = model.projection(output[:, -1])

In [68]:
# Append the index of the largest value in ``output`` (along dim 1) to the prefix.
# ``trg[0]`` means this only handles a batch of one.
# NOTE: not idempotent — every run of this cell makes ``trg`` one element longer.
trg = torch.cat((trg[0], output.max(1)[1])).unsqueeze(0)

In [69]:
# Recompute the positions so they match the current ``trg``.
trg_pos = get_pos(trg)

In [70]:
# Display the target prefix built so far.
trg

tensor([[2, 2, 2, 2, 2]])

---

In [12]:
def decode(model, src, src_pos, trg, max_len=100, eos_idx=3):
    """Greedy decoding for a single sentence.

    The source is encoded once. Then, for at most ``max_len`` steps, the decoder
    is run on the current target prefix, the highest-scoring index at the last
    position is appended, and decoding stops once that index equals ``eos_idx``.

    Args:
        model: object exposing ``encoder``, ``decoder`` and ``projection``
            callables and the ``training`` / ``eval()`` / ``train()`` interface
            of ``torch.nn.Module``.
        src: source indices, batch of one.
        src_pos: positions for ``src`` (the notebook builds them with ``get_pos``).
        trg: initial target prefix, batch of one (the notebook passes just '<s>').
        max_len: maximum number of indices to generate.
            NOTE(review): the model above is built with dec_max_seq_len=MAX_LEN+2;
            presumably the prefix must stay within that — confirm before
            relying on the default of 100.
        eos_idx: index that ends decoding.

    Returns:
        1-D tensor holding the initial prefix followed by the generated indices;
        ``eos_idx`` is included when it was produced.
    """
    # Decoding should not be affected by dropout: switch to eval mode for the
    # call and afterwards restore whichever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed here, so do not record an autograd graph.
        with torch.no_grad():
            memory, enc_attns = model.encoder(src, src_pos)
            for _ in range(max_len):
                trg_pos = get_pos(trg)
                output, dec_attns, dec_enc_attns = model.decoder(trg, trg_pos, src, memory)
                # Only the last position is projected; its argmax is the next index.
                next_token = model.projection(output[:, -1]).max(1)[1]
                # ``trg[0]`` and ``.item()`` rely on a batch size of one.
                trg = torch.cat((trg[0], next_token)).unsqueeze(0)
                if next_token.item() == eos_idx:
                    break
    finally:
        model.train(was_training)
    return trg.squeeze(0)


In [13]:
# Greedily decode ``src`` starting from ``trg``, generating at most 50 indices.
pred = decode(model, src, src_pos, trg, max_len=50)

In [17]:
# Full forward pass through the model on the current ``src`` / ``trg``; the call
# returns two values (the model was built with return_attn=True).
o, attns = model(src, src_pos, trg, get_pos(trg))

In [19]:
# Largest value along dim 1 of ``o`` and the index where it occurs.
o.max(1)

(tensor([24.5936], grad_fn=<MaxBackward0>), tensor([2]))

---

In [69]:
# Take the first training batch and build position tensors for both sides.
# NOTE: this rebinds ``src`` / ``trg`` / ``src_pos`` / ``trg_pos``.
for i, batch in enumerate(train_loader):
    src = batch.src
    trg = batch.trg
    src_pos = get_pos(src)
    trg_pos = get_pos(trg)
    break

In [70]:
# Output of the encoder's ``pos_layer`` for the source positions.
a = model.encoder.pos_layer(src_pos)

In [71]:
# Output of the encoder's ``embed_layer`` for the source indices.
# NOTE: rebinds ``b``, which an earlier cell used for the first training batch.
b = model.encoder.embed_layer(src)

In [74]:
# Show both sizes side by side to check that the two outputs have the same shape.
a.size(), b.size()

(torch.Size([16, 42, 512]), torch.Size([16, 42, 512]))

---

In [None]:
# optimizer = ScheduledOptim(optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
#                        betas=(0.9, 0.98), eps=1e-09), 
#                        D_MODEL, 
#                        4000)
# loss_function = LabelSmoothing(trg_vocab_size=len(TRG.vocab.stoi), 
#                                pad_idx=TRG.vocab.stoi['<pad>'], 
#                                eps=0.1)

# model = model.to(DEVICE)
# model.train()
# loss_per_step = 0
# total_words = 0
# correct_words = 0

# start_time = time.time()
# for i, batch in enumerate(train_loader):
#     src_pos, trg_pos = map(get_pos, [batch.src, batch.trg])
#     src, src_pos, trg, trg_pos = map(lambda x: x.to(DEVICE), [batch.src, src_pos, batch.trg, trg_pos])
#     model.zero_grad()
#     # forward
#     output = model(enc=src, enc_pos=src_pos, dec=trg, dec_pos=trg_pos)
#     # loss and backward
#     pred = output.to('cpu')
#     target = trg.detach().to('cpu')
#     loss, n_correct = cal_performance(pred, target, loss_function, pad_idx=model.pad_idx)
#     loss.backward()        
#     # update parameters
#     optimizer.step_and_update_lr()

#     # eval
#     n_words = target.view(-1).ne(model.pad_idx).sum().item()
#     total_words += n_words
#     correct_words += n_correct
#     loss_per_step += loss.item()

#     end_time = time.time()
#     total_time = end_time-start_time
#     print(' > [{}/{}] loss_per_batch: {:.4f} time: {:.1f} s'.format(i, len(train_loader), 
#                                                                      loss.item()/n_words, total_time))
#     start_time = time.time()

# accuracy = correct_words / total_words
# loss_per_step = loss_per_step / total_words

---

Reference: The Annotated Transformer (Harvard NLP) — http://nlp.seas.harvard.edu/2018/04/03/attention.html

In [None]:
# NOTE: this cell rebinds names already defined earlier in the notebook
# (DEVICE, SOS, EOS, train, valid, test, N_LAYER ... D_F, SAVE_PATH).
# cuda settings
DEVICE = None
# TRAIN_PATH = '../data/translation/de_en_small.txt'
# VALID_PATH = '../data/translation/de_en_small_valid.txt'
TRAIN_PATH = '../data/translation/en_fa2.train'
VALID_PATH = '../data/translation/en_fa2.valid'
TEST_PATH = '../data/translation/en_fa2.test'
EXTS = 'src-trg'
SOS = '<s>'
EOS = '</s>'
STEP = 10
BATCH = 3
# Create Dataset
# The training set supplies the vocabularies; valid and test are given those
# same vocabularies through ``vocab=``.
# NOTE(review): ``exts`` is passed for train only — confirm that
# TranslateDataset's default suits the valid/test files.
train = TranslateDataset(path=TRAIN_PATH, exts=EXTS, sos=SOS, eos=EOS)
SRC_VOCAB = train.src_vocab
TRG_VOCAB = train.trg_vocab
valid = TranslateDataset(path=VALID_PATH, sos=SOS, eos=EOS, vocab=[('src', SRC_VOCAB), ('trg', TRG_VOCAB)])
test = TranslateDataset(path=TEST_PATH, sos=SOS, eos=EOS, vocab=[('src', SRC_VOCAB), ('trg', TRG_VOCAB)])
# Parameters
N_LAYER = 6
N_HEAD = 8
D_K = 64
D_V = 64
D_MODEL = D_K * N_HEAD  # 64 * 8 = 512
D_F = D_MODEL * 4  # 512 * 4 = 2048
SAVE_PATH = '../paper_code/TRANSFORMER/model/enfa2.chkpt'