In [1]:
import os
import sys
sys.path.append('/'.join(os.getcwd().split('/')[:-1]+['paper_code']))
# package load
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torchdata

from TRANSFORMER.models import Transformer
from TRANSFORMER.dataloader import TranslateDataset
import numpy as np
import time
import matplotlib.pyplot as plt

http://nlp.seas.harvard.edu/2018/04/03/attention.html

In [2]:
# cuda settings
USE_CUDA = torch.cuda.is_available()
DEVICE = 'cuda' if USE_CUDA else None 
# Create Dataset
train = TranslateDataset(path='../data/translation/de_en_small.txt', device=DEVICE)
SRC_MAXLEN = train.src_maxlen
TRG_MAXLEN = train.trg_maxlen
SRC_VOCAB = train.src_vocab
TRG_VOCAB = train.trg_vocab

In [3]:
# Parameters
BATCH = 32
N_LAYER = 3
N_HEAD = 6
D_K = 32
D_V = 32
D_MODEL = D_K * N_HEAD
D_F = D_MODEL * 4
SMOOTHING = True
WARMUP = 4000

In [4]:
train_loader = torchdata.DataLoader(dataset=train,
                                    collate_fn=train.collate_fn,
                                    batch_size=BATCH, 
                                    shuffle=True, 
                                    drop_last=False)

In [5]:
model = Transformer(enc_vocab_len=len(SRC_VOCAB.stoi),
                    enc_max_seq_len=SRC_MAXLEN, 
                    dec_vocab_len=len(TRG_VOCAB.stoi), 
                    dec_max_seq_len=TRG_MAXLEN, 
                    n_layer=N_LAYER, 
                    n_head=N_HEAD, 
                    d_model=D_MODEL, 
                    d_k=D_K,
                    d_v=D_V,
                    d_f=D_F, 
                    pad_idx=SRC_VOCAB.stoi['<pad>'],
                    drop_rate=0.1, 
                    use_conv=False, 
                    return_attn=False, 
                    linear_weight_share=True, 
                    embed_weight_share=False)
if USE_CUDA:
    model = model.cuda()

In [6]:
class ScheduledOptim(object):
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr


In [7]:
def cal_loss(pred, target, smoothing, pad_idx=1):
    """
    Calculate cross entropy loss, apply label smoothing if needed. 
    borrowed from: 
    https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/train.py
    """
    target = target.contiguous().view(-1)
    if smoothing:
        eps = 0.1
        n_class = pred.size(1)

        one_hot = torch.zeros_like(pred).scatter(1, target.view(-1, 1), 1)
        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
        log_prb = F.log_softmax(pred, dim=1)

        non_pad_mask = target.ne(pad_idx)
        loss = -(one_hot * log_prb).sum(dim=1)
        loss = loss.masked_select(non_pad_mask).sum()  # average later
    else:
        loss = F.cross_entropy(pred, target, ignore_index=pad_idx, reduction='sum')
    return loss

def run_step(loader, model, optimizer, smoothing):
    model.train()
    loss_per_step = 0
    
    for i, batch in enumerate(loader):
        src, src_pos, trg, trg_pos = map(lambda x: x.to(DEVICE), batch)
        model.zero_grad()
        # forward
        output = model(enc=src, enc_pos=src_pos, dec=trg, dec_pos=trg_pos)
        # eval
        pred = output.cpu()
        target = trg.cpu().view(-1)
        loss = cal_loss(pred, target, smoothing, pad_idx=model.pad_idx)
        loss.backward()        
        # update parameters
        optimizer.step_and_update_lr()
        total_words = target.ne(model.pad_idx).sum().item()
        loss_per_step += loss.item() / total_words
        if i % 250 == 0:
            print('> [{}/{}] loss_per_step: {:.4f}'.format(i, len(loader), loss.item()))
    return loss_per_step

In [8]:
optimizer = ScheduledOptim(optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                           betas=(0.9, 0.98), eps=1e-09), 
                           D_MODEL, 
                           WARMUP)

In [10]:
loss = run_step(train_loader, model, optimizer, smoothing=SMOOTHING)

[0/1563] loss_per_step: 1353.5496826171875
[100/1563] loss_per_step: 1351.23486328125


KeyboardInterrupt: 

In [115]:
start_time = time.time()
model.train()

for step in range(STEP):
    loss = run_step(train_loader, model, optimizer, smoothing=SMOOTHING)
    print('[{}/{}] loss: {} \n'.format(step+1, STEP, loss))