In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
cwd = '/content/drive/MyDrive/BookingcomChallenge2021/'

import os
os.chdir(cwd)

Mounted at /content/drive


In [2]:
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.onnx
from torch.utils.tensorboard import SummaryWriter

import data
import model

In [3]:
parser = argparse.ArgumentParser(description='PyTorch RNN/LSTM/GRU/Transformer/LSTMTransformer Language Model')
parser.add_argument('--data', type=str, default='../data/wikitext-2',
                    help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer, LSTMTransformer)')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
parser.add_argument('--warm-up', type=int, default=4000,
                    help='number of steps to warm up')
parser.add_argument('--step-size', type=int, default=1,
                    help='period of learning rate decay')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
                    help='batch size')
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--val-interval', type=int, default=5000, metavar='N',
                    help='validation interval')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
parser.add_argument('--tb-log', type=str, default='logs',
                    help='path to save tensorboard log')
parser.add_argument('--onnx-export', type=str, default='',
                    help='path to export the final model in onnx format')

parser.add_argument('--nhead', type=int, default=2,
                    help='the number of heads in the encoder/decoder of the transformer model')

_StoreAction(option_strings=['--nhead'], dest='nhead', nargs=None, const=None, default=2, type=<class 'int'>, choices=None, help='the number of heads in the encoder/decoder of the transformer model', metavar=None)

In [4]:
args = parser.parse_args(args=["--data", "../booking_com_data/legs_4legs/",\
                               "--model", "LSTMTransformer",\
                               "--emsize", "512",\
                               "--nhid", "1024",\
                               "--nlayers", "5",\
                               "--clip", "5.0",\
                               "--batch_size", "16", \
                               "--bptt", "4",\
                               "--dropout", "0.1",\
                               "--log-interval", "200", \
                               "--save", "../models/lstm_transformer.pth",\
                               "--nhead", "8",\
                               "--cuda"])

In [5]:
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")
device

device(type='cuda')

In [6]:
args.data

'../booking_com_data/legs_4legs/'

In [None]:
args.model

In [7]:
###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)

In [8]:

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)


In [9]:

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.model == 'LSTMTransformer':
    model = model.LSTMTransformerModel(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout).to(device)
elif args.model == 'Transformer':
    model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout).to(device)
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device)

class TransformerLR(torch.optim.lr_scheduler.LambdaLR):
    """Warmup linearly and then decay proportionally to the inverse square root of the step number.
    """
    def __init__(self, optimizer, d_model, warmup_steps=4000, step_size=1, last_epoch=-1):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.step_size = step_size
        super(TransformerLR, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step // self.step_size == 0:
            return 0
        return (self.d_model ** -0.5) * \
               min((step // self.step_size) ** -0.5, (step // self.step_size) * (self.warmup_steps ** -1.5))

optimizer = optim.Adam(model.parameters(), lr=1)
scheduler = TransformerLR(optimizer, args.emsize, warmup_steps=4000, step_size=args.step_size)
criterion = nn.NLLLoss()

writer = SummaryWriter(log_dir=args.tb_log)

In [10]:

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""

    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)


# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i, seq_len):
    seq_len = min(seq_len, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data.to(device), target.to(device)


def evaluate(data_source, seq_len):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    if args.model == 'LSTMTransformer':
        inp = torch.empty(0, dtype=torch.int64).to(device)
        hidden = model.init_hidden(eval_batch_size)
        mems = None
    elif args.model == 'Transformer':
        inp = torch.empty(0, dtype=torch.int64).to(device)
    else:
        hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets = get_batch(data_source, i, seq_len)
            if args.model == 'LSTMTransformer':
                output, hidden, mems = model(data, hidden, mems)
                output = output.view(-1, ntokens)
                hidden = repackage_hidden(hidden)
                mems = [mem[-args.bptt + seq_len:] for mem in mems] if args.bptt != seq_len else None
            elif args.model == 'Transformer':
                inp = torch.cat([inp, data], 0)[-args.bptt:]
                output = model(inp)
                output = output[-data.size(0):].view(-1, ntokens)
            else:
                output, hidden = model(data, hidden)
                hidden = repackage_hidden(hidden)
            total_loss += len(data) * criterion(output, targets).item()
    model.train()
    return total_loss / (len(data_source) - 1)


iteration = 0
best_val_loss = None

def train():
    global iteration
    global best_val_loss
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i, args.bptt)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        optimizer.zero_grad()
        if args.model == 'LSTMTransformer':
            hidden = repackage_hidden(hidden)
            output, hidden, _ = model(data, hidden)
            output = output.view(-1, ntokens)
        elif args.model == 'Transformer':
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        writer.add_scalar('train/loss', loss.item(), iteration)
        writer.add_scalar('train/ppl', math.exp(loss.item()), iteration)
        writer.flush()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, scheduler.get_lr()[0],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

        if iteration % args.val_interval == 0 and iteration > 0:
            val_loss = evaluate(val_data, args.bptt)
            print('-' * 89)
            print('| end of iteration {} | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(iteration, val_loss, math.exp(val_loss)))
            print('-' * 89)
            writer.add_scalar('val/loss', val_loss, iteration)
            writer.add_scalar('val/ppl', math.exp(val_loss), iteration)
            writer.flush()
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss

        iteration += 1

def export_onnx(path, batch_size, seq_len):
    print('The model is also exported in ONNX format at {}'.
          format(os.path.realpath(args.onnx_export)))
    model.eval()
    dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device)
    hidden = model.init_hidden(batch_size)
    torch.onnx.export(model, (dummy_input, hidden), path)


In [None]:
# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, args.epochs+1):
        train()
except:
    print('-' * 89)
    print('Exiting from training early')
    writer.close()
writer.close()



| epoch   1 |   200/22495 batches | lr 0.00004 | ms/batch 58.83 | loss 11.03 | ppl 61475.83
| epoch   1 |   400/22495 batches | lr 0.00007 | ms/batch 37.13 | loss  8.94 | ppl  7658.21
| epoch   1 |   600/22495 batches | lr 0.00010 | ms/batch 37.03 | loss  8.52 | ppl  5006.38
| epoch   1 |   800/22495 batches | lr 0.00014 | ms/batch 37.38 | loss  8.34 | ppl  4197.86
| epoch   1 |  1000/22495 batches | lr 0.00017 | ms/batch 37.41 | loss  8.30 | ppl  4009.47
| epoch   1 |  1200/22495 batches | lr 0.00021 | ms/batch 37.41 | loss  8.12 | ppl  3362.22
| epoch   1 |  1400/22495 batches | lr 0.00024 | ms/batch 37.39 | loss  7.92 | ppl  2745.80
| epoch   1 |  1600/22495 batches | lr 0.00028 | ms/batch 37.29 | loss  7.84 | ppl  2548.99
| epoch   1 |  1800/22495 batches | lr 0.00031 | ms/batch 37.32 | loss  7.71 | ppl  2228.39
| epoch   1 |  2000/22495 batches | lr 0.00035 | ms/batch 37.29 | loss  7.54 | ppl  1878.15
| epoch   1 |  2200/22495 batches | lr 0.00038 | ms/batch 37.26 | loss  7.57 | p

In [None]:

# Load the best saved model.
with open(args.save, 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    # Currently, only rnn model supports flatten_parameters function.
    if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
        model.rnn.flatten_parameters()

In [None]:
# Run on test data.
test_loss = evaluate(test_data, 1)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)