In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
from torchtext import data
from torchtext import datasets
from torch.autograd import Variable

import spacy, random
import numpy as np
from tqdm import tqdm, tqdm_notebook

# Some utility functions
def tokenize_de(text):
    """Tokenize German `text` with the `spacy_de` tokenizer and return the token strings."""
    tokens = []
    for tok in spacy_de.tokenizer(text):
        tokens.append(tok.text)
    return tokens

def tokenize_en(text):
    """Tokenize English `text` with the `spacy_en` tokenizer and return the token strings."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# Notebook-wide configuration.
# (No `global` statement is needed: at module level it is a no-op, and functions
# that only read these names see them automatically.)
USE_CUDA = torch.cuda.is_available()  # True when PyTorch can use a CUDA device
DEVICE = 0 if USE_CUDA else -1        # handed to the torchtext iterators as `device`
MAX_LEN = 20      # sentence pairs with more tokens than this on either side are filtered out
MIN_FREQ = 5      # `min_freq` used when building both vocabularies
BATCH_SIZE = 32   # batch size of the train/validation iterators

In [3]:
# spaCy pipelines used by tokenize_de / tokenize_en
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

# Source (German) and target (English) fields; only target needs BOS/EOS
DE = data.Field(tokenize=tokenize_de)
EN = data.Field(
    tokenize=tokenize_en,
    init_token='<s>',
    eos_token='</s>',
)

# Keep only sentence pairs whose source AND target have at most MAX_LEN tokens
train, val, test = datasets.IWSLT.splits(
    exts=('.de', '.en'),
    fields=(DE, EN),
    filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN,
)

# Vocabularies come from the training split only
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)

# Bucketed iterators over the train and validation splits, keyed on source length
train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_size=BATCH_SIZE,
    device=DEVICE,
    repeat=False,
    sort_key=lambda x: len(x.src),
)

def str_to_tensor(string, src_lang = DE):
    """Turn a whitespace-separated sentence into a 1-D LongTensor Variable of word ids.

    Every token produced by `string.split()` is looked up in `src_lang.vocab.stoi`.
    """
    word_ids = []
    for word in string.split():
        word_ids.append(src_lang.vocab.stoi[word])
    return Variable(torch.LongTensor(word_ids))
    
def tensor_to_kaggle(tensor, trg_lang = EN):
    """Look up each word id in `trg_lang.vocab.itos` and join the tokens with '|'."""
    itos = trg_lang.vocab.itos
    words = []
    for word_id in tensor:
        words.append(itos[word_id])
    return '|'.join(words)
    
def tensor_to_str(tensor, trg_lang = EN):
    """Look up each word id in `trg_lang.vocab.itos` and join the tokens with spaces."""
    itos = trg_lang.vocab.itos
    words = []
    for word_id in tensor:
        words.append(itos[word_id])
    return ' '.join(words)

In [4]:
class Encoder(nn.Module):
    """Embeds source word ids and feeds them through an LSTM.

    The embedding size equals `hidden_dim`, and the padding index is taken from
    the module-level `DE` field. The LSTM uses `batch_first = False`.
    """
    def __init__(self, src_vsize, hidden_dim, n_layers = 1):
        super(Encoder, self).__init__()

        # Keep the hyper-parameters around for later inspection
        self.src_vsize, self.hidden_dim, self.n_layers = src_vsize, hidden_dim, n_layers

        pad_idx = DE.vocab.stoi[DE.pad_token]
        self.embeddings = nn.Embedding(
            num_embeddings = src_vsize,
            embedding_dim = hidden_dim,
            padding_idx = pad_idx,
        )
        self.lstm = nn.LSTM(
            input_size = hidden_dim,
            hidden_size = hidden_dim,
            num_layers = n_layers,
            batch_first = False,
        )

    def forward(self, src_words):
        """Return the LSTM's `(outputs, hidden)` pair for the embedded `src_words`."""
        return self.lstm(self.embeddings(src_words))

class Decoder(nn.Module):
    """Embeds target word ids, runs an LSTM from a given hidden state and projects to vocabulary scores.

    The embedding size equals `hidden_dim`, and the padding index is taken from
    the module-level `EN` field. The LSTM uses `batch_first = False`.
    """
    def __init__(self, hidden_dim, trg_vsize, n_layers = 1):
        super(Decoder, self).__init__()

        # Keep the hyper-parameters around for later inspection
        self.hidden_dim, self.trg_vsize, self.n_layers = hidden_dim, trg_vsize, n_layers

        pad_idx = EN.vocab.stoi[EN.pad_token]
        self.embeddings = nn.Embedding(
            num_embeddings = trg_vsize,
            embedding_dim = hidden_dim,
            padding_idx = pad_idx,
        )
        self.lstm = nn.LSTM(
            input_size = hidden_dim,
            hidden_size = hidden_dim,
            num_layers = n_layers,
            batch_first = False,
        )
        self.proj = nn.Linear(in_features = hidden_dim, out_features = trg_vsize)

    def forward(self, trg_words, hidden):
        """Return `(vocabulary scores, new hidden state)` for `trg_words` starting from `hidden`."""
        lstm_out, new_hidden = self.lstm(self.embeddings(trg_words), hidden)
        return self.proj(lstm_out), new_hidden
    
class Seq2Seq(nn.Module):
    """Container holding an `Encoder` and a `Decoder` that share `hidden_dim` and `n_layers`.

    Args:
        src_vsize: source vocabulary size, passed to the encoder.
        trg_vsize: target vocabulary size, passed to the decoder.
        hidden_dim: embedding / LSTM hidden size used by both sub-modules.
        n_layers: number of LSTM layers used by both sub-modules.

    The class defines no `forward`; callers use `model.encoder` and
    `model.decoder` directly.
    """
    def __init__(self, src_vsize, trg_vsize, hidden_dim, n_layers = 1):
        super(Seq2Seq, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Forward n_layers: previously it was stored but never passed on, so both
        # LSTMs were always single-layer regardless of the argument.
        self.encoder = Encoder(src_vsize, hidden_dim, n_layers = n_layers)
        self.decoder = Decoder(hidden_dim, trg_vsize, n_layers = n_layers)

In [28]:
class Trainer:
    def __init__(self, train_iter, val_iter):
        """ Initialize trainer class with Torchtext iterators """
        self.train_iter = train_iter
        self.val_iter = val_iter
        
    def train(self, num_epochs, model, lr = 1e-3, clip = 5):
        """ Train `model` with Adam and return it carrying the best-validation weights.

        Args:
            num_epochs: number of passes over `self.train_iter`.
            model: object exposing `encoder` / `decoder` sub-modules (see `train_batch`).
            lr: Adam learning rate.
            clip: maximum gradient norm passed to `clip_grad_norm`.

        Returns:
            `(model, all_losses)`: the model, moved to the CPU and loaded with the
            weights of the epoch that had the lowest validation perplexity, and
            the list of mean training losses (one per epoch).

        Side effects:
            Sets `self.padding_id`, prints progress, and saves the returned model
            to "<ModelClassName>.pth".
        """
        import copy  # local import: only needed to snapshot the best weights

        # Per-class loss weights: every target word counts, padding does not
        weight = torch.FloatTensor(len(EN.vocab.itos)).fill_(1)
        self.padding_id = EN.vocab.stoi[EN.pad_token]
        weight[self.padding_id] = 0
        weight = Variable(weight)
        if USE_CUDA:
            weight = weight.cuda()

        # Summed (not averaged) loss; train_batch divides by the number of real words
        criterion = nn.CrossEntropyLoss(weight = weight, size_average = False)
        parameters = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = torch.optim.Adam(params = parameters, lr = lr)
        best_ppl = 1e10
        # Copy of the weights at the best epoch. Keeping a reference to `model`
        # itself would just alias the live (still training) model.
        best_state = None

        all_losses = []
        for epoch in tqdm(range(1, num_epochs + 1)):

            epoch_loss = []

            for batch in tqdm(self.train_iter):

                optimizer.zero_grad()

                batch_loss = self.train_batch(batch, criterion, model, teacher_forcing_ratio = 0)
                batch_loss.backward()

                nn.utils.clip_grad_norm(model.parameters(), clip)

                optimizer.step()

                epoch_loss.append(batch_loss.data[0])

                # Every 100 steps: report running loss and a sample translation
                if len(epoch_loss) % 100 == 0:
                    step = len(epoch_loss)
                    cur_loss = np.mean(epoch_loss)
                    train_ppl = np.exp(cur_loss)
                    print('Step: {0} | Loss: {1} | Train PPL: {2}'.format(step, cur_loss, train_ppl))
                    print('Wie würde eine solche Zukunft aussehen ? -->', self.translate('Wie würde eine solche Zukunft aussehen ?', model))

            epoch_loss = np.mean(epoch_loss)
            train_ppl = np.exp(epoch_loss)
            val_ppl = self.validate(criterion, model)

            print('Epoch: {0} | Loss: {1} | Train PPL: {2} | Val PPL: {3}'.format(epoch, epoch_loss, train_ppl, val_ppl))
            all_losses.append(epoch_loss)

            # Remember the weights of the best epoch so far (deep copy: state_dict
            # tensors share storage with the parameters being optimized)
            if val_ppl < best_ppl:
                best_ppl = val_ppl
                best_state = copy.deepcopy(model.state_dict())

        # Restore the best weights; if validation never improved (or no epoch ran)
        # the model is returned as it currently is.
        if best_state is not None:
            model.load_state_dict(best_state)

        best_model = model.cpu()
        torch.save(best_model, best_model.__class__.__name__ + ".pth")
        return best_model, all_losses
                
    def train_batch(self, batch, criterion, model, teacher_forcing_ratio = 0):
        """ Compute training batch using teacher forcing """
        # Initialize batch loss to zero, get size of target sentences
        loss = 0
        target_length = batch.trg.size()[0]

        # Run words through encoder
        encoder_outputs, decoder_hidden = model.encoder(batch.src)

        # Prepare input and output variables
        use_teacher_forcing = random.random() > teacher_forcing_ratio
        if use_teacher_forcing:

            # With teacher forcing, we use the previous true target as the next word input.
            # This allows us to batch the softmax, resulting in large speed-ups.
            shift = Variable(torch.LongTensor(batch.batch_size).fill_(1)).unsqueeze(0)
            if USE_CUDA:
                shift = shift.cuda()

            # Run words through encoder
            encoder_outputs, encoder_hidden = model.encoder(batch.src)

            # Get outputs for batch, using encoder hidden as initialization for decoder hidden
            decoder_outputs, decoder_hidden = model.decoder(batch.trg, encoder_hidden)

            # Reshape outputs, add shift tensor to targets
            preds = decoder_outputs.view(target_length * batch.batch_size, -1)
            targets = torch.cat((batch.trg[1:], shift), dim = 0).view(-1)

            # Compute loss in a batch (more efficient than loop)
            num_words = targets.ne(self.padding_id).float().sum()
            loss = criterion(preds, targets)
            loss /= num_words
            return loss
        
        else:

            # Without teacher forcing: use network's own prediction as the next input
            decoder_inputs = batch.trg[0, :].unsqueeze(0)
            for trg_word_idx in range(target_length-1):
                decoder_output, decoder_hidden = model.decoder(decoder_inputs, decoder_hidden)

                # Get most likely word index (highest value) from output
                _, topk_words_idx = decoder_output.data.topk(1, dim = 2)

                # Chosen word is next input
                decoder_inputs = Variable(topk_words_idx).squeeze(2)
                if USE_CUDA: 
                    decoder_inputs = decoder_inputs.cuda()

                # Compute loss for all words in batch
                num_words = batch.trg[trg_word_idx, :].ne(trainer.padding_id).float().sum()
                loss += (criterion(decoder_output.squeeze(0), batch.trg[trg_word_idx+1, :]) / num_words) if num_words.data[0] > 0 else 0
                
        loss /= batch.batch_size
                
        return loss
    
    def translate(self, string, model, maxlength = None):  
        """ Predict translation for an input string """
        # Make string a tensor
        tensor = str_to_tensor(string)
        tensor = tensor.unsqueeze(1)
        if USE_CUDA:
            tensor = tensor.cuda()

        # Run words through encoder
        encoder_outputs, decoder_hidden = model.encoder(tensor)

        # First token must always start of sentence <s>
        decoder_inputs = Variable(torch.LongTensor([EN.vocab.stoi[EN.init_token]])).unsqueeze(0)
        if USE_CUDA: 
            decoder_inputs = decoder_inputs.cuda()

        # if no maxlength, let it be 3*length original
        maxlength = maxlength if maxlength else 3 * tensor.shape[0]
        out_string = []

        # Predict words until an <eos> token or maxlength
        for trg_word_idx in range(maxlength):
            decoder_output, decoder_hidden = model.decoder(decoder_inputs, decoder_hidden)

            # Get most likely word index (highest value) from output
            prob_dist = F.log_softmax(decoder_output, dim = 2)
            top_probs, top_word_idx = prob_dist.data.topk(1, dim = 2)
            ni = top_word_idx.squeeze(0)

            decoder_inputs = Variable(ni) # Chosen word is next input
            out_string.append(ni[0][0])

            # Stop at end of sentence (not necessary when using known targets)
            if ni[0][0] == EN.vocab.stoi[EN.eos_token]: 
                break

        out_string = tensor_to_str(out_string)
        return out_string
    
    def evaluate_kaggle(self, string, model, ngrams = 3, context = 0, top_k = 100):
        """ Beam search the best starting trigrams for Kaggle input sentences. 'beam_size' here is more like add'l context.. """
        # Convert string to tensor for embedding lookups
        tensor = str_to_tensor(string)
        tensor = tensor.unsqueeze(1)
        if USE_CUDA:
            tensor = tensor.cuda()

        # Run words through encoder to get init hidden for decoder
        encoder_outputs, encoder_hidden = model.encoder(tensor)

        # Start collecting hiddens, prepare initial input variables
        decoder_inputs = Variable(torch.LongTensor([EN.vocab.stoi[EN.init_token]])).unsqueeze(0)
        if USE_CUDA: 
            decoder_inputs = decoder_inputs.cuda()

        # Compute the top K first words, so that we have something to work with
        decoder_output, decoder_hidden = model.decoder(decoder_inputs, encoder_hidden)
        prob_dist = F.log_softmax(decoder_output, dim = 2)
        top_probs, top_word_idx = prob_dist.data.topk(top_k, dim = 2)
        decoder_inputs = Variable(top_word_idx)
        if USE_CUDA:
            decoder_inputs = decoder_inputs.cuda()

        # Begin table to keep our outputs, output_probs
        outputs = [[word] for word in list(decoder_inputs.data[0][0])]
        output_probs = list(top_probs[0][0])

        # For using the correct hidden to predict next word. Initially it is 100x copy
        all_hiddens = [decoder_hidden for _ in range(top_k)]

        # Get top_k beams for 
        for trg_word_idx in range(1, ngrams+context):
            beam_search_idx, beam_search_probs = [], []
            for k in range(top_k):
                decoder_output, new_hdn = model.decoder(decoder_inputs[:, :, k], all_hiddens[k])
                prob_dist = F.log_softmax(decoder_output, dim = 2)
                top_probs, top_word_idx = prob_dist.data.topk(top_k, dim = 2)
                beam_search_idx.append(list(top_word_idx[0][0]))
                beam_search_probs.append(list(top_probs[0][0]))
                all_hiddens[k] = new_hdn

            # Top K words idx
            next_word_idx = np.argsort(np.hstack(beam_search_probs))[::-1][:top_k] 

            # Backpointers to the input word that each top word was drawn from
            back_pointers = [int(np.floor(word / top_k)) for word in next_word_idx] 

            # Update output list with new decoder inputs and their corresponding probabilities
            next_words = [np.hstack(beam_search_idx)[ids] for ids in next_word_idx]
            next_probs = [np.hstack(beam_search_probs)[ids] for ids in next_word_idx]
            decoder_inputs = Variable(torch.LongTensor([int(word) for word in next_words])).unsqueeze(0).unsqueeze(0)
            if USE_CUDA:
                decoder_inputs = decoder_inputs.cuda()

            # update hiddens, outputs
            all_hiddens = [all_hiddens[pointer] for pointer in back_pointers]
            outputs = [outputs[pointer] + [word] for pointer, word in zip(back_pointers, next_words)]
            output_probs = [output_probs[pointer] + new_p for pointer, new_p in zip(back_pointers, next_probs)]

        prob_sort_idx = np.argsort(output_probs)[::-1]
        outputs = [outputs[idx] for idx in prob_sort_idx]
        outputs = [output[:ngrams] for output in outputs]
        out = [tensor_to_kaggle(tsr) for tsr in outputs]
        return ' '.join(out)
        
    def validate(self, criterion, model):
        """ Compute validation set perplexity """
        loss = []
        for batch in tqdm(self.val_iter):
            batch_loss = self.train_batch(batch, criterion, model)
            loss.append(batch_loss.data[0])
        
        val_ppl = np.exp(np.mean(loss))
        return val_ppl
    
    def write_kaggle(self, test_file, model):
        """ Write outputs to kaggle """
        with open(test_file, 'r') as fh:
            datasource = fh.read().splitlines()
            
        print('Evaluating on {0}...'.format(test_file))
        with open('output.txt', 'w') as fh:
            fh.write('id,word\n')
            for idx, string in tqdm(enumerate(datasource)):
                output = self.evaluate_kaggle(string, model)
                output = str(idx+1) + ',' + self.escape_kaggle(output) + '\n'
                fh.write(output)
        print('File saved.')
        
    def escape_kaggle(self, l):
        """ So kaggle doesn't yell at you when submitting results """
        return l.replace("\"", "<quote>").replace(",", "<comma>")

In [29]:
# Build the model and trainer; the model goes to the GPU when CUDA is available
model = Seq2Seq(src_vsize = len(DE.vocab.itos), trg_vsize = len(EN.vocab.itos), hidden_dim = 200)
trainer = Trainer(train_iter, val_iter)
if USE_CUDA:
    model = model.cuda()
# Sanity check: True only if every parameter lives on the GPU
print('Using cuda: ', np.all([parameter.is_cuda for parameter in model.parameters()]))
# Train for 15 epochs; train() also saves the model to disk
model, all_losses = trainer.train(15, model)
if USE_CUDA: model = model.cuda() # train() returns the model on the CPU (it calls .cpu()), so move it back before running inference on the GPU
# Write the Kaggle submission for the test sentences
trainer.write_kaggle('../data/source_test.txt', model)

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:00<00:06,  1.50it/s][A
 20%|██        | 2/10 [00:01<00:05,  1.55it/s][A
 30%|███       | 3/10 [00:01<00:04,  1.55it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.58it/s][A
 50%|█████     | 5/10 [00:03<00:03,  1.66it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.69it/s][A
 70%|███████   | 7/10 [00:04<00:01,  1.69it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.67it/s][A
 90%|█████████ | 9/10 [00:05<00:00,  1.67it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.70it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 11.47it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.78it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.77it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.93it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.23it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  9.08it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.83it/s][A
 56%|█████▌    | 10/18

Epoch: 1 | Loss: 9.017608547210694 | Train PPL: 8247.031094360527 | Val PPL: 3545.5138889986415



 10%|█         | 1/10 [00:00<00:05,  1.59it/s][A
 20%|██        | 2/10 [00:01<00:04,  1.64it/s][A
 30%|███       | 3/10 [00:01<00:04,  1.67it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.64it/s][A
 50%|█████     | 5/10 [00:03<00:03,  1.64it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.65it/s][A
 70%|███████   | 7/10 [00:04<00:01,  1.66it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.67it/s][A
 90%|█████████ | 9/10 [00:05<00:00,  1.70it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.71it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 10.94it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.35it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.37it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.59it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  8.95it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  8.84it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.58it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.18it/s][A
 61%|██████    | 11/18 [00:01<00:00,  7.87it/s][A


Epoch: 2 | Loss: 6.9129250049591064 | Train PPL: 1005.1831120678406 | Val PPL: 183.86663452568501



 10%|█         | 1/10 [00:00<00:05,  1.72it/s][A
 20%|██        | 2/10 [00:01<00:04,  1.84it/s][A
 30%|███       | 3/10 [00:01<00:03,  1.79it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.83it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.84it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.91it/s][A
 70%|███████   | 7/10 [00:03<00:01,  1.89it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.90it/s][A
 90%|█████████ | 9/10 [00:04<00:00,  1.86it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.84it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 11.27it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.36it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.34it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.69it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.12it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  8.86it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.54it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.14it/s][A
 61%|██████    | 11/18 [00:01<00:00,  7.91it/s][A


Epoch: 3 | Loss: 4.798767352104187 | Train PPL: 121.36073023307311 | Val PPL: 76.66816568504365



 10%|█         | 1/10 [00:00<00:04,  2.14it/s][A
 20%|██        | 2/10 [00:00<00:03,  2.47it/s][A
 30%|███       | 3/10 [00:01<00:03,  2.23it/s][A
 40%|████      | 4/10 [00:01<00:02,  2.07it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.99it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.93it/s][A
 70%|███████   | 7/10 [00:03<00:01,  1.97it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.92it/s][A
 90%|█████████ | 9/10 [00:04<00:00,  1.89it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.87it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 11.28it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.63it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.66it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.91it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.28it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  9.04it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.72it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.29it/s][A
 61%|██████    | 11/18 [00:01<00:00,  8.09it/s][A


Epoch: 4 | Loss: 3.8190979957580566 | Train PPL: 45.56309167797375 | Val PPL: 57.109087457576045



 10%|█         | 1/10 [00:00<00:04,  1.90it/s][A
 20%|██        | 2/10 [00:01<00:04,  1.74it/s][A
 30%|███       | 3/10 [00:01<00:04,  1.73it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.72it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.71it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.75it/s][A
 70%|███████   | 7/10 [00:03<00:01,  1.76it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.78it/s][A
 90%|█████████ | 9/10 [00:05<00:00,  1.77it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.77it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 11.34it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.58it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.62it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.91it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.21it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  9.09it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.78it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.40it/s][A
 61%|██████    | 11/18 [00:01<00:00,  8.17it/s][A


Epoch: 5 | Loss: 4.163698053359985 | Train PPL: 64.3089011581296 | Val PPL: 49.40731267176248



 10%|█         | 1/10 [00:00<00:05,  1.66it/s][A
 20%|██        | 2/10 [00:01<00:04,  1.86it/s][A
 30%|███       | 3/10 [00:01<00:03,  1.81it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.86it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.81it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.81it/s][A
 70%|███████   | 7/10 [00:03<00:01,  1.78it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.79it/s][A
 90%|█████████ | 9/10 [00:05<00:00,  1.78it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.77it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 11.51it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.56it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.68it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.94it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.36it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  9.15it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.84it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.47it/s][A
 61%|██████    | 11/18 [00:01<00:00,  8.20it/s][A


Epoch: 6 | Loss: 4.327410387992859 | Train PPL: 75.74787475776951 | Val PPL: 46.31315406976407



 10%|█         | 1/10 [00:00<00:05,  1.67it/s][A
 20%|██        | 2/10 [00:01<00:04,  1.65it/s][A
 30%|███       | 3/10 [00:01<00:03,  1.85it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.80it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.82it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.88it/s][A
 70%|███████   | 7/10 [00:03<00:01,  1.85it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.83it/s][A
 90%|█████████ | 9/10 [00:04<00:00,  1.81it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.80it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 11.62it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.96it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.82it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.98it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.40it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  9.27it/s][A
 50%|█████     | 9/18 [00:00<00:00,  9.04it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.64it/s][A
 61%|██████    | 11/18 [00:01<00:00,  8.40it/s][A


Epoch: 7 | Loss: 3.862162232398987 | Train PPL: 47.56809352559382 | Val PPL: 43.67900312246136



 10%|█         | 1/10 [00:00<00:05,  1.64it/s][A
 20%|██        | 2/10 [00:00<00:03,  2.02it/s][A
 30%|███       | 3/10 [00:01<00:03,  1.98it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.90it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.85it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.86it/s][A
 70%|███████   | 7/10 [00:03<00:01,  1.83it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.80it/s][A
 90%|█████████ | 9/10 [00:04<00:00,  1.82it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.83it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 11.66it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.59it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.70it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.98it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.45it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  9.21it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.99it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.51it/s][A
 61%|██████    | 11/18 [00:01<00:00,  8.31it/s][A


Epoch: 8 | Loss: 3.8789171457290648 | Train PPL: 48.37180708346625 | Val PPL: 41.799916744638104



 10%|█         | 1/10 [00:00<00:05,  1.71it/s][A
 20%|██        | 2/10 [00:01<00:04,  1.66it/s][A
 30%|███       | 3/10 [00:01<00:04,  1.70it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.71it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.70it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.76it/s][A
 70%|███████   | 7/10 [00:03<00:01,  1.77it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.82it/s][A
 90%|█████████ | 9/10 [00:04<00:00,  1.82it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.81it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 11.46it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.85it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.85it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.94it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.28it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  9.02it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.85it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.41it/s][A
 61%|██████    | 11/18 [00:01<00:00,  8.23it/s][A


Epoch: 9 | Loss: 3.7423595905303957 | Train PPL: 42.197441469975004 | Val PPL: 40.59977096423215



 10%|█         | 1/10 [00:00<00:05,  1.62it/s][A
 20%|██        | 2/10 [00:01<00:04,  1.72it/s][A
 30%|███       | 3/10 [00:01<00:04,  1.71it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.69it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.73it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.77it/s][A
 70%|███████   | 7/10 [00:03<00:01,  1.85it/s][A
 80%|████████  | 8/10 [00:04<00:01,  1.86it/s][A
 90%|█████████ | 9/10 [00:04<00:00,  1.90it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.88it/s][A
[A
  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:01, 10.69it/s][A
 17%|█▋        | 3/18 [00:00<00:01, 10.26it/s][A
 28%|██▊       | 5/18 [00:00<00:01, 10.46it/s][A
 33%|███▎      | 6/18 [00:00<00:01,  9.74it/s][A
 39%|███▉      | 7/18 [00:00<00:01,  9.24it/s][A
 44%|████▍     | 8/18 [00:00<00:01,  9.05it/s][A
 50%|█████     | 9/18 [00:01<00:01,  8.82it/s][A
 56%|█████▌    | 10/18 [00:01<00:00,  8.45it/s][A
 61%|██████    | 11/18 [00:01<00:00,  8.21it/s][A


Epoch: 10 | Loss: 3.4612477302551268 | Train PPL: 31.85670029602002 | Val PPL: 39.752187507279146



  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
