In [46]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F

In [47]:
use_cuda = 1

In [48]:
# Build the data.
train_sentences = ["The man ate the apple.", "Two women running on the beach.", "Everybody read this book."]
val_sentences = ["A blue bird is singing on the tree.", "People were excited about the election."]

print(len(train_sentences))
print(len(val_sentences))
print(train_sentences[0])

3
2
The man ate the apple.


In [173]:
# Lower--case the training sentences, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in train_sentences]

# Create the vocabulary.
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(vocabulary_size-1)]
vocabulary_size = len(vocabulary)

# Build one-hot embeddings.
one_hot_embeddings = np.eye(vocabulary_size)

# Build a word-to-index dictionary.
word2index = {word:index for index,word in enumerate(vocabulary)}

def preprocess_numberize(sentence):
    """
    Given a sentence in the form of a string, preprocess it into a list of numbers denoting the index 
    of each word in the vocabulary.
    """
    tokenized = word_tokenize(sentence.lower())
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word) for word in tokenized]
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence in the form of a string, preprocess it into a a matrix of one-hot vectors corresponding
    to each word in the vocabulary.
    """
    numberized = preprocess_numberize(sentence)
    one_hot_embedded = one_hot_embeddings[numberized]
    return one_hot_embedded

In [174]:
print(vocabulary)
print(len(vocabulary))

['<UNK>', '<SOS>', 'the', '.', '<EOS>', 'man', 'ate', 'apple', 'two', 'women', 'running', 'on', 'beach', 'everybody', 'read', 'this', 'book']
17


In [175]:
# Building the LSTM network:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size 
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.softmax = nn.Softmax()
        
    def forward(self, input, hidden):
        #output = F.relu(input)
        input = input.view(1,1,-1)
        output, hidden = self.lstm(input, hidden)
        output = self.out(output[0])
        return output, hidden
    
    def initHidden(self):
        h0 = torch.zeros(1, 1, self.hidden_size)
        c0 = torch.zeros(1, 1, self.hidden_size)
        result = (h0, c0)
        if use_cuda:
            result = (h0.cuda(), c0.cuda())
        return result

In [176]:
sentence = "the man ate the apple."
print(sentence)
word_indices = preprocess_numberize(sentence.lower())
print(word_indices)
target_tensor = torch.LongTensor(word_indices[1:]).cuda()
print(target_tensor.shape)
#decoder_input = Variable(torch.FloatTensor([[embeddings[target_variable[0].data[0]]]]))
print(target_tensor[0].data)

the man ate the apple.
[1, 2, 5, 6, 2, 7, 3, 4]
torch.Size([7])
tensor(2, device='cuda:0')


In [177]:
# Let's see what the model looks like:
model = EncoderLSTM(input_size=vocabulary_size, hidden_size=300, output_size=vocabulary_size)
if use_cuda:
    model = model.cuda()
model

EncoderLSTM(
  (lstm): LSTM(17, 300)
  (out): Linear(in_features=300, out_features=17, bias=True)
  (softmax): Softmax()
)

In [188]:
# Training the network:
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

teacher_forcing_ratio = 0.0

def train(target_tensor,
         model,
         optimizer,
         criterion,
         embeddings = one_hot_embeddings):
    """
    Given a signle training sample, go through a single step of training.
    """
    loss = 0
    optimizer.zero_grad()
    
    input_tensor = torch.FloatTensor(embeddings[word2index['<SOS>']]).unsqueeze(0)
    input_tensor = input_tensor.cuda() if use_cuda else input_tensor
    print("input_shape:", input_tensor.shape)
    target_tensor = target_tensor.view(-1,1)
    print("target_shape:", target_tensor.shape)
    hidden = model.initHidden()
    
    use_teacher_forcing = True if np.random.random() < teacher_forcing_ratio else False
    
    predictions = []
    
    target_length = target_tensor.size(0)
    
    if use_teacher_forcing:
        # Teacher force:
        for i in range(1, target_length): # ignore first word
            output, hidden = model(input_tensor, hidden)
            loss += criterion(output, target_tensor[i])
            
            _, topi = model.softmax(output).data.topk(1) 
            predictions.append(vocabulary[topi])
            
            # Set model input to next ground-truth word:
            input_tensor = torch.FloatTensor(embeddings[target_tensor[i].data])
            input_tensor = input_tensor.cuda() if use_cuda else input_tensor
            
    else:
        # No teacher force:
        for i in range(target_length): 
            print(target_tensor[i].shape)
            print(input_tensor.shape)
            print("last-t", target_tensor[i].data)
            print("last-i", input_tensor.data)
            output, hidden = model(input_tensor, hidden)
            loss += criterion(output, target_tensor[i])
            
            topv, topi = model.softmax(output).data.topk(1)
            print("topv:", topv)
            print("topi", topi)
            predictions.append(vocabulary[topi[0][0]])
            
            # Set model input to its current output:
            input_tensor = torch.FloatTensor([embeddings[topi]])
            input_tensor = input_tensor.cuda() if use_cuda else input_tensor
            
    loss.backward()
    optimizer.step()
    
    print(loss)
    return loss.item() / target_length

In [190]:
# Let's train the network:
num_epochs = 100

for epoch in range(num_epochs):
    total_loss = 0
    for i, sentence in enumerate(train_sentences):
        word_indices = preprocess_numberize(sentence.lower())
        if len(word_indices) < 3:
            continue
        target_tensor = torch.LongTensor(word_indices[1:])
        target_tensor = target_tensor.cuda() if use_cuda else target_tensor # start from first word instead of <SOS>
        loss = train(target_tensor, model, optimizer, criterion)
        total_loss += loss
        if i % 10 == 0:
            print('epoch[%d], loss: %.3f' % (epoch+1, total_loss/10))
            total_loss = 0

input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0635]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0654]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0667]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.



tensor(14.3227, device='cuda:0', grad_fn=<AddBackward0>)
input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0674]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0793]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1054]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 0

last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1001]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([16], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1484]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2430]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([4], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3557]], device='cuda:0')
topi tensor([[4]], device='cuda:0')
tensor(14.2409, device='cuda:0'

tensor(13.9770, device='cuda:0', grad_fn=<AddBackward0>)
input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0699]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0843]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1129]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 1

topv: tensor([[0.2319]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([7], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2915]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3979]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([4], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.6154]], device='cuda:0')
topi tensor([[4]], device='cuda:0')
tensor(13.0704, device='cuda:0', grad_fn=<AddBackward0>)
epoch[17], loss: 0.187
input_shape: torch.Size([1, 17])
target_shape: torch.Size([8, 

topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([7], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2735]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.4385]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([4], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.5960]], device='cuda:0')
topi tensor([[4]], device='cuda:0')
tensor(12.4768, device='cuda:0', grad_fn=<AddBackward0>)
epoch[21], loss: 0.178
input_shape: torch.Size([1, 17])
target_shape: torch.Size([8, 1])
torch.Size([1])
torch.Size([1, 17])
la

topv: tensor([[0.6181]], device='cuda:0')
topi tensor([[4]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([4], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.7424]], device='cuda:0')
topi tensor([[4]], device='cuda:0')
tensor(15.1697, device='cuda:0', grad_fn=<AddBackward0>)
input_shape: torch.Size([1, 17])
target_shape: torch.Size([6, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([13], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0720]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([14], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0941]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
t

epoch[28], loss: 0.168
input_shape: torch.Size([1, 17])
target_shape: torch.Size([8, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([8], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0719]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([9], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0942]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([10], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1621]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([11], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0

tensor(12.5643, device='cuda:0', grad_fn=<AddBackward0>)
input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0714]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0920]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1577]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 1

input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0716]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0942]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1515]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1367]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3179]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([7], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3125]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.4322]], device='cuda:

topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1051]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1475]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3162]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([7], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
t

last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1686]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2935]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([7], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3211]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.4214]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 1

topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1899]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2959]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([7], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3201]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
to

topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([4], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.5722]], device='cuda:0')
topi tensor([[4]], device='cuda:0')
tensor(10.5351, device='cuda:0', grad_fn=<AddBackward0>)
epoch[56], loss: 0.151
input_shape: torch.Size([1, 17])
target_shape: torch.Size([8, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([8], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0864]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([9], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1227]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])


input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.0893]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1339]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2233]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

topv: tensor([[0.0919]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([14], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1455]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([15], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2430]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([16], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3121]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([11], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2993]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3318]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([12], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3996]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')


last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3866]], device='cuda:0')
topi tensor([[3]], device='cuda:0')
tensor(9.1140, device='cuda:0', grad_fn=<AddBackward0>)
input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1064]], device='cuda:0')
topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1660]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       dev

topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([9], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1825]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([10], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2721]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([11], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3156]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0'

topv: tensor([[0.1234]], device='cuda:0')
topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2023]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2781]], device='cuda:0')
topi tensor([[15]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]],
       device='cuda:0')
topv: tensor([[0.3118]], device='cuda:0')
topi tensor([[2]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([7], device='cuda:0')
last-i tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

input_shape: torch.Size([1, 17])
target_shape: torch.Size([6, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([13], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1335]], device='cuda:0')
topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([14], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2193]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([15], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2876]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([16], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 

topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([9], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2374]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([10], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2937]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([11], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3144]], device='cuda:0')
topi tensor([[16]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
       device='cuda:0

tensor(8.0654, device='cuda:0', grad_fn=<AddBackward0>)
input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1572]], device='cuda:0')
topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2532]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3002]], device='cuda:0')
topi tensor([[15]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0.,

topi tensor([[3]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([3], device='cuda:0')
last-i tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.5619]], device='cuda:0')
topi tensor([[4]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([4], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.8657]], device='cuda:0')
topi tensor([[4]], device='cuda:0')
tensor(8.9141, device='cuda:0', grad_fn=<AddBackward0>)
input_shape: torch.Size([1, 17])
target_shape: torch.Size([6, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([13], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1698]], device='cuda:0')
topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([14], devi

input_shape: torch.Size([1, 17])
target_shape: torch.Size([8, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([8], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1835]], device='cuda:0')
topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([9], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2761]], device='cuda:0')
topi tensor([[14]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([10], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3100]], device='cuda:0')
topi tensor([[6]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([11], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.

tensor(7.4966, device='cuda:0', grad_fn=<AddBackward0>)
input_shape: torch.Size([1, 17])
target_shape: torch.Size([7, 1])
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.1977]], device='cuda:0')
topi tensor([[13]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([5], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.2878]], device='cuda:0')
topi tensor([[9]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([6], device='cuda:0')
last-i tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')
topv: tensor([[0.3173]], device='cuda:0')
topi tensor([[15]], device='cuda:0')
torch.Size([1])
torch.Size([1, 17])
last-t tensor([2], device='cuda:0')
last-i tensor([[0., 0., 

In [112]:
# Let's test the training

In [191]:
for i in range(5):
    print(i)

0
1
2
3
4


In [196]:
from tqdm import tqdm
import time
for i in tqdm(range(10000)):
    time.sleep(0.1)

  3%|▎         | 284/10000 [00:28<16:28,  9.82it/s]

KeyboardInterrupt: 