In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pdb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Attention Module

In [None]:
class Attn(nn.Module):
    """Global (Luong-style) attention over a batch of encoder outputs.

    Scores every encoder time step against the current decoder state,
    normalises the scores with a softmax over the time axis and returns the
    resulting weighted average of the encoder outputs (the context vector).

    Scoring functions, selected with ``method``:
        * ``'dot'``     -- ``score(h, e) = h . e``
        * ``'general'`` -- ``score(h, e) = h . (W e)``
        * ``'concat'``  -- ``score(h, e) = v . tanh(W [h ; e])``

    Args:
        method: one of ``'dot'``, ``'general'`` or ``'concat'``.
        hidden_size: feature size shared by the decoder state and the
            encoder outputs.

    Raises:
        ValueError: if ``method`` is not a supported scoring function.

    Note:
        Padded encoder positions are not masked; they take part in the
        softmax like any other time step.
    """

    METHODS = ('dot', 'general', 'concat')

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()

        if method not in self.METHODS:
            raise ValueError(
                "Unknown attention method %r; expected one of %s"
                % (method, ', '.join(self.METHODS)))

        self.method = method
        self.hidden_size = hidden_size
        # Normalise over the source-time axis of a (batch, time) score matrix.
        self.softmax = torch.nn.Softmax(dim=1)

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(1, n) is uninitialised memory (can hold
            # NaN/inf), so allocate and then initialise explicitly.
            self.v = nn.Parameter(torch.empty(1, hidden_size))
            bound = 1.0 / (hidden_size ** 0.5)
            nn.init.uniform_(self.v, -bound, bound)

    def forward(self, hidden, encoder_outputs):
        """Compute one context vector per batch element.

        Args:
            hidden: decoder state, shape (batch, hidden_size). A tensor with
                a leading singleton axis, (1, batch, hidden_size), is also
                accepted and squeezed.
            encoder_outputs: shape (batch, src_len, hidden_size).

        Returns:
            Tensor of shape (batch, hidden_size) holding the context vectors.
        """
        if hidden.dim() == 3 and hidden.size(0) == 1:
            hidden = hidden.squeeze(0)

        # (batch, src_len) raw scores -> attention weights.
        attn_energies = self.score(hidden, encoder_outputs)
        attn_weights = self.softmax(attn_energies)

        # (batch, 1, src_len) x (batch, src_len, hidden) -> (batch, 1, hidden).
        # Staying inside tensor ops keeps the autograd graph intact, which
        # rebuilding the result with torch.tensor(list) would not.
        context_vectors = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        return context_vectors.squeeze(1)

    def score(self, hidden, encoder_output):
        """Score each encoder time step against the decoder state.

        Works on a single example or on a whole batch; leading dimensions of
        ``hidden`` and ``encoder_output`` must match.

        Args:
            hidden: shape (..., hidden_size).
            encoder_output: shape (..., src_len, hidden_size).

        Returns:
            Tensor of shape (..., src_len) with unnormalised energies.
        """
        if self.method == 'concat':
            # Repeat the decoder state along the time axis and join it with
            # the encoder outputs on the feature axis: (..., src_len, 2*hidden).
            expanded = hidden.unsqueeze(-2).expand_as(encoder_output)
            combined = torch.cat((expanded, encoder_output), dim=-1)
            projected = torch.tanh(self.attn(combined))
            # (..., src_len, hidden) . (hidden,) -> (..., src_len)
            return torch.matmul(projected, self.v.squeeze(0))

        if self.method == 'general':
            keys = self.attn(encoder_output)
        elif self.method == 'dot':
            keys = encoder_output
        else:
            raise ValueError("Unknown attention method %r" % (self.method,))

        # (..., src_len, hidden) x (..., hidden, 1) -> (..., src_len)
        return torch.matmul(keys, hidden.unsqueeze(-1)).squeeze(-1)

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with optional Luong attention.

    Args:
        attn_model: attention scoring method handed to ``Attn``; the string
            ``'none'`` disables attention.
        hidden_size: size of embeddings and of the GRU hidden state.
        output_size: size of the target vocabulary.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability for the embedding and, when
            ``n_layers > 1``, between GRU layers.

    Relies on the module-level names ``PAD_token`` and ``Attn``.
    """

    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers.
        # PAD_token follows the spelling used for the loss's ignore_index and
        # for SOS_token / EOS_token elsewhere in this notebook.
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=PAD_token)
        self.embedding_dropout = nn.Dropout(dropout)
        # Inter-layer dropout has no effect on a single-layer GRU and only
        # produces a warning, so switch it off in that case.
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        # Choose attention model; None means "decode without attention".
        self.attn = Attn(attn_model, hidden_size) if attn_model != 'none' else None

    def forward(self, input_seq, last_hidden, encoder_outputs):
        """Run one decoding step for a whole batch.

        Args:
            input_seq: token ids of the previous output word, ``batch``
                elements in total (e.g. shape (batch,)).
            last_hidden: previous GRU state, (n_layers, batch, hidden_size).
            encoder_outputs: encoder states passed through to the attention
                module together with the (batch, hidden_size) GRU output.

        Returns:
            Tuple ``(output, hidden)``: unnormalised vocabulary scores of
            shape (batch, output_size) and the new GRU state.
        """
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size)  # S=1 x B x N

        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)
        rnn_output = rnn_output.squeeze(0)  # S=1 x B x N -> B x N

        if self.attn is None:
            # No attention configured: predict straight from the GRU output.
            return self.out(rnn_output), hidden

        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        context = self.attn(rnn_output, encoder_outputs)
        if context.dim() == 3:
            # Accept B x 1 x N as well as B x N. An unconditional squeeze(1)
            # would drop the feature axis whenever hidden_size == 1.
            context = context.squeeze(1)

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)

        # Return the vocabulary scores and the new hidden state
        return output, hidden

In [None]:
class Encoder_Batch_AttentionRNN(nn.Module):
    """Batched GRU encoder that returns every time step's output.

    The per-step outputs are what an attention decoder attends over; the
    final hidden state is returned alongside them.

    Args:
        input_size: size of the source vocabulary.
        hidden_size: size of the embeddings and of the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(Encoder_Batch_AttentionRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def init_hidden(self, batch_size):
        """Return an all-zero initial state of shape (1, batch_size, hidden_size).

        The tensor is created on the device that holds this module's weights.
        """
        return torch.zeros(1, batch_size, self.hidden_size,
                           device=self.embedding.weight.device)

    def forward(self, sents, sent_lengths):
        '''
        Encode a padded batch of sentences.

        Args:
            sents: tensor of token ids with shape (batch_size, padded_length).
                A single sentence is a batch of one, e.g. [[1, 2, 3, 4]].
            sent_lengths: true (unpadded) length of every sentence; any
                iterable of ints or 0-dim tensors.

        Returns:
            Tuple ``(outputs, hidden)``, both in the original batch order:
            ``outputs`` has shape (batch_size, longest_length, hidden_size)
            and ``hidden`` has shape (1, batch_size, hidden_size). The final
            state is also stored on ``self.hidden``.
        '''
        batch_size = sents.size()[0]
        # Plain Python ints: packing expects CPU lengths, and sorting device
        # tensors one element at a time is needlessly slow.
        lengths = [int(n) for n in sent_lengths]

        # pack_padded_sequence wants the longest sentence first.
        order = sorted(range(len(lengths)),
                       key=lambda idx: (lengths[idx], idx), reverse=True)
        sorted_lengths = [lengths[idx] for idx in order]
        # Inverse permutation: restore[original_idx] = position after sorting.
        restore = [0] * len(order)
        for position, original_idx in enumerate(order):
            restore[original_idx] = position

        order_index = torch.tensor(order, dtype=torch.long, device=sents.device)
        restore_index = torch.tensor(restore, dtype=torch.long, device=sents.device)

        # get embedding
        embed = self.embedding(torch.index_select(sents, 0, order_index))
        # pack padded sequence
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embed, sorted_lengths, batch_first=True)

        # fprop through RNN
        self.hidden = self.init_hidden(batch_size)
        packed_outputs, self.hidden = self.gru(packed, self.hidden)

        # change the order back
        self.hidden = torch.index_select(self.hidden, 1, restore_index)
        unpacked_outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_outputs, batch_first=True)
        outputs = torch.index_select(unpacked_outputs, 0, restore_index)

        return outputs, self.hidden

In [None]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (source, target) sentence pair.

    The source is fed through ``encoder`` one position at a time, the
    decoder is then unrolled over the target, and both optimisers take one
    step on the summed loss.

    Args:
        input_tensor: source token indices, e.g. [2, 43, 23, 9, 19, 4]
            (indexed on our vocabulary).
        target_tensor: target token indices.
        encoder: module exposing ``initHidden()`` and ``hidden_size``, called
            as ``encoder(token, hidden)``.
        decoder: module called as ``decoder(input, hidden, encoder_outputs)``
            and returning ``(output, hidden)``.
        encoder_optimizer: optimiser stepping the encoder parameters.
        decoder_optimizer: optimiser stepping the decoder parameters.
        criterion: loss applied to each decoder output and target token.
        max_length: number of rows allocated for the encoder outputs.

    Returns:
        The summed loss divided by the target length.
    """
    hidden_state = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # One row per source position; rows past input_length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    # Iterate the encoder over the source words; the final hidden state is
    # the representation of the source sentence.
    for src_pos in range(input_length):
        step_output, hidden_state = encoder(input_tensor[src_pos], hidden_state)
        encoder_outputs[src_pos] = step_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = hidden_state

    # Decide once per sentence whether to feed ground truth or predictions.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for tgt_pos in range(target_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[tgt_pos])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input.
            decoder_input = target_tensor[tgt_pos]
            continue

        # Without teacher forcing: feed the decoder's own best guess,
        # detached from history, and stop once it emits end-of-sentence.
        _, best_index = decoder_output.topk(1)
        decoder_input = best_index.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [7]:
# Scratch check: how transpose and repeat reshape a (3, 1) column vector.
z = torch.randn(3, 1)
print(z.transpose(0, 1))  # (1, 3): the column laid out as a row
print(z)                  # (3, 1): the original column
print(z.repeat(1, 2))     # (3, 2): the column tiled twice along dim 1

tensor([[ 0.0505, -1.3764, -0.7923]])
tensor([[ 0.0505],
        [-1.3764],
        [-0.7923]])
tensor([[ 0.0505,  0.0505],
        [-1.3764, -1.3764],
        [-0.7923, -0.7923]])


In [None]:
class Decoder_Batch_AttentionRNN(nn.Module):
    """Batched single-step GRU decoder with concat-style attention.

    At every step the previous decoder state ``z`` is scored against each
    encoder output, the softmax-normalised scores weight the encoder outputs
    into a context vector ``c``, and ``[z ; c ; embedding]`` is fed to the
    GRU whose output is projected onto the vocabulary.

    Args:
        output_size: size of the target vocabulary.
        hidden_size: size of embeddings, encoder outputs and the GRU state.
    """

    def __init__(self, output_size, hidden_size):
        super(Decoder_Batch_AttentionRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Scores one [z ; encoder_output] pair with a single number.
        self.attn = nn.Linear(hidden_size * 2, 1)
        # GRU input is the concatenation [z ; c ; embedding].
        self.gru = nn.GRU(hidden_size * 3, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def init_hidden(self, batch_size):
        """Return an all-zero initial state of shape (1, batch_size, hidden_size).

        The tensor is created on the device that holds this module's weights.
        """
        return torch.zeros(1, batch_size, self.hidden_size,
                           device=self.embedding.weight.device)

    def forward(self, word, decoder_hidden, encoder_outputs):
        """Decode one target position for every sentence in the batch.

        Args:
            word: ids of the previous target tokens, ``batch`` elements in
                total (shape (batch,) or (batch, 1)).
            decoder_hidden: previous decoder state, (1, batch, hidden_size).
            encoder_outputs: all encoder states,
                (batch, src_len, hidden_size).

        Returns:
            Tuple ``(output, hidden)``: log-probabilities over the vocabulary
            with shape (batch, output_size) and the new decoder state with
            shape (1, batch, hidden_size), also stored on ``self.hidden``.
        """
        batch_size, src_len = encoder_outputs.size(0), encoder_outputs.size(1)

        # embed the word: (batch, 1, hidden)
        embed = self.embedding(word).view(batch_size, 1, self.hidden_size)

        # calculate the attention weights
        query = decoder_hidden[-1]                              # (batch, hidden)
        stacked = query.unsqueeze(1).expand(-1, src_len, -1)    # (batch, src_len, hidden)
        pairs = torch.cat((stacked, encoder_outputs), dim=2)    # (batch, src_len, 2*hidden)
        attn_scores = self.attn(pairs).squeeze(2)               # (batch, src_len)
        attn_weights = F.softmax(attn_scores, dim=1)

        # context vector: linear combination of the encoder outputs
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # (batch, 1, hidden)

        # concatenate decoder_hidden (z), context vector (c), and embed together
        gru_input = torch.cat((query.unsqueeze(1), context, embed), dim=2)

        # fprop through RNN, starting from the previous decoder state
        rnn_out, self.hidden = self.gru(gru_input, decoder_hidden)

        # (batch, hidden) -> (batch, vocab) log-probabilities
        output = self.log_softmax(self.out(rnn_out.squeeze(1)))
        return output, self.hidden

In [None]:
print_every=1000, plot_every=1000, learning_rate=0.0001):
    # NOTE(review): the `def` line carrying the function name and the leading
    # parameters is missing from this cell, so it cannot run as written. The
    # body reads `encoder`, `decoder`, `n_epochs` and `train_loader`, which are
    # presumably among the missing parameters -- TODO confirm.
    """
    Batched training loop over `train_loader` for `n_epochs` epochs (unfinished).

    Notes kept from the original docstring; the parameters they describe are
    presumably declared on the missing part of the signature:
    lang1 is the Lang object for language 1
    lang2 is the Lang object for language 2
    max length generation is the maximum generation length you want
    """
    start = time.time()
    plot_losses = []
    val_losses = [] 
    count = 0 
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    val_loss_total = 0
    plot_val_loss = 0
    encoder_optimizer = torch.optim.Adadelta(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters(), lr=learning_rate)
    #encoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode="min")
    #decoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode="min")

    criterion = nn.NLLLoss(ignore_index=PAD_token) # positions whose target is PAD_token do not contribute to the loss
    plot_loss =[]
    val_loss = []
    for epoch in range(n_epochs):
        # Per-epoch accumulators; these rebind the lists created just above.
        plot_loss = []
        val_loss = []
        for step, (sent1s, sent1_lengths, sent2s, sent2_lengths) in enumerate(train_loader):
            encoder.train() # put the module in training mode (enables e.g. dropout)
            decoder.train()
            
            sent1_batch, sent2_batch = sent1s.to(device), sent2s.to(device) 
            sent1_length_batch, sent2_length_batch = sent1_lengths.to(device), sent2_lengths.to(device)
            batch_size = sent1_batch.size()[0]
            
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            
            encoder_outputs, encoder_hidden = encoder(sent1_batch, sent1_length_batch)
            # encoder_outputs is 32 by 22 by 256
            # encoder_hidden is 1 by 32 by 256
            
            decoder_hidden = encoder_hidden     
            
            # decoder_input is incoming word token
            # decoder_hidden starts the last hidden state of the encoder
            # encoder_outputs is all of the encoder hidden states
            for i in range(batch_size):
                # we're going over each sentence in the batch
                # NOTE(review): per the shape comment above, encoder_hidden is
                # 1 x batch x hidden, so indexing its first axis with i fails for
                # i > 0; encoder_hidden[:, i] looks like the intent -- confirm.
                decoder_hidden = encoder_hidden[i] # 1 by 256
                # NOTE(review): this rebinds encoder_outputs to a single sentence,
                # so the next iteration indexes the already-sliced tensor rather
                # than the batch.
                encoder_outputs = encoder_outputs[i] # 1 by 22 by 256
                decoder_input = torch.tensor([[SOS_token]], device=device)
                for j in range(len(sent2_batch[i])):
                    # we're going over each word in the target sentence
                    # "I am a robot"
                    # NOTE(review): three values are unpacked here, while train()
                    # in this notebook unpacks two from its decoder call.
                    decoder_output, decoder_hidden, decoder_attention = decoder(
                        decoder_input, decoder_hidden, encoder_outputs)
                    # NOTE(review): `loss`, `target_tensor` and `di` are not defined
                    # in the visible part of this function; sent2_batch[i][j] is
                    # presumably the intended target -- confirm.
                    loss += criterion(decoder_output, target_tensor[di])
                    decoder_input = sent2_batch[i][j]# Teacher forcing
                # NOTE(review): the visible code ends here with no loss.backward()
                # and no optimizer.step().
                
            
            
            
            
            

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on ``n_iters`` randomly drawn sentence pairs with SGD.

    All training pairs are sampled from ``pairs`` up front, then ``train`` is
    called once per pair. The running average loss is printed every
    ``print_every`` iterations and recorded every ``plot_every`` iterations;
    the recorded averages are handed to ``showPlot`` at the end.

    Args:
        encoder: encoder module to optimise.
        decoder: decoder module to optimise.
        n_iters: number of single-pair training steps.
        print_every: interval, in iterations, between progress lines.
        plot_every: interval, in iterations, between recorded plot points.
        learning_rate: learning rate of both SGD optimisers.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset after every progress line
    plot_loss_total = 0   # reset after every plot point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, training_pair in enumerate(training_pairs, 1):
        input_tensor, target_tensor = training_pair[0], training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)