In [1]:
%load_ext autoreload
%autoreload 2
from requirements import *
from dataUtils import *

Reading lines...
Read 135842 sentence pairs
Trimmed to 10853 sentence pairs
Counting words...
Counted words:
fra 4489
eng 2925
['le probleme ce n est pas moi .', 'i m not the problem .']


In [34]:
class Encoder(nn.Module):
    def __init__(self, source_vocab_size, input_size, hidden_size, num_layers = 1, dropout=0, bidirectional=False):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(source_vocab_size, input_size)
        self.gru = nn.GRU(input_size, hidden_size, num_layers, dropout=0, bidirectional=False)
    
    def forward(self, input_x, prev_hidden_h):
        embedding = self.embed(input_x).view(1, 1, -1)
        out, h = self.gru(embedding, prev_hidden_h)
        return out, h
    
    def init_hidden_state(self):
        return Variable(torch.zeros(self.num_layers, 1, self.hidden_size)).cuda()

In [35]:
class Decoder(nn.Module):
    def __init__(self, target_vocab_size, input_size, hidden_size, num_layers = 1, dropout=0, bidirectional=False):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(target_vocab_size, input_size)
        self.gru = nn.GRU(input_size, hidden_size, num_layers, dropout=0, bidirectional=False)
        self.h2o = nn.Linear(hidden_size, target_vocab_size)
    
    def forward(self, input_x, prev_hidden_h):
        embedding = embed(input_x).view(1, 1, -1)
        out, h = gru(embedding, prev_hidden_h)
        output = self.h2o(out[0])
        return F.log_softmax(output)

In [50]:
class AttentionDecoder(nn.Module):
    def __init__(self, max_length, target_vocab_size, input_size, hidden_size, num_layers = 1, dropout=0, bidirectional=False):
        super(AttentionDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.max_length = max_length
        self.embed = nn.Embedding(target_vocab_size, input_size)
        self.attn_weights = nn.Linear(hidden_size + input_size, max_length)
        self.attn_combine = nn.Linear(input_size + hidden_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, dropout=0, bidirectional=False)
        self.h2o = nn.Linear(hidden_size, target_vocab_size)
    
    def forward(self, input_x, prev_hidden_h, encoder_states):
        embedding = F.dropout(self.embed(input_x).view(1, 1, -1), 0.1, self.training)
        i2aw = torch.cat([embedding[0], prev_hidden_h[0]], 1)
        attn_wts = F.softmax(self.attn_weights(i2aw))
        ci = torch.bmm(attn_wts.unsqueeze(0), encoder_states.unsqueeze(0))
        i2gru = F.relu(self.attn_combine(torch.cat([embedding[0], ci[0]], 1)))
        out, h = self.gru(i2gru.unsqueeze(0), prev_hidden_h)
        output = self.h2o(out[0])
        return F.log_softmax(output), h
    
    def init_hidden_states(self):
        return Variable(torch.zeros(self.num_layers, 1, self.hidden_size)).cuda()
        

In [51]:
hidden_size = 256
encoder_input_size = 256
decoder_input_size = 256
encoder = Encoder(input_lang.n_words, encoder_input_size, hidden_size)
decoder = AttentionDecoder(10, output_lang.n_words, decoder_input_size, hidden_size)
encoder = encoder.cuda()
decoder = decoder.cuda()

In [60]:
trainEpochs(encoder, decoder, 75000, print_every=5000)

3m 9s (- 44m 9s) (5000 6%) 0.5246
6m 41s (- 43m 32s) (10000 13%) 0.5531


KeyboardInterrupt: 

In [26]:
def trainEpochs(encoder, decoder, n_epochs, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    training_pairs = [variablesFromPair(random.choice(pairs)) for i in range(n_epochs)]
    objective = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        training_pair = training_pairs[epoch - 1]
        input_seq = training_pair[0]
        target_seq = training_pair[1]

        loss = train(input_seq, target_seq, encoder, decoder, encoder_optimizer, decoder_optimizer, objective, 10)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                         epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [59]:
def train(input_seq, target_seq, encoder, decoder, encoder_optimizer, decoder_optimizer, objective, max_length):
    encoder_hidden = encoder.init_hidden_state()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_seq_len = input_seq.size()[0]
    encoder_hidden_states = Variable(torch.zeros(max_length, encoder.hidden_size)).cuda()
    
    for i in range(input_seq_len):
        encoder_out, encoder_hidden = encoder(input_seq[i], encoder_hidden)
        encoder_hidden_states[i] = encoder_out[0][0]
    
    decoder_hidden = encoder_hidden
    start_symbol = Variable(torch.LongTensor([SOS_token])).cuda()
    decoder_input = start_symbol
    target_seq_len = target_seq.size()[0]
    loss = 0
    
    for i in range(target_seq_len):
        prob_dist, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_hidden_states)
        loss += objective(prob_dist, target_seq[i])
        _, idx = torch.max(prob_dist, 1)
        decoder_input = idx
        if i == EOS_token:
            break
    
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.data[0] / target_seq_len