In [11]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack
from torch.utils.data import Dataset

from nltk.translate.bleu_score import corpus_bleu

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Prepare data

In [2]:
# Reserved vocabulary indices for the special tokens. Every Language places
# "<SOS>", "<EOS>", "<PAD>" and "<UNK>" at exactly these positions.
SOS_IDX = 0
EOS_IDX = 1
PAD_IDX = 2
UNK_IDX = 3


class Language:
    """Vocabulary of one language: a word -> index dict and its inverse list.

    Attributes:
        name: identifier of the language (e.g. "zh" or "en").
        word2idx: dict mapping each token string to its integer index.
        idx2word: list whose position ``i`` holds the token with index ``i``.
        sentence_list: initialised to an empty list; not filled by this class.
    """

    def __init__(self, name):
        self.name = name
        # Map token -> index (the original mapped index -> token, so a lookup
        # such as ``word2idx["<UNK>"]`` failed before build_vocab was called).
        self.word2idx = {"<SOS>": SOS_IDX, "<EOS>": EOS_IDX, "<PAD>": PAD_IDX, "<UNK>": UNK_IDX}
        # Derive the inverse table from word2idx so the two can never disagree.
        self.idx2word = sorted(self.word2idx, key=self.word2idx.get)
        self.sentence_list = []

    def build_vocab(self, sentence_list):
        """Add every whitespace-separated token of ``sentence_list`` to the vocabulary.

        Tokens that are already known (including the special tokens) keep their
        index, so the method can be called more than once without creating
        duplicate entries. New tokens are appended in sorted order, which makes
        the index assignment reproducible across runs.

        Args:
            sentence_list: iterable of sentence strings.
        """
        known_words = set(self.idx2word)
        corpus_words = {word for sentence in sentence_list for word in sentence.split()}
        self.idx2word.extend(sorted(corpus_words - known_words))
        self.word2idx = {word: idx for idx, word in enumerate(self.idx2word)}

In [3]:
# Strip accents from a Unicode string, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicode2ascii(s):
    """Return ``s`` NFD-decomposed with every 'Mn' (nonspacing mark) character dropped.

    Characters in any other Unicode category are kept as they are.
    """
    kept_chars = []
    for char in unicodedata.normalize('NFD', s):
        if unicodedata.category(char) == 'Mn':
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    """Normalise a sentence for whitespace tokenisation.

    The input is lowercased, stripped of surrounding whitespace and passed
    through ``unicode2ascii``. A space is then inserted before each ``.``,
    ``!`` or ``?``, and every run of characters other than ASCII letters and
    those three punctuation marks is replaced by a single space.
    """
    cleaned = unicode2ascii(s.lower().strip())
    # Detach sentence punctuation from the preceding word.
    spaced = re.sub(r"([.!?])", r" \1", cleaned)
    # Collapse everything that is not a letter or . ! ? into one space.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)

In [21]:
def read_data(language_name, path, data_type):
    """Read one side of a tokenised parallel corpus into a list of sentences.

    The file read is ``path + "<data_type>.tok.<language_name>"``; ``path`` is
    concatenated as-is, so it must end with a path separator when it names a
    directory.

    Args:
        language_name: file-name suffix of the language; "en" additionally
            triggers ``normalize_string`` on every line.
        path: prefix prepended to the file name.
        data_type: file-name prefix of the split (the notebook passes "train").

    Returns:
        list of str, one entry per line of the file (leading/trailing
        whitespace of the whole file is stripped before splitting).
    """
    file_path = path + '%s.tok.%s' % (data_type, language_name)
    # Use a context manager so the file handle is closed deterministically
    # (the original left it open until garbage collection).
    with open(file_path, encoding='utf-8') as corpus_file:
        sentence_lines = corpus_file.read().strip().split('\n')
    if language_name == "en":
        sentence_lines = [normalize_string(cur_line) for cur_line in sentence_lines]
    return sentence_lines

# Prefix of the corpus files; read_data appends "<split>.tok.<language>" to it.
path = "./iwslt-zh-en/"
# Parallel training sentences: index 0 holds the "zh" side, index 1 the "en" side.
train_data = [
    read_data(corpus_language, path, "train")
    for corpus_language in ("zh", "en")
]

In [7]:
# One vocabulary per side of the corpus: train_data[0] is "zh", train_data[1] is "en".
source_language, target_language = Language("zh"), Language("en")
source_language.build_vocab(train_data[0])
target_language.build_vocab(train_data[1])

### Dataset and Dataloader

In [15]:
class LanguageDataset(Dataset):
    """Parallel-sentence dataset that yields index-encoded (source, target) pairs.

    Args:
        data: pair ``(source_sentences, target_sentences)`` of equally long
            lists of sentence strings.
        source_lang: optional vocabulary object with a ``word2idx`` dict used
            for the source side; defaults to the notebook-level
            ``source_language``.
        target_lang: same for the target side; defaults to the notebook-level
            ``target_language``.
    """

    def __init__(self, data, source_lang=None, target_lang=None):
        self.source_sentence = data[0]
        self.target_sentence = data[1]
        assert len(self.source_sentence) == len(self.target_sentence)
        # None means "look up the notebook-level vocabulary at access time".
        self.source_lang = source_lang
        self.target_lang = target_lang

    def __len__(self):
        # The original read the non-existent attribute ``source_dataset``.
        return len(self.source_sentence)

    @staticmethod
    def _encode(sentence, word2idx):
        """Map each whitespace-separated token of ``sentence`` to its index (UNK_IDX if unknown)."""
        return [word2idx[cur_word] if cur_word in word2idx else UNK_IDX
                for cur_word in sentence.split()]

    def __getitem__(self, idx):
        """Return ``((source_ids, target_ids), (len(source_ids), len(target_ids)))`` for pair ``idx``."""
        source_vocab = self.source_lang if self.source_lang is not None else source_language
        target_vocab = self.target_lang if self.target_lang is not None else target_language
        # Tokenise with split() to match Language.build_vocab; the original
        # iterated over characters and encoded the whole source list as the target.
        source_idx_list = self._encode(self.source_sentence[idx], source_vocab.word2idx)
        target_idx_list = self._encode(self.target_sentence[idx], target_vocab.word2idx)
        return ((source_idx_list, target_idx_list), (len(source_idx_list), len(target_idx_list)))

In [20]:
# Sentences longer than this are dropped by padding(); shorter ones are padded up to it.
MAX_SENTENCE_LENGTH = 200
# Number of sentence pairs requested per DataLoader batch.
BATCH_SIZE = 32


def padding(batch, max_length=None, pad_value=None):
    """Collate function: pad every sequence of a batch to a fixed length.

    Args:
        batch: iterable of items shaped like ``LanguageDataset.__getitem__``
            output, i.e. ``((source_ids, target_ids), (source_len, target_len))``.
        max_length: fixed output length; defaults to ``MAX_SENTENCE_LENGTH``.
            Pairs in which either side is longer are skipped.
        pad_value: index written into the padded positions; defaults to
            ``PAD_IDX`` (the original padded with 0, which is ``SOS_IDX``).

    Returns:
        ``((padded_sources, padded_targets), (source_lengths, target_lengths))``
        where the padded entries are lists of int64 numpy arrays of length
        ``max_length`` and the lengths are lists of ints. All four lists are
        empty when every pair of the batch was skipped.
    """
    # Resolve defaults lazily so later changes to the constants are honoured.
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH
    if pad_value is None:
        pad_value = PAD_IDX

    def _pad(idx_list, length):
        # Right-pad one index sequence with pad_value up to max_length.
        return np.pad(np.array(idx_list, dtype=np.int64),
                      pad_width=(0, max_length - length),
                      mode="constant", constant_values=pad_value)

    padded_source_list = []
    padded_target_list = []
    source_length_list = []
    target_length_list = []

    for (source_ids, target_ids), (source_length, target_length) in batch:
        if source_length > max_length or target_length > max_length:
            continue
        source_length_list.append(source_length)
        target_length_list.append(target_length)
        padded_source_list.append(_pad(source_ids, source_length))
        padded_target_list.append(_pad(target_ids, target_length))

    return ((padded_source_list, padded_target_list), (source_length_list, target_length_list))


# Wrap the training sentences and batch them through the padding() collate function.
train_dataset = LanguageDataset(train_data)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=padding,
)
        
    

### Encoder

In [9]:
class RNN(nn.Module):
    """GRU encoder over batches of padded index sequences.

    Args:
        embed_dim: size of the token embeddings.
        hidden_dim: size of the GRU hidden state.
        layer_num: number of stacked GRU layers.
        vocab_size: number of rows of the embedding table.
        padding_idx: embedding index treated as padding; defaults to ``PAD_IDX``.
    """

    def __init__(self, embed_dim, hidden_dim, layer_num, vocab_size, padding_idx=None):
        super(RNN, self).__init__()

        if padding_idx is None:
            padding_idx = PAD_IDX

        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.layer_num = layer_num
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        #self.embedding.load_state_dict({'weight': torch.from_numpy(pretrained_embeddings)})
        # num_layers must match layer_num, otherwise the initial hidden state
        # built by init_hidden has the wrong first dimension for layer_num > 1.
        self.GRU = nn.GRU(embed_dim, hidden_dim, num_layers=layer_num,
                          batch_first=True, bidirectional=False)

    def init_hidden(self, batch_size):
        """Return a random initial hidden state of shape (layer_num, batch_size, hidden_dim).

        The tensor is created on the device of the embedding weights.
        NOTE(review): the state is drawn from a normal distribution on every
        call, so the encoder output is not deterministic; confirm this is
        intended rather than zeros.
        """
        return torch.randn(self.layer_num, batch_size, self.hidden_dim,
                           device=self.embedding.weight.device)

    def forward(self, sentence_list, sentence_length_list):
        """Encode a batch of padded sentences.

        Args:
            sentence_list: 2-D integer tensor (batch, padded_length) of token indices.
            sentence_length_list: true length of each sentence (list of ints
                or 1-D CPU tensor), in the same order as ``sentence_list``.

        Returns:
            ``(outputs, hidden)``: the unpacked GRU outputs (batch first) and
            the final hidden state of shape (layer_num, batch, hidden_dim).
        """
        batch_size = sentence_list.size(0)
        # enforce_sorted=False lets PyTorch sort/unsort by length internally,
        # so callers need not pre-sort the batch.
        embed = pack(self.embedding(sentence_list), sentence_length_list,
                     batch_first=True, enforce_sorted=False)
        hidden = self.init_hidden(batch_size)
        packed_outputs, hidden = self.GRU(embed, hidden)
        outputs, _ = unpack(packed_outputs, batch_first=True)

        return outputs, hidden
        

### Decoder w/o Attention

In [10]:
class DecoderRNN(nn.Module):
    """GRU decoder without attention, operating on batches of padded sequences.

    Args:
        embed_dim: size of the token embeddings.
        hidden_dim: size of the GRU hidden state.
        layer_num: number of stacked GRU layers.
        vocab_size: size of the target vocabulary; used for both the embedding
            table and the output projection.
    """

    def __init__(self, embed_dim, hidden_dim, layer_num, vocab_size):
        super(DecoderRNN, self).__init__()

        self.hidden_dim = hidden_dim
        self.layer_num = layer_num
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.GRU = nn.GRU(embed_dim, hidden_dim, num_layers=layer_num,
                          batch_first=True, bidirectional=False)
        # Project each hidden state onto the vocabulary (the original used the
        # undefined name ``output_dim``).
        self.out = nn.Linear(hidden_dim, vocab_size)
        # Normalise over the vocabulary axis, which is the last one for
        # (batch, seq, vocab) outputs.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, sentence_list, sentence_length, hidden):
        """Decode a batch of padded target sequences.

        Args:
            sentence_list: 2-D integer tensor (batch, padded_length) of input token indices.
            sentence_length: true length of each sequence (list of ints or
                1-D CPU tensor), in the same order as ``sentence_list``.
            hidden: initial hidden state of shape (layer_num, batch, hidden_dim).

        Returns:
            ``(output, hidden)``: log-probabilities over the vocabulary with
            shape (batch, longest_length_in_batch, vocab_size) and the final
            hidden state.
        """
        # relu must be applied to the embedding tensor before packing; a
        # PackedSequence cannot be passed to F.relu.
        embed = F.relu(self.embedding(sentence_list))
        packed_input = pack(embed, sentence_length, batch_first=True, enforce_sorted=False)
        packed_output, hidden = self.GRU(packed_input, hidden)
        output, _ = unpack(packed_output, batch_first=True)
        output = self.softmax(self.out(output))
        return output, hidden

    def initHidden(self, batch_size=1):
        """Return a random hidden state of shape (layer_num, batch_size, hidden_dim).

        The tensor is created on the device of the embedding weights.
        """
        return torch.randn(self.layer_num, batch_size, self.hidden_dim,
                           device=self.embedding.weight.device)

### Training the Model

In [45]:
# Probability that train() feeds the ground-truth target token, rather than
# the decoder's own prediction, as the next decoder input.
teacher_forcing_ratio = 0.5


def batch_sort(length_list):
    """Compute the permutation that sorts lengths in descending order, and its inverse.

    Args:
        length_list: sequence of integer lengths (list, tuple or array).

    Returns:
        ``(sort_idx, unsort_idx)`` as numpy index arrays:
        ``lengths[sort_idx]`` is in descending order and indexing that result
        with ``unsort_idx`` restores the original order.
    """
    # Convert first: unary minus is not defined for plain Python lists.
    lengths = np.array(length_list, dtype=np.int64)
    # A stable sort keeps equal-length entries in their original relative order.
    sort_idx = np.argsort(-lengths, kind="stable")
    unsort_idx = np.argsort(sort_idx, kind="stable")
    return (sort_idx, unsort_idx)


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=None):
    """Run one optimisation step on a single (source, target) sentence pair.

    The signature matches the call made by ``trainIters``; the original
    omitted ``input_tensor`` / ``target_tensor`` although the body used them.

    This loop drives the encoder and decoder one token at a time and relies on
    the following interface, as visible from the calls below:
      * ``encoder.initHidden()`` and ``encoder.hidden_size`` exist, and
        ``encoder(token, hidden)`` returns ``(output, hidden)``;
      * ``decoder(input, hidden, encoder_outputs)`` returns
        ``(output, hidden, attention)``.
    NOTE(review): the ``RNN`` and ``DecoderRNN`` classes in this notebook expose
    a different, batch-oriented interface; confirm which encoder/decoder this
    function is meant to be used with.

    Args:
        input_tensor: source token indices, indexed by time step along dim 0.
        target_tensor: target token indices, indexed by time step along dim 0.
        encoder, decoder: the two networks (see interface above).
        encoder_optimizer, decoder_optimizer: optimisers stepped after backprop.
        criterion: loss called as ``criterion(decoder_output, target_tensor[di])``.
        max_length: number of rows allocated for the encoder outputs; defaults
            to ``MAX_SENTENCE_LENGTH``.

    Returns:
        The summed loss divided by the number of target time steps (float).
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # Decoding starts from the start-of-sentence index defined in this notebook.
    decoder_input = torch.tensor([[SOS_IDX]], device=device)

    # The decoder continues from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            # Stop as soon as the decoder predicts end-of-sentence.
            if decoder_input.item() == EOS_IDX:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

These helper functions format the elapsed time and estimate the time
remaining, given the start time and the fraction of progress completed so far.




In [46]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a run started at ``since``.

    ``percent`` is the fraction of work completed so far; the total duration
    is extrapolated as elapsed / percent, so it must be non-zero.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [47]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train for ``n_iters`` single-pair steps with SGD and NLLLoss, then plot the losses.

    All training pairs are sampled up front with ``random.choice``. The running
    average loss is printed every ``print_every`` steps and recorded for
    plotting every ``plot_every`` steps.

    NOTE(review): ``tensorsFromPair``, ``pairs`` and ``showPlot`` are not
    defined in the visible part of this notebook; confirm they exist before
    running this cell.
    """
    start = time.time()

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Draw every training example before the loop starts.
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    plot_losses = []
    print_loss_total = 0  # accumulates between two printed reports
    plot_loss_total = 0  # accumulates between two plotted points

    for step in range(1, n_iters + 1):
        training_pair = training_pairs[step - 1]
        input_tensor, target_tensor = training_pair[0], training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

### Evaluation

In [56]:
def evaluate(encoder, decoder, sentence, max_length=MAX_SENTENCE_LENGTH):
    """
    Function that generate translation.
    First, feed the source sentence into the encoder and obtain the hidden states from encoder.
    Secondly, feed the hidden states into the decoder and unfold the outputs from the decoder.
    Lastly, for each outputs from the decoder, collect the corresponding words in the target language's vocabulary.
    And collect the attention for each output words.

    Decoding is greedy: at every step the highest-scoring token is emitted and
    fed back as the next decoder input, until the end-of-sentence index is
    predicted or ``max_length`` steps were taken.

    NOTE(review): ``tensorFromSentence`` is not defined in the visible part of
    this notebook; confirm it exists. The encoder/decoder are driven through
    the same token-by-token interface as in ``train`` (``encoder.initHidden()``,
    ``encoder.hidden_size``, a decoder returning ``(output, hidden, attention)``).

    @param encoder: the encoder network
    @param decoder: the decoder network
    @param sentence: string, a sentence in source language to be translated
    @param max_length: the max # of words that the decoder can return
    @output decoded_words: a list of words in target language
    @output decoder_attentions: one row of attention weights per decoding step taken
    """
    # process input sentence
    with torch.no_grad():
        input_tensor = tensorFromSentence(source_language, sentence)
        input_length = input_tensor.size()[0]
        # encode the source language
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_IDX]], device=device)  # SOS
        # decode the context vector
        decoder_hidden = encoder_hidden # decoder starts from the last encoding sentence
        # output of this function
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            # for each time step, the decoder network takes two inputs: previous outputs and the previous hidden states
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)

            # Greedy decoding step.
            # assumes decoder_attention holds max_length weights for this step - TODO confirm
            decoder_attentions[di] = decoder_attention.detach()
            topv, topi = decoder_output.topk(1)
            predicted_idx = topi.item()
            if predicted_idx == EOS_IDX:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(target_language.idx2word[predicted_idx])
            # TODO: beam search as the second decoding strategy discussed in class

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

### Visualizing Attention

In [16]:
def showAttention(input_sentence, output_words, attentions):
    """
    Create a figure whose axes are labelled with the source and output words.

    The attention matrix itself is not drawn yet (see the TODO below), so the
    figure currently shows only the tick labels.

    NOTE(review): ``plt`` and ``ticker`` are used here but are not imported in
    the visible part of this notebook (presumably ``matplotlib.pyplot`` and
    ``matplotlib.ticker``); confirm the imports exist.

    @param - input_sentence: string of space-separated words from the source language
    @param - output_words: list of words labelling the y axis; the caller in
             this notebook passes the words returned by ``evaluate``
    @param - attentions: attention weights to visualise; currently unused
    """
    # Set up figure with colorbar

    fig = plt.figure()
    ax = fig.add_subplot(111)
    
    # TODO: Add your code here to visualize the attention
    # look at documentation for imshow https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.matshow.html

    # Set up axes
    # The leading '' entry is the label of the first tick; '<EOS>' is appended
    # after the source words on the x axis.
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Translate ``input_sentence``, print the input and output, and show the attention figure.

    Uses the notebook-level ``encoder1`` and ``attn_decoder1`` models.
    NOTE(review): neither name is defined in the visible part of this notebook;
    confirm they are created before this function is called.
    """
    translated_words, attention_weights = evaluate(encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(translated_words))
    showAttention(input_sentence, translated_words, attention_weights)
