Machine translation challenges computers not only to understand human languages but also to generate them. A machine translation system can be viewed as a conditional language model: given a source sentence $x_i$, we need to calculate the probability of the generated sentence, $p(y_i|x_i)$. In the early years the focus was on statistical machine translation (SMT), for which the IBM models were the basis. If you are interested, please visit Michael Collins' [webpage](http://www.cs.columbia.edu/~mcollins/), where he provides many useful and clear lecture notes that illustrate the basic terms and models of SMT.

In recent years, with the development of artificial neural networks and deep learning applications, neural translation models have been explored; in particular, the [seq2seq](https://arxiv.org/pdf/1406.1078v3.pdf) model and its successors have improved the performance of machine translation.

This project aims to implement a neural machine translation model based on the seq2seq concept.

Specify the paths of the original dataset.

In [68]:
# Data Parameters
# Directory created by the "Make data directory" step below.
data_dir = 'temp'
# File name for the English-German sentence-pair text file.
data_file = 'eng_ger.txt'

In [69]:
import os

# Test Translation from English (lowercase, no punct)
test_english = ['hello where is my computer',
                'the quick brown fox jumped over the lazy dog',
                'is it going to rain tomorrow']


# Make data directory.
# `os` is imported in this cell because, in file order, the cell runs before
# the one that imports it. exist_ok=True turns the call into a no-op when the
# directory already exists and avoids the check-then-create race of
# os.path.exists() followed by os.makedirs().
os.makedirs(data_dir, exist_ok=True)

## Data Acquisition

Download the data from website if it does not exist.

In [70]:
import os
class dataReader:
    '''
    Load translation pairs from a local text file.

    If the file does not exist when the reader is constructed, the
    English-German sentence pairs are downloaded from www.manythings.org
    and written to ``file_path`` first.
    '''
    def __init__(self, file_path):
        '''
        Args:
            file_path: path of the local text file to read; it is created
                by download when it is missing.
        '''
        self.file_path = file_path
        if not self.__checkExists():
            self.__download()

    def loadData(self):
        '''
        Read the file and return its lines as a list of strings.

        Only the line terminator is stripped, so a final line without a
        trailing newline keeps its last character.
        '''
        with open(self.file_path, 'r') as in_conn:
            return [row.rstrip('\n') for row in in_conn]

    def __download(self):
        '''Download the zipped sentence pairs and write them to ``file_path`` as ASCII text.'''
        # Imported locally so the class does not rely on imports made elsewhere.
        import io
        import urllib.request
        from zipfile import ZipFile

        print('Data not found, downloading Eng-Ger sentences from www.manythings.org')
        sentence_url = 'http://www.manythings.org/anki/deu-eng.zip'
        # Context managers close the HTTP response and the archive handle.
        with urllib.request.urlopen(sentence_url) as response:
            archive_bytes = response.read()
        with ZipFile(io.BytesIO(archive_bytes)) as archive:
            raw_text = archive.read('deu.txt').decode()
        # Format Data: characters outside ASCII are dropped, not transliterated.
        ascii_text = raw_text.encode('ascii', errors='ignore').decode()
        sentences = ascii_text.split('\n')
        # Make sure the target directory exists before writing.
        parent_dir = os.path.dirname(self.file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Write to file, one sentence pair per line.
        with open(self.file_path, 'w') as out_conn:
            out_conn.writelines(sentence + '\n' for sentence in sentences)

    def __checkExists(self):
        '''Return True when ``file_path`` is an existing regular file.'''
        return os.path.isfile(self.file_path)

In [71]:
# Read the sentence pairs from <data_dir>/<data_file> so the data parameters
# defined at the top of the notebook are honoured, instead of a hard-coded
# file name in the current working directory.
dr = dataReader(os.path.join(data_dir, data_file))
eng_ger_data = dr.loadData()

In [72]:
eng_ger_data[:10]

['Hi.\tHallo!',
 'Hi.\tGr Gott!',
 'Run!\tLauf!',
 'Wow!\tPotzdonner!',
 'Wow!\tDonnerwetter!',
 'Fire!\tFeuer!',
 'Help!\tHilfe!',
 'Help!\tZu Hlf!',
 'Stop!\tStopp!',
 'Wait!\tWarte!']

## Data Processing

Preprocess the original data.

In [73]:
import string
vocab_size = 10000
class textHandler:
    '''
    Turn raw "source<TAB>target" lines into tokenised sentences,
    vocabularies and id sequences.

    Reserved ids: 0 is 'SOS', 1 is 'EOS', and ``vocab_size - 1`` stands for
    any word that is not in the vocabulary.
    '''
    def __init__(self, data, vocab_size):
        '''
        Args:
            data: iterable of strings, each holding a source and a target
                sentence separated by a tab.
            vocab_size: size of each vocabulary, reserved ids included.
        '''
        self.data = data
        self.vocab_size = vocab_size
        self.__sentSplit()

    def __removePunctuation(self):
        '''Return the raw lines with every ``string.punctuation`` character removed.'''
        # A translation table strips all punctuation in a single pass per line.
        strip_table = str.maketrans('', '', string.punctuation)
        return [sent.translate(strip_table) for sent in self.data]

    def __sentSplit(self):
        '''Split every line into a lower-cased, whitespace-tokenised source/target pair.'''
        source_sentence, target_sentence = [], []
        for line in self.__removePunctuation():
            fields = line.split('\t')
            # Blank or malformed lines (no tab) are skipped; columns after the
            # first two are ignored.
            if len(fields) < 2:
                continue
            source_sentence.append(fields[0].lower().split())
            target_sentence.append(fields[1].lower().split())
        self.source_sentence = source_sentence
        self.target_sentence = target_sentence

    def __buildVocab(self, sents):
        '''
        Build the word<->id mappings for one language.

        Args:
            sents: list of tokenised sentences (lists of words).

        Returns:
            (vocab2ix, ix2vocab): word-to-id and id-to-word dictionaries.
            ``ix2vocab`` additionally maps the unknown-word id to 'UNK'.
        '''
        # Local import: Counter is not imported at file level.
        from collections import Counter

        word_counts = Counter(word for sent in sents for word in sent)
        # Keep the most frequent words; three ids are reserved (SOS, EOS, unknown).
        word_keys = [word for word, _ in word_counts.most_common(self.vocab_size - 3)]
        # Word to ID, real words start at 2 because 0 and 1 are SOS / EOS.
        vocab2ix = {word: ix for ix, word in enumerate(word_keys, start=2)}
        vocab2ix['SOS'] = 0
        vocab2ix['EOS'] = 1
        # ID to Word
        ix2vocab = {ix: word for word, ix in vocab2ix.items()}
        # sent2vec emits vocab_size - 1 for unknown words, so make that id decodable too.
        ix2vocab[self.vocab_size - 1] = 'UNK'
        return vocab2ix, ix2vocab

    def getSents(self):
        '''Return the preprocessed (source_sentences, target_sentences).'''
        return self.source_sentence, self.target_sentence

    def sent2vec(self, sents, vocab2ix):
        '''
        Map tokenised sentences to lists of word ids.

        Args:
            sents: list of tokenised sentences (lists of words).
            vocab2ix: word-to-id dictionary.

        Returns:
            A list with one id list per sentence; words missing from
            ``vocab2ix`` become ``vocab_size - 1``.
        '''
        unknown_ix = self.vocab_size - 1
        return [[vocab2ix.get(word, unknown_ix) for word in sent] for sent in sents]

    def generateVocab(self):
        '''Return (source_vocab2ix, source_ix2vocab, target_vocab2ix, target_ix2vocab).'''
        source_vocab2ix, source_ix2vocab = self.__buildVocab(self.source_sentence)
        target_vocab2ix, target_ix2vocab = self.__buildVocab(self.target_sentence)
        return source_vocab2ix, source_ix2vocab, target_vocab2ix, target_ix2vocab
        


In [74]:
th = textHandler(data=eng_ger_data, vocab_size=vocab_size)

In [75]:
english_sentence, german_sentence = th.getSents()

In [76]:
eng_vocab2ix, eng_ix2vocab, ger_vocab2ix, ger_ix2vocab = th.generateVocab()

In [77]:
# Convert the tokenised sentences of each language into lists of word ids,
# using that language's own word-to-id vocabulary.
english_processed = th.sent2vec(english_sentence, eng_vocab2ix)
german_processed = th.sent2vec(german_sentence, ger_vocab2ix)

In [97]:
# Three English sample sentences: lower-cased, split on whitespace, then
# mapped to ids with the English vocabulary.
test_data = th.sent2vec(
    [sentence.lower().split()
     for sentence in ('I love this dog', 'What a nice day', 'This is a book')],
    eng_vocab2ix)

In [98]:
test_data

[[5, 168, 17, 191], [24, 7, 392, 117], [17, 8, 7, 123]]

## Build a simple encoder-decoder architecture

In this demo, we use a simple encoder-decoder architecture to train and infer translations. We encode all the word vectors of a source sentence into a single fixed-size vector, and then feed this vector to the decoder as its input.

In [99]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

use_cuda = torch.cuda.is_available()

In [100]:
class EncoderRNN(nn.Module):
    '''
    GRU encoder that consumes one source word id per call.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: width of the embeddings and of the GRU hidden state.
        n_layers: how many times the GRU is applied per step; the same GRU
            module (and therefore the same weights) is reused each time.
    '''
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        '''
        Embed one word id and advance the GRU.

        Args:
            input: tensor holding a single word id.
            hidden: previous hidden state, shaped like ``initHidden()``.

        Returns:
            (output, hidden) as returned by the last GRU application.
        '''
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size) for the GRU.
        output = self.embedding(input).view(1, 1, -1)
        for _ in range(self.n_layers):
            output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        '''Return an all-zero hidden state of shape (1, 1, hidden_size).'''
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        # Follow the device of this module's own weights instead of a global
        # "CUDA is available" flag: a model left on the CPU must get a CPU
        # hidden state even on a machine that has a GPU.
        if next(self.parameters()).is_cuda:
            result = result.cuda()
        return result

In [None]:
class DecoderRNN(nn.Module):
    '''
    GRU decoder that predicts one target word per call.

    Args:
        hidden_size: width of the embeddings and of the GRU hidden state.
        output_size: number of rows in the embedding table and number of
            scores produced per step.
        n_layers: how many times ReLU + GRU is applied per step; the same
            GRU module (and therefore the same weights) is reused each time.
    '''
    def __init__(self, hidden_size, output_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        # Create embedding
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        # forward() feeds a (1, output_size) tensor, so normalise over dim 1.
        # Stating the dimension avoids the deprecated implicit choice.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        '''
        Predict the next word from the previous word id and hidden state.

        Args:
            input: tensor holding a single word id.
            hidden: previous hidden state, shaped like ``initHidden()``.

        Returns:
            (output, hidden): ``output`` holds log-probabilities with shape
            (1, output_size); ``hidden`` is the updated GRU state.
        '''
        # Transform input word id into an embedding shaped (1, 1, hidden_size)
        output = self.embedding(input).view(1, 1, -1)
        # Generate output through input and last hidden state
        for _ in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        '''Return an all-zero hidden state of shape (1, 1, hidden_size).'''
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        # Follow the device of this module's own weights instead of a global
        # "CUDA is available" flag, so state and weights never end up on
        # different devices.
        if next(self.parameters()).is_cuda:
            result = result.cuda()
        return result

## Training Data
In order to understand the mechanism of neural machine translation, we process a single pair of translated sentences at a time instead of a batch of them.

In [128]:
# Width of the word embeddings and of the GRU hidden state
hidden_size = 256
# Row count of the encoder_outputs buffer and the length limit used when
# filtering sentence pairs further down
max_length = 10
# The embedding table of the encoder has vocab_size rows
encoder1 = EncoderRNN(vocab_size, hidden_size)

In [129]:
# Pick one sentence pair (position 100) to walk through a single step by hand
index = 100
input_variable = english_processed[index]
target_variable = german_processed[index]

In [130]:
#Transform the input data and target into Variable vectors
input_variable = Variable(torch.LongTensor(input_variable).view(-1, 1))
target_variable = Variable(torch.LongTensor(target_variable).view(-1, 1))
input_length = input_variable.size()[0]
target_length = target_variable.size()[0]

### Encoder
We can compress a series of word embeddings into a final hidden state and an output through an RNN.
$$h_t = f(h_{t-1}, x_t)$$

In [131]:
encoder_hidden = encoder1.initHidden()
encoder_outputs = Variable(torch.zeros(max_length, encoder1.hidden_size))
# Move the buffer to the GPU only when one is available: an unconditional
# .cuda() raises on CPU-only machines.
encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
# Calculate the final state of input words, one word id per encoder call
for ei in range(input_length):
    encoder_output, encoder_hidden = encoder1(
        input_variable[ei], encoder_hidden)
    # Row ei keeps the GRU output produced for source word ei
    encoder_outputs[ei] = encoder_output[0][0]

### Decoder without Attention

In the decoder part, for training, we only take two inputs into consideration: the first is the provided target variables, and the second is the previous hidden state, which is initialized with the final state ($C_T$) of the encoder.
$$h_t = f(h_{t-1}, y_{t-1}), h_0=C_T$$

In [132]:
#Create an instance for decoder
decoder1 = DecoderRNN(hidden_size, vocab_size)

In [133]:
# Step size shared by both optimizers
learning_rate = 0.001
# Plain SGD, one optimizer per network
encoder_optimizer = optim.SGD(encoder1.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder1.parameters(), lr=learning_rate)

In [137]:
loss = 0
criterion = nn.NLLLoss()
# Decoding starts from the SOS token (id 0)
decoder_input = Variable(torch.LongTensor([[0]]))
# Set the beginning hidden state of decoder as the final state of encoder
decoder_hidden = encoder_hidden
for di in range(target_length):
    decoder_output, decoder_hidden = decoder1(
        decoder_input, decoder_hidden)
    # decoder_output is (1, vocab) and target_variable[di] is (1,): a batch of
    # one. Passing decoder_output[0] instead would hand NLLLoss a 1-D input
    # together with a 1-element target, which newer PyTorch releases reject
    # as a size mismatch.
    loss += criterion(decoder_output, target_variable[di])
    # Set the target as input
    decoder_input = target_variable[di]  # Teacher forcing

In [135]:
print(loss)

Variable containing:
 18.2935
[torch.FloatTensor of size 1]



## Wrap it up

Now, we can put the training procedures in one function.

In [110]:
compressed = list(zip(english_processed, german_processed))

In [151]:
#Filter those long sentences
pairs_filtered = []
#Because we need to add one ending tokens later, so substract 1 here
for item in compressed:
    if len(item[0]) <= (max_length-1) and len(item[0]) > 3:
        pairs_filtered.append(item)

In [152]:
len(pairs_filtered)

122833

In [153]:
import numpy as np
criterion = nn.NLLLoss()
def training(encoder, decoder, encoder_optimizer, decoder_optimizer, epochs=1):
    '''
    Train the encoder/decoder with teacher forcing, one sentence pair per step.

    Reads the module-level ``pairs_filtered`` (a list of
    ``(source_ids, target_ids)`` pairs) and ``criterion``. ``pairs_filtered``
    is shuffled in place at the start of every epoch; the id lists inside it
    are left untouched. The loss per target token is printed every 200 pairs.

    Args:
        encoder: module called as ``encoder(word_id, hidden)`` and providing
            ``initHidden()``.
        decoder: module called as ``decoder(word_id, hidden)``.
        encoder_optimizer: optimizer over the encoder parameters.
        decoder_optimizer: optimizer over the decoder parameters.
        epochs: number of passes over ``pairs_filtered``.
    '''
    for _ in range(epochs):
        np.random.shuffle(pairs_filtered)
        for c, pair in enumerate(pairs_filtered):
            # Add the ending token (id 1) to *copies* of the id lists.
            # Appending to pair[0] / pair[1] directly would add one more EOS
            # to the shared lists on every epoch and every re-run.
            input_data = list(pair[0]) + [1]
            target_data = list(pair[1]) + [1]
            # Transform the input data and target into column vectors
            input_variable = Variable(torch.LongTensor(input_data).view(-1, 1))
            target_variable = Variable(torch.LongTensor(target_data).view(-1, 1))
            input_length = input_variable.size()[0]
            target_length = target_variable.size()[0]
            # Clear grads
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            # Calculate the final state of input words
            encoder_hidden = encoder.initHidden()
            for i in range(input_length):
                _, encoder_hidden = encoder(input_variable[i], encoder_hidden)
            loss = 0
            # Decoding starts from the SOS token (id 0)
            decoder_input = Variable(torch.LongTensor([[0]]))
            # Set the beginning hidden state of decoder as the final state of encoder
            decoder_hidden = encoder_hidden
            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
                # Pass the output and target_variable[di] of shape (1,)
                # together as a batch of one; indexing decoder_output[0]
                # first yields an input/target combination that newer
                # PyTorch releases reject as a size mismatch.
                loss += criterion(decoder_output, target_variable[di])
                # Set the target as input
                decoder_input = target_variable[di]  # Teacher forcing
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()
            if c % 200 == 0:
                # view(-1)[0] reads the scalar whether the loss is stored as
                # a 1-element or as a 0-dim tensor.
                print(float(loss.data.view(-1)[0]) / target_length)

In [154]:
# Create new encoder/decoder instances with their own SGD optimizers,
# then run training with the default number of epochs.
encoder1 = EncoderRNN(vocab_size, hidden_size)
decoder1 = DecoderRNN(hidden_size, vocab_size)
encoder1_optimizer = optim.SGD(encoder1.parameters(), lr=learning_rate)
decoder1_optimizer = optim.SGD(decoder1.parameters(), lr=learning_rate)
training(encoder1, decoder1, encoder1_optimizer, decoder1_optimizer)

9.267947605678014
7.598428998674665
6.105644226074219
5.3449232918875555
7.253068542480468
7.965466181437175
6.930266571044922
5.051567077636719
6.856137084960937
6.641273498535156
6.831576824188232
7.747750759124756
7.481189727783203
7.446408952985491
4.015239715576172
7.633642408582899
6.758612738715278
5.9428151448567705
6.0509490966796875
6.263080596923828
6.491754531860352
6.0975807189941404
5.787358093261719
5.547765731811523
5.210450914171007
5.621366228376116
5.877564748128255
6.181207275390625
6.781126658121745
6.061775970458984
7.495883941650391
2.7922122478485107
5.699005550808376
4.900494893391927
4.520580927530925
5.432256426130023
5.0531158447265625
6.049356460571289
4.832286198933919
3.9473625818888345
4.982153574625651
6.0291748046875
6.019882678985596
7.092292022705078
4.728400230407715
6.9780455695258246
4.893213272094727
6.640546625310725
6.496949005126953
6.048442459106445
4.841963450113933
6.238411903381348
5.215022193060981
5.563482284545898
4.446040562220982
6.46

KeyboardInterrupt: 

In [155]:
def evaluate(encoder, decoder, sentence, max_length=10):
    '''
    Greedily translate one sentence of source word ids and print the result.

    Reads the module-level ``ger_ix2vocab`` to turn predicted ids into words.

    Args:
        encoder: module called as ``encoder(word_id, hidden)`` and providing
            ``initHidden()``.
        decoder: module called as ``decoder(word_id, hidden)``.
        sentence: list of source word ids. The ending token (id 1) is added
            when the list does not already end with it, because the training
            loop always feeds sources that end with that token.
        max_length: maximum number of tokens to generate.

    Returns:
        The list of decoded words, ending with '<EOS>' when the decoder
        produced the ending token within ``max_length`` steps. The same list
        is also printed.
    '''
    source_ids = list(sentence)
    if not source_ids or source_ids[-1] != 1:
        source_ids.append(1)
    input_variable = Variable(torch.LongTensor(source_ids).view(-1, 1))

    # Run the encoder over the source, one word id at a time
    encoder_hidden = encoder.initHidden()
    for ei in range(input_variable.size(0)):
        _, encoder_hidden = encoder(input_variable[ei], encoder_hidden)

    # Set the initial value as SOS token
    decoder_input = Variable(torch.LongTensor([[0]]))  # SOS
    decoder_hidden = encoder_hidden

    decoded_words = []
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden)
        topv, topi = decoder_output.data.topk(1)
        # int() gives a plain Python number whether indexing returns a number
        # or a 0-dim tensor; a tensor would not work as a dictionary key.
        ni = int(topi[0][0])
        if ni == 1:
            decoded_words.append('<EOS>')
            break
        # Ids without a vocabulary entry (e.g. the unknown-word id) are shown
        # as '<UNK>' instead of raising KeyError.
        decoded_words.append(ger_ix2vocab.get(ni, '<UNK>'))
        # Feed the prediction back in as the next input
        decoder_input = Variable(torch.LongTensor([[ni]]))

    print(decoded_words)
    return decoded_words

In [156]:
test_data[1]

[24, 7, 392, 117]

In [157]:
evaluate(encoder1, decoder1, test_data[2])

['tom', 'hat', 'sich', 'nicht', '<EOS>']
