In [1]:
import tensorflow as tf

In [3]:
from collections import Counter

import NMT_Model
import nmt_data_utils
import nmt_model_utils

In [4]:
# Read the English side of the corpus into a list of lines.
# Trailing newlines are kept here; they are stripped in a later cell.
with open('europarl-v7.de-en.en', mode='r', encoding='utf-8') as en_file:
    en = en_file.readlines()

In [5]:
# Read the German side of the corpus into a list of lines.
# Trailing newlines are kept here; they are stripped in a later cell.
with open('europarl-v7.de-en.de', mode='r', encoding='utf-8') as de_file:
    de = de_file.readlines()

In [6]:
# Sanity check: the two files are paired line by line in the cells below,
# so both lists should have the same length.
len(en), len(de)

(1920209, 1920209)

In [7]:
# Peek at the first five (English, German) line pairs.
for pair in zip(en[:5], de[:5]):
    print(pair, '\n')

('Resumption of the session\n', 'Wiederaufnahme der Sitzungsperiode\n') 

('I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.\n', 'Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.\n') 

("Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.\n", 'Wie Sie feststellen konnten, ist der gefürchtete "Millenium-Bug " nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden.\n') 

('You have requested a debate on this subject in the course of the next few days, during this p

In [8]:
# Strip leading/trailing whitespace (including the trailing newline) from every line.
de = list(map(str.strip, de))
en = list(map(str.strip, en))

In [9]:
# We only use sentences of similar length in order to make training easier.
# Note: len() of a string counts characters, so this keeps the lengths of
# English sentences that are 21 to 49 characters long.
len_en = [n_chars for n_chars in map(len, en) if 20 < n_chars < 50]
# (length, count) pairs, most frequent length first.
len_dist = Counter(len_en).most_common()
len_dist

[(47, 7266),
 (49, 7113),
 (45, 6928),
 (35, 6833),
 (48, 6813),
 (21, 6642),
 (44, 6519),
 (46, 6491),
 (43, 6443),
 (40, 6130),
 (42, 6108),
 (37, 5824),
 (41, 5793),
 (34, 5711),
 (39, 5682),
 (29, 5659),
 (38, 5599),
 (33, 5496),
 (36, 5452),
 (31, 4651),
 (32, 4554),
 (30, 4441),
 (27, 4117),
 (28, 4062),
 (26, 3989),
 (25, 3911),
 (24, 3762),
 (23, 3473),
 (22, 2776)]

In [10]:
# 158238 English sentences are between 21 and 49 characters long
# (the filter above uses len(sent) on strings, i.e. characters, not words).
len(len_en)

158238

In [11]:
# Keep only the sentence pairs whose English side is 21 to 49 characters long;
# the German sentence is kept alongside so both lists stay aligned.
kept_pairs = [
    (sent_de, sent_en)
    for sent_de, sent_en in zip(de, en)
    if 20 < len(sent_en) < 50
]
_de = [sent_de for sent_de, sent_en in kept_pairs]
_en = [sent_en for sent_de, sent_en in kept_pairs]

In [12]:
# Fetch NLTK's 'punkt' package (a no-op if it is already up to date).
# NOTE(review): presumably required by nmt_data_utils.preprocess below — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pares\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
%%time
# We do not use all of the roughly 158,000 filtered sentences yet; only the first 5000 pairs to start with.
# preprocess returns the processed sentences plus a token frequency list (see the cells below).
# NOTE(review): language = 'german' presumably switches the tokenizer language — confirm in nmt_data_utils.
en_preprocessed, en_most_common = nmt_data_utils.preprocess(_en[:5000])
de_preprocessed, de_most_common = nmt_data_utils.preprocess(_de[:5000], language = 'german')

Wall time: 2.6 s


In [14]:
# Both sides should still hold one entry per input sentence;
# entries that came back empty are removed in the next cell.
len(en_preprocessed), len(de_preprocessed)


(5000, 5000)

In [15]:
# For some sentences there is no German or English counterpart after
# preprocessing, i.e. only an empty list []. Keep just the pairs in which
# both sides are non-empty so the two lists stay aligned.
nonempty_pairs = [
    (sent_en, sent_de)
    for sent_en, sent_de in zip(en_preprocessed, de_preprocessed)
    if sent_en != [] and sent_de != []
]
en_preprocessed_clean = [sent_en for sent_en, sent_de in nonempty_pairs]
de_preprocessed_clean = [sent_de for sent_en, sent_de in nonempty_pairs]

In [16]:
# Number of pairs left after dropping those with an empty side
# (in the saved run, 12 of the 5000 pairs were removed).
len(en_preprocessed_clean), len(de_preprocessed_clean)

(4988, 4988)

In [17]:
# Show the first five preprocessed (tokenised) sentence pairs.
for en_tokens, de_tokens in zip(en_preprocessed_clean[:5], de_preprocessed_clean[:5]):
    print('English:\n', en_tokens)
    print('German:\n', de_tokens, '\n'*3)

English:
 ['resumption', 'of', 'the', 'session']
German:
 ['wiederaufnahme', 'der', 'sitzungsperiode'] 



English:
 ['please', 'rise', ',', 'then', ',', 'for', 'this', 'minute', "'", 's', 'silence', '.']
German:
 ['ich', 'bitte', 'sie', ',', 'sich', 'zu', 'einer', 'schweigeminute', 'zu', 'erheben', '.'] 



English:
 ['(', 'the', 'house', 'rose', 'and', 'observed', 'a', 'minute', "'", 's', 'silence', ')']
German:
 ['(', 'das', 'parlament', 'erhebt', 'sich', 'zu', 'einer', 'schweigeminute', '.', ')'] 



English:
 ['madam', 'president', ',', 'on', 'a', 'point', 'of', 'order', '.']
German:
 ['frau', 'präsidentin', ',', 'zur', 'geschäftsordnung', '.'] 



English:
 ['madam', 'president', ',', 'on', 'a', 'point', 'of', 'order', '.']
German:
 ['frau', 'präsidentin', ',', 'zur', 'geschäftsordnung', '.'] 





In [18]:
# The 15 most frequent English tokens with their counts, followed by the
# lengths of the English and German frequency lists.
en_most_common[:15], len(en_most_common), len(de_most_common)

([('.', 3981),
  ('the', 1864),
  ('is', 1371),
  ('this', 860),
  (',', 842),
  ('to', 822),
  ('we', 736),
  ('i', 677),
  ('that', 619),
  ('a', 611),
  ('of', 592),
  ('it', 486),
  ('not', 474),
  (')', 451),
  ('(', 450)],
 4135,
 5410)

## Create vocab

In [19]:
# Now we can create our lookup dicts for English and German, i.e. our vocab.
# The special tokens are included as well; they are used later on in the model.
specials = ["<unk>", "<s>", "</s>", "<pad>"]

en_word2ind, en_ind2word, en_vocab_size = nmt_data_utils.create_vocab(
    en_most_common, specials)
de_word2ind, de_ind2word, de_vocab_size = nmt_data_utils.create_vocab(
    de_most_common, specials)

In [20]:
# In the saved run each vocab size equals the length of the corresponding
# frequency list plus the 4 special tokens (4135 + 4 and 5410 + 4).
en_vocab_size, de_vocab_size

(4139, 5414)

## Convert to indices

In [21]:
# To feed the sentences to the network they have to be converted to ints,
# i.e. to the indices of their tokens in the lookup dicts.
# The source (English) sentences are reversed, as this makes learning easier
# for the seq2seq model. An end-of-sentence tag is appended to both sides;
# only the target (German) side additionally gets a start-of-sentence tag.
en_inds, en_unknowns = nmt_data_utils.convert_to_inds(
    en_preprocessed_clean, en_word2ind, reverse=True, eos=True)
de_inds, de_unknowns = nmt_data_utils.convert_to_inds(
    de_preprocessed_clean, de_word2ind, sos=True, eos=True)

In [22]:
# Round-trip check: decode the first two encoded English sentences back to words.
# They should come out reversed and terminated by '</s>'.
[nmt_data_utils.convert_to_words(sentence, en_ind2word) for sentence in  en_inds[:2]]

[['session', 'the', 'of', 'resumption', '</s>'],
 ['.',
  'silence',
  's',
  "'",
  'minute',
  'this',
  'for',
  ',',
  'then',
  ',',
  'rise',
  'please',
  '</s>']]

## Train the model

In [23]:
# Hyperparameters, passed to NMT_Model.NMT in the cells below.
# They are probably not optimal, but work fine for now.
# NOTE(review): the meanings noted below are inferred from the names only —
# confirm against NMT_Model before relying on them.

# Encoder/decoder depth and RNN cell size.
num_layers_encoder = 4
num_layers_decoder = 4
rnn_size_encoder = 128
rnn_size_decoder = 128
# Size of the word embeddings.
embedding_dim = 300

# Optimisation settings; `clip` is presumably a gradient-clipping threshold and
# `keep_probability` presumably the dropout keep probability — TODO confirm.
batch_size = 64
epochs = 5 
clip = 5
keep_probability = 0.8
learning_rate = 0.01
learning_rate_decay_steps = 1000
learning_rate_decay = 0.9

In [24]:
# Reset the graph, construct the model in TRAIN mode with the hyperparameters
# defined above, build the graph and train on the index-encoded sentence pairs.
nmt_model_utils.reset_graph()

nmt = NMT_Model.NMT(
    en_word2ind,
    en_ind2word,
    de_word2ind,
    de_ind2word,
    './models/local_one/my_model',  # model path; reused as restore_path at inference
    'TRAIN',
    embedding_dim=embedding_dim,
    num_layers_encoder=num_layers_encoder,
    num_layers_decoder=num_layers_decoder,
    batch_size=batch_size,
    clip=clip,
    keep_probability=keep_probability,
    learning_rate=learning_rate,
    epochs=epochs,
    rnn_size_encoder=rnn_size_encoder,
    rnn_size_decoder=rnn_size_decoder,
    learning_rate_decay_steps=learning_rate_decay_steps,
    learning_rate_decay=learning_rate_decay,
)

nmt.build_graph()
nmt.train(en_inds, de_inds)

Graph built.
-------------------- Epoch 0 of 5 --------------------
Iteration: 0 of 77	train_loss: 8.5966
Iteration: 2 of 77	train_loss: 15.3424
Iteration: 4 of 77	train_loss: 8.3517
Iteration: 6 of 77	train_loss: 7.8113
Iteration: 8 of 77	train_loss: 7.1683
Iteration: 10 of 77	train_loss: 6.5226
Iteration: 12 of 77	train_loss: 6.3681
Iteration: 14 of 77	train_loss: 5.9449
Iteration: 16 of 77	train_loss: 5.9254
Iteration: 18 of 77	train_loss: 5.7976
Iteration: 20 of 77	train_loss: 5.3574
Iteration: 22 of 77	train_loss: 5.4124
Iteration: 24 of 77	train_loss: 5.3263
Iteration: 26 of 77	train_loss: 5.3544
Iteration: 28 of 77	train_loss: 5.3601
Iteration: 30 of 77	train_loss: 5.1930
Iteration: 32 of 77	train_loss: 5.1226
Iteration: 34 of 77	train_loss: 4.8164
Iteration: 36 of 77	train_loss: 4.9414
Iteration: 38 of 77	train_loss: 4.5892
Iteration: 40 of 77	train_loss: 4.8798
Iteration: 42 of 77	train_loss: 4.7126
Iteration: 44 of 77	train_loss: 4.5648
Iteration: 46 of 77	train_loss: 4.4728


Iteration: 72 of 77	train_loss: 2.1395
Iteration: 74 of 77	train_loss: 2.1505
Iteration: 76 of 77	train_loss: 2.3596
Iteration: 77 of 77	train_loss: 2.1795
Average Score for this Epoch: 2.1676385402679443
--- new best score ---


-------------------- Epoch 5 of 5 --------------------
Iteration: 0 of 77	train_loss: 2.2201
Iteration: 2 of 77	train_loss: 2.0553
Iteration: 4 of 77	train_loss: 1.6248
Iteration: 6 of 77	train_loss: 1.7976
Iteration: 8 of 77	train_loss: 1.6297
Iteration: 10 of 77	train_loss: 1.8938
Iteration: 12 of 77	train_loss: 1.9537
Iteration: 14 of 77	train_loss: 1.9417
Iteration: 16 of 77	train_loss: 1.9504
Iteration: 18 of 77	train_loss: 1.8164
Iteration: 20 of 77	train_loss: 2.0102
Iteration: 22 of 77	train_loss: 1.8986
Iteration: 24 of 77	train_loss: 1.9646
Iteration: 26 of 77	train_loss: 1.9346
Iteration: 28 of 77	train_loss: 2.1057
Iteration: 30 of 77	train_loss: 2.0597
Iteration: 32 of 77	train_loss: 2.0495
Iteration: 34 of 77	train_loss: 2.2313
Iteration: 36 of 7

## Test the Model

In [28]:
# Targets for the inference run below, encoded with the same arguments that
# were used for de_inds in the "Convert to indices" section.
_de_inds, _de_unknowns = nmt_data_utils.convert_to_inds(
    de_preprocessed_clean, de_word2ind, sos=True, eos=True)

In [29]:
# The inference model does not necessarily need mini-batches: we can feed it
# the whole input at once, but the batch size then has to be set to the
# number of input sentences.
# NOTE: the 50 sentences used here are part of the data passed to nmt.train
# above, so the translations below are not a held-out evaluation.
nmt_model_utils.reset_graph()

nmt = NMT_Model.NMT(en_word2ind,
                    en_ind2word,
                    de_word2ind,
                    de_ind2word,
                    './models/local_one/my_model',
                    'INFER',
                    # Use the same embedding size as in training so that the
                    # restored checkpoint still matches the graph when the
                    # hyperparameter differs from the constructor default.
                    embedding_dim = embedding_dim,
                    num_layers_encoder = num_layers_encoder,
                    num_layers_decoder = num_layers_decoder,
                    batch_size = len(en_inds[:50]),
                    keep_probability = 1.0,
                    learning_rate = 0.0,
                    beam_width = 0,
                    rnn_size_encoder = rnn_size_encoder,
                    rnn_size_decoder = rnn_size_decoder)

nmt.build_graph()
preds = nmt.infer(en_inds[:50], restore_path =  './models/local_one/my_model', targets = _de_inds[:50])

Graph built.
Restore graph from  ./models/local_one/my_model
INFO:tensorflow:Restoring parameters from ./models/local_one/my_model


In [30]:
# Show some of the created translations.
# Note: the way the BLEU score is computed here is probably not ideal. For
# these short sentences NLTK warns about zero higher-order n-gram overlaps
# and suggests a lower n-gram order or a SmoothingFunction().
nmt_model_utils.sample_results(
    preds,
    en_ind2word,
    de_ind2word,
    en_word2ind,
    de_word2ind,
    _de_inds[:50],
    en_inds[:50],
)




 ----------------------------------------------------------------------------------------------------
Actual Text:
resumption of the session

Actual translation:
wiederaufnahme der sitzungsperiode

Created translation:
wiederaufnahme frauen sitzungsperiode sitzungsperiode dialog

Bleu-score: 1.4488496539373276e-231



 ----------------------------------------------------------------------------------------------------
Actual Text:
please rise , then , for this minute ' s silence .

Actual translation:
ich bitte sie , sich zu einer schweigeminute zu erheben .

Created translation:
einführung fordern erhebt erhebt erhebt erhebt aufgabe aufgabe aufgabe aufgabe aufgabe aufgabe bekommen

Bleu-score: 0



 ----------------------------------------------------------------------------------------------------
Actual Text:
( the house rose and observed a minute ' s silence )

Actual translation:
( das parlament erhebt sich zu einer schweigeminute . )

Created translation:
( beifall erhebt erhe

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
