## Language Translator

In [1]:
import nltk
import regex as re
import collections

import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros

from keras.models import Sequential
from keras import layers
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

#### Bring in text

In [2]:
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to
[nltk_data]     /Users/reneehall/nltk_data...
[nltk_data]   Package comtrans is already up-to-date!


True

In [3]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-de-en.txt')[0])

<AlignedSent: 'Wiederaufnahme der S...' -> 'Resumption of the se...'>


In [4]:
len(comtrans.aligned_sents('alignment-de-en.txt'))

33334

In [23]:
als = comtrans.aligned_sents('alignment-de-en.txt')

In [24]:
# Split every aligned pair into its two token lists: `words` holds the German
# side and `mots` the English side (compare the de[3] / en[3] samples below).
de = [sent.words for sent in als]
en = [sent.mots for sent in als]

In [25]:
# for now, let's just select 10000 entries to reduce training time
# (the corpus holds 33334 aligned pairs); both lists are cut at the same
# index so that de[i] and en[i] still belong to the same sentence pair
de = de[:10000]
en = en[:10000]

In [8]:
de[3]

['Im',
 'Parlament',
 'besteht',
 'der',
 'Wunsch',
 'nach',
 'einer',
 'Aussprache',
 'im',
 'Verlauf',
 'dieser',
 'Sitzungsperiode',
 'in',
 'den',
 'nächsten',
 'Tagen',
 '.']

In [9]:
en[3]

['You',
 'have',
 'requested',
 'a',
 'debate',
 'on',
 'this',
 'subject',
 'in',
 'the',
 'course',
 'of',
 'the',
 'next',
 'few',
 'days',
 ',',
 'during',
 'this',
 'part-session',
 '.']

#### Clean the text

In [10]:
def clean(docs):
    """Normalise tokenised sentences into lower-case, space-joined strings.

    Args:
        docs: iterable of sentences, each an iterable of string tokens.

    Returns:
        A list with one string per sentence. Only tokens made up entirely of
        alphabetic characters are kept, so punctuation, numbers and mixed
        tokens such as 'part-session' are dropped as a whole (their special
        characters are not merely stripped).
    """
    # A token that passes str.isalpha() cannot contain whitespace, so no
    # separate whitespace-collapsing pass is needed before joining.
    return [
        " ".join(token.lower() for token in sentence if token.isalpha())
        for sentence in docs
    ]

In [26]:
en = clean(en)

In [27]:
de = clean(de)

In [13]:
de[0]

'wiederaufnahme der sitzungsperiode'

#### Add start (`<sos>`) and end (`<eos>`) markers

In [28]:
# The encoder reads the cleaned English sentences unchanged.
input_sentences = en

# Decoder targets: each German sentence followed by the end-of-sentence marker.
output_sentences = [german + ' <eos>' for german in de]

# Decoder inputs: the same German sentences preceded by the start marker,
# i.e. the targets shifted by one position.
output_sentences_inputs = ['<sos> ' + german for german in de]

In [29]:
output_sentences[5]

'das parlament erhebt sich zu einer schweigeminute <eos>'

In [30]:
output_sentences_inputs[5]

'<sos> das parlament erhebt sich zu einer schweigeminute'

In [17]:
# Word-frequency tables for both languages (the sentences are space-joined
# strings at this point, so str.split() recovers the words).
english_words_counter = collections.Counter(word for sentence in en for word in sentence.split())
de_words_counter = collections.Counter(word for sentence in de for word in sentence.split())

# The total number of words is the sum of the frequencies, so the corpus does
# not have to be flattened again just to measure its length.
print('{} English words.'.format(sum(english_words_counter.values())))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
# Unpacking the (word, count) pairs directly also works for an empty corpus,
# where indexing the result of zip(*...) would raise IndexError.
print('"' + '" "'.join(word for word, _ in english_words_counter.most_common(10)) + '"')
print()
print('{} German words.'.format(sum(de_words_counter.values())))
print('{} unique German words.'.format(len(de_words_counter)))
print('10 Most common words in the German dataset:')
print('"' + '" "'.join(word for word, _ in de_words_counter.most_common(10)) + '"')

188812 English words.
8992 unique English words.
10 Most common words in the English dataset:
"the" "of" "to" "and" "in" "is" "a" "that" "this" "we"

171280 German words.
15880 unique German words.
10 Most common words in the German dataset:
"die" "der" "und" "in" "wir" "zu" "ich" "das" "den" "ist"


#### Tokenize

In [31]:
from keras.preprocessing.text import Tokenizer

# Build the English (encoder-side) vocabulary and turn every input sentence
# into a list of integer word indices.
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# word -> index mapping learned from the input sentences
word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

# Length of the longest encoded sentence; reused below as the padded encoder length.
max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 8992
Length of longest sentence in input: 39


In [32]:
# German (decoder-side) vocabulary. filters='' switches off the tokenizer's
# character filtering, so the '<sos>' / '<eos>' markers survive as tokens.
output_tokenizer = Tokenizer(filters='')
# Fit on both variants so that the vocabulary contains '<eos>' as well as '<sos>'.
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

# word -> index mapping for the output side (includes the two markers)
word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

# One extra slot on top of the vocabulary for index 0, which the padding step uses.
num_words_output = len(word2idx_outputs) + 1
# Length of the longest target sentence; reused below as the padded decoder length.
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 15882
Length of longest sentence in the output: 39


#### Padding

In [33]:
# use 0's for padding, so that all sequences are the same length
from keras.preprocessing.sequence import pad_sequences

# No `padding` argument is passed here, so the zeros end up in front of the
# words (see the printed row); the decoder sequences further down are padded
# with padding='post' instead.
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[0]:", encoder_input_sequences[0])

encoder_input_sequences.shape: (10000, 39)
encoder_input_sequences[0]: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0 2422    2    1 1296]


In [34]:
# Decoder inputs (the '<sos> ...' sentences), zero-padded at the end up to max_out_len.
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
# The label now names the row that is actually printed (index 0, not 172).
print("decoder_input_sequences[0]:", decoder_input_sequences[0])

decoder_input_sequences.shape: (10000, 39)
decoder_input_sequences[172]: [   4 2983    2 1488    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


In [35]:
# Decoder targets (the '... <eos>' sentences), zero-padded at the end up to max_out_len.
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_output_sequences.shape:", decoder_output_sequences.shape)
# The label now names the row that is actually printed (index 0, not 172).
print("decoder_output_sequences[0]:", decoder_output_sequences[0])

decoder_output_sequences.shape: (10000, 39)
decoder_output_sequences[172]: [2983    2 1488    3    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


#### Word Embedding

In [36]:
# english
# use GloVe

# word -> float32 vector, read from the pre-trained GloVe file
# (100 components per word, going by the file name)
embeddings_dictionary = dict()

# `with` guarantees the file is closed even if a line fails to parse.
with open(r'glove/glove.6B.100d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        # each line holds the word followed by its vector components
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [37]:
# One row per input-vocabulary index, plus one for index 0 (used for padding).
num_words = len(word2idx_inputs)+1
embedding_size = 100
embedding_matrix = zeros(shape=(num_words, embedding_size))
for word, index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is None:
        # no GloVe entry for this word: its row stays all zeros
        continue
    embedding_matrix[index] = embedding_vector

In [38]:
print(embeddings_dictionary['why'])

[ 1.8450e-01  5.1461e-01  6.5342e-01 -4.2173e-01 -8.1430e-01  5.0029e-02
 -4.1870e-01 -4.1949e-02  4.7558e-01 -5.4651e-01  4.3974e-01  2.6532e-01
  2.1381e-01 -7.1729e-02 -1.7475e-01 -1.8682e-01 -1.2933e-01  4.7129e-01
 -6.2407e-01  5.4606e-01 -4.2295e-02 -1.1002e-01 -3.1637e-01 -6.2179e-01
 -2.4532e-02  2.5281e-01 -1.8242e-02 -8.5596e-01  9.6847e-02  3.9929e-02
 -2.7546e-02  6.8141e-01  1.8839e-01  1.2421e-02 -1.8829e-01  3.3089e-01
 -2.0723e-02  2.8868e-01  5.7478e-01 -3.4546e-01 -6.1522e-01 -1.1323e-01
  9.6484e-02 -4.9250e-01 -6.4248e-01 -2.6363e-02  3.2317e-01 -3.1298e-01
 -4.2312e-01 -9.6755e-01  4.7128e-01  1.7033e-01  1.7940e-01  6.7091e-01
 -1.6210e-01 -1.9417e+00  2.3473e-01  1.3247e-01  1.2179e+00  6.0500e-01
 -2.5138e-01  1.0135e+00 -1.2321e-01 -2.3109e-01  1.1575e+00  3.2664e-01
  6.7967e-01  6.2483e-01 -4.3403e-01  2.4201e-01  2.2149e-01 -1.1082e-01
 -2.3777e-01 -2.1992e-01  4.7086e-02  2.5577e-01  3.3871e-01 -3.9868e-01
 -6.4011e-01  1.9467e-01  3.8811e-01 -3.4189e-01 -6

#### Create Embedding Layer

In [39]:
# Encoder-side embedding layer, initialised with the GloVe-based matrix.
# NOTE(review): `trainable` is not set, so the Keras default applies — confirm
# whether the pre-trained vectors are meant to be fine-tuned during training.
embedding_layer = Embedding(num_words,
                           embedding_size,
                           weights = [embedding_matrix],
                        input_length = max_input_len)

#### Creating the Model

In [40]:
# Empty one-hot target tensor for the decoder, laid out as
# (number of sentences, output sentence length, output vocabulary size).
# NOTE(review): with the shape reported for this notebook, (10000, 39, 15883),
# that is roughly 6.2e9 float32 values (~25 GB). Training on the integer
# targets with sparse_categorical_crossentropy would avoid this tensor.
decoder_targets_one_hot = np.zeros(
    shape=(len(input_sentences), max_out_len, num_words_output),
    dtype='float32',
)

In [41]:
decoder_targets_one_hot.shape

(10000, 39, 15883)

In [42]:
# make it dense
# For every (sentence, time step) pair, switch on the entry at the index of
# the target word. Padding positions (index 0) are encoded as well, as class 0.
decoder_targets_one_hot[
    np.arange(decoder_output_sequences.shape[0])[:, None],
    np.arange(decoder_output_sequences.shape[1])[None, :],
    decoder_output_sequences,
] = 1

In [43]:
# define the encoder
# Size of the LSTM state; the decoder LSTM below is built with the same constant.
LSTM_NODES = 256

encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
# return_state=True makes the layer return its final states next to its output.
encoder = LSTM(LSTM_NODES, return_state=True)

# The two state tensors are what the decoder is initialised with.
encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [44]:
# define the decoder
decoder_inputs_placeholder = Input(shape=(max_out_len,))

# Output-side embedding; no pre-trained weights are passed here, unlike the
# encoder-side embedding layer.
decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# return_sequences=True: one output per time step, needed to predict a word at
# every position. The decoder starts from the encoder's final states.
# NOTE(review): assigning to `_` at cell level shadows IPython's last-output variable.
decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [45]:
# create dense layer
# Softmax over the whole output vocabulary (num_words_output classes); it is
# applied to the per-time-step outputs of the decoder LSTM.
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [46]:
# compile model
# Two inputs (encoder sequence, decoder input sequence), one output (the
# per-time-step word distribution).
model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)

# categorical_crossentropy matches the one-hot targets built above.
# NOTE(review): the reported accuracy also counts the padded time steps,
# because padding is encoded as class 0 in the targets.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [47]:
# train model
BATCH_SIZE = 500
EPOCHS = 20

# Teacher forcing: the decoder receives the '<sos>'-prefixed German sentence
# as input and is trained to output the '<eos>'-terminated one.
# validation_split=0.1 holds 10% of the samples out for validation.
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#### Modify for predictions

In [48]:
# Inference-time encoder: maps an input sequence directly to the LSTM states.
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

# At inference the decoder states are passed in explicitly on every call
# instead of coming from the encoder inside the same graph.
decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# One token per call (sequence length 1).
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

# The same decoder layer objects used for training (embedding, LSTM, dense)
# are called again here on the new inputs.
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

# Inputs: current token + previous states; outputs: word distribution + new states.
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

In [49]:
# Reverse lookups (index -> word) for turning predicted indices back into text.
idx2word_input = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))
idx2word_target = dict(zip(word2idx_outputs.values(), word2idx_outputs.keys()))

In [50]:
def translate_sentence(input_seq):
    """Greedily decode a translation for one padded input sequence.

    Args:
        input_seq: encoder input in the same form as the training data, i.e.
            padded word indices; callers in this notebook pass a single row
            of shape (1, max_input_len).

    Returns:
        The predicted words joined by single spaces, without the '<sos>' and
        '<eos>' markers.
    """
    states_value = encoder_model.predict(input_seq)

    # The decoder is fed one token at a time, starting with '<sos>'.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    eos = word2idx_outputs['<eos>']
    output_sentence = []

    # Hard upper bound, so decoding terminates even if '<eos>' is never predicted.
    for _ in range(max_out_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # greedy choice: most probable word at this step
        idx = np.argmax(output_tokens[0, 0, :])

        if eos == idx:
            break

        # Index 0 is the padding slot and has no word attached to it.
        if idx > 0:
            output_sentence.append(idx2word_target[idx])

        # Feed the prediction and the updated states back in for the next step.
        target_seq[0, 0] = idx
        states_value = [h, c]

    return ' '.join(output_sentence)

In [51]:
# Translate one randomly chosen training sentence. No random seed is set, so
# a different sentence is picked on every run.
i = np.random.choice(len(input_sentences))
# slicing with i:i+1 keeps the leading batch dimension
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)
print('-')
print('Input:', input_sentences[i])
print('Response:', translation)

-
Input: but as you will also be aware we are waiting for a commission communication on coherence together with the communication on development policy
Response: wir müssen wir nicht nicht die der union und die kommission und die kommission in der europäischen union


In [143]:
def clean_tests(docs):
    """Normalise raw text into one lower-case string for the input tokenizer.

    Args:
        docs: a list of text fragments, or a single string.

    Returns:
        One string: every fragment with runs of two or more whitespace
        characters collapsed to a single space and lower-cased, the fragments
        joined by single spaces.
    """
    # A bare string would otherwise be iterated character by character and
    # come back as 't h i s'; treat it as a one-element list instead.
    if isinstance(docs, str):
        docs = [docs]

    return " ".join(re.sub(r"\s{2,}", " ", fragment).lower() for fragment in docs)

In [124]:
def to_translate(text):
    """Turn raw text into a padded index sequence for the encoder model.

    The text is normalised with clean_tests, mapped to word indices by the
    fitted input tokenizer and padded to max_input_len.

    Args:
        text: raw text in the form clean_tests accepts.

    Returns:
        The padded index array produced by pad_sequences for that one text.
    """
    return pad_sequences(
        input_tokenizer.texts_to_sequences([clean_tests(text)]),
        maxlen=max_input_len,
    )

In [196]:
# Translate a sentence that is not part of the corpus. The text is wrapped in
# a one-element list because to_translate passes it on to clean_tests, which
# iterates over text fragments.
phrase = ['This is a project presentation on NLP']
test = to_translate(phrase)
translation = translate_sentence(test)

In [197]:
translation

'das ist die kommission'

In [183]:
# Same German/English split as for `de` / `en`, but over the whole corpus
# (no 10000-sentence cut), so sentences outside the training subset can be inspected.
de2 = [sent.words for sent in als]
en2 = [sent.mots for sent in als]

In [184]:
en2 = clean(en2)

In [191]:
en2[11030]

'i cannot therefore tell you whether we agreed or not on a specific action carried out by nato but it was part of an overall plan with which we do agree'

In [192]:
de2 = clean(de2)

In [193]:
de2[11030]

'deshalb kann ich ihnen nicht sagen ob wir mit einer konkreten einverstanden waren oder nicht sie gehören zu einem gesamtplan mit dem wir einverstanden sind'