### Neural Machine Translation (NMT) of names (cyrilic ->latin) using seq2seq

In [1]:
import numpy as np
import pandas as pd

Тук ще седи кратко описание на целия flow, от preprocessing до архитектурата

In [5]:
names = []
with open("data/guessed-names.ru-en") as names_file:
    for line in names_file:
        names.append(line)

def replace_delimiter(data):
    return [x.strip().replace("|||", "$") for x in data]

names = replace_delimiter(names)
        
print(names[0])
print(names[-1])
print(len(names))


Аамир Хан $ Aamir Khan
Яя Туре $ Yaya Touré
55496


In [6]:
names_dev = names[:40000]
names_test = names[39999:-1]
print(len(names_dev))
print(len(names_test))

40000
15496


In [9]:
names_ru = [x[:x.index("$")] for x in names]
names_en = [x[x.index("$")+2:-1] for x in names]
print(len(names_ru))
print(len(names_en))
print(names_en[:5])

55496
55496
['Aamir Kha', 'Aarne Arvone', 'Aarne Pohjone', 'Aarne Salovaar', 'Aarno Ruusuvuor']


Тъй като моделът ще се опитва да предсказва t(i+1)-тата буква спрямо наличните ще е нужно да видим с какви букви ще се борави като цяло

In [28]:
input_text = names_ru
# $ for start decoding and # for end
target_text = ["$" + en + "#" for en in names_en]

print(input_text[0])
print(target_text[0])

Аамир Хан 
$Aamir Kha#


In [30]:
input_ch = sorted(list(set([char for name in input_text for char in name ])))
target_ch = sorted(list(set([char for name in target_text for char in name ])))

print(input_ch)
print(len(input_ch))
print(target_ch)
print(len(target_ch))

[' ', '!', '-', '.', 'Ё', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ы', 'Э', 'Ю', 'Я', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё', '—']
69
[' ', '#', '$', '-', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'À', 'Á', 'Ä', 'Å', 'Ç', 'È', 'É', 'Ê', 'Í', 'Ð', 'Ñ', 'Ó', 'Õ', 'Ö', 'Ø', 'Ú', 'Ü', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ú', 'û', 'ü', 'ý', 'ÿ', 'Ā', 'ā', 'ă', 'ą', 'Ć', 'ć', 'Ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ', 'Ē', 'ē', 'ĕ', 'ė', 'ę', 'ě', 'Ğ', 'ğ', 'Ģ', 'ģ', 'Ī', 'ī', 'İ', 'ı', 'Ķ', 

In [32]:
input_ch_idx = dict([(char,i) for i, char in enumerate(input_ch)])
target_ch_idx = dict([(char,i) for i, char in enumerate(target_ch)])

In [33]:
max_encoder_name_len = max([len(name) for name in input_text])
max_decoder_name_len = max([len(name) for name in target_text])

print(max_encoder_name_len)
print(max_decoder_name_len)

33
33


In [34]:
encoder_input_data = np.zeros ( (len(input_text), max_encoder_name_len, len(input_ch)) )
decoder_input_data = np.zeros ( (len(input_text), max_decoder_name_len, len(target_ch)) )
decoder_target_data = np.zeros( (len(input_text), max_decoder_name_len, len(target_ch)) )

In [35]:
encoder_input_data.shape

(55496, 33, 69)

In [36]:
decoder_input_data.shape

(55496, 33, 164)

In [37]:
decoder_target_data.shape

(55496, 33, 164)

In [38]:
for i, (input_name, target_name) in enumerate(zip(input_text, target_text)):
    for t, char in enumerate(input_name):
        encoder_input_data[i, t, input_ch_idx[char]] = 1.
    for t, char in enumerate(target_name):
        decoder_input_data[i, t, target_ch_idx[char]] = 1.
        if t > 0:
            decoder_target_data[i, t - 1, target_ch_idx[char]] = 1.

Сега започваме да сглабяме encoder-decoder архитектурата

In [40]:
import keras
from keras.models import Model
from keras.layers import Input,Dense,LSTM, Embedding

Using TensorFlow backend.


In [41]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, len(input_ch)))
encoder = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)


In [42]:
encoder_states = [state_h, state_c]

In [44]:
decoder_inputs = Input(shape=(None, len(target_ch)))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(len(target_ch), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [45]:

# `encoder_input_data` & `decoder_input_data` -> `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=64,
          epochs=5,
          validation_split=0.2)

Train on 44396 samples, validate on 11100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe9104b4080>

In [50]:
model.save('ru-en.h5')

ImportError: `save_model` requires h5py.

In [52]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


In [53]:
reverse_input_ch_idx = dict(
    (i, char) for char, i in input_ch_idx.items())
reverse_target_ch_idx= dict(
    (i, char) for char, i in target_ch_idx.items())

In [64]:
def decode_sequence(input_seq):
  
    states_value = encoder_model.predict(input_seq)

  
    target_seq = np.zeros((1, 1, len(target_ch)))
   
    target_seq[0, 0, target_ch_idx['$']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_ch_idx[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '#' or
           len(decoded_sentence) > max_decoder_name_len):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, len(target_ch)))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [66]:
for seq_index in range(100):
    # Take one sequence (part of the training test)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_text[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: Аамир Хан 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарне Арвонен 
Decoded sentence: Arthur Barri#
-
Input sentence: Аарне Похионен 
Decoded sentence: Arthur Barri#
-
Input sentence: Аарне Саловаара 
Decoded sentence: Arthur Barri#
-
Input sentence: Аарно Руусувуори 
Decoded sentence: Arthur Barri#
-
Input sentence: Аарно Юрьё-Коскинен 
Decoded sentence: Arthur Barri#
-
Input sentence: Аарон Авшаломов 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарон Александр 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарон Аппельфельд 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарон Арроусмит 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарон Барак 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарон Бёрр 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарон Буркхард 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарон Галиндо 
Decoded sentence: Adrian Balla#
-
Input sentence: Аарон Гиллеспи 
Decoded sentence: Adrian Balla#
-
Input se