## Basic character-level RNN for neural machine translation (NMT) from English to French
### Dataset: [Link](http://www.manythings.org/anki/)

In [0]:
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, Dense

In [0]:
# Run configuration: everything a reader might want to tune lives here.
d_path = './data/fra.txt'   # corpus file; each line is split on tabs during preprocessing
batch_size = 64             # mini-batch size handed to model.fit
epochs = 100                # NOTE: the model.fit call in this notebook passes epochs=10 literally and does not read this
hidden_dim = 256            # number of units in the encoder and decoder LSTMs
num_samples = 10000         # upper bound on the number of lines taken from the corpus

## Read and preprocess data

In [81]:
# Load the whole corpus and cut it into one string per line.
# Splitting on '\n' keeps a trailing empty string when the file ends with a newline.
with open(d_path, encoding='utf-8') as f:  # text read mode is open()'s default
    lines = f.read().split(sep='\n')

# Number of elements produced by the split.
print(len(lines))

160873


In [0]:
# Parallel lists of source / target sentences and the character sets seen in each.
inp_texts = []
target_texts = []
inp_chars = set()
target_chars = set()

# Take at most num_samples lines; the last element of `lines` is always left out
# (it is the empty string when the file ends with a newline).
for txt in lines[:min(num_samples, len(lines) - 1)]:
    # Keep only the first two tab-separated fields so that a line carrying extra
    # trailing columns (e.g. an attribution column) still parses. A line with
    # fewer than two fields still raises ValueError, as before.
    inp_txt, target_txt = txt.split('\t')[:2]
    # Character-level model: the start and end markers must each be one character.
    target_txt = '<' + target_txt + '>'
    inp_texts.append(inp_txt)
    target_texts.append(target_txt)

    # Iterating a string yields its characters, so update() adds them all at once.
    inp_chars.update(inp_txt)
    target_chars.update(target_txt)

In [0]:
# Mappings
# Character <-> index lookup tables for the encoder (source) and decoder (target) sides.
# A set has no guaranteed iteration order, so the characters are sorted first;
# this gives the same index assignment on every run.
enc_ch2ix = {ch: i for i, ch in enumerate(sorted(inp_chars))}
enc_ix2ch = {i: ch for ch, i in enc_ch2ix.items()}
dec_ch2ix = {ch: i for i, ch in enumerate(sorted(target_chars))}
dec_ix2ch = {i: ch for ch, i in dec_ch2ix.items()}

In [0]:
def create_data(texts, vocab_size, mapping, mode='post'):
    """One-hot encode a list of strings into a single zero-padded array.

    Args:
        texts: sequence of strings to encode.
        vocab_size: length of the last (one-hot) axis.
        mapping: dict from character to its index on the last axis.
        mode: 'post' writes each text starting at step 0, leaving the padding
            after it; 'pre' right-aligns each text, leaving the padding before it.

    Returns:
        Array of shape (len(texts), max_len, vocab_size), where max_len is the
        length of the longest text (0 when `texts` is empty). Padding steps are
        all zeros.

    Raises:
        ValueError: if `mode` is neither 'post' nor 'pre'.
        KeyError: if a character is not present in `mapping`.
    """
    # Reject unknown modes up front instead of silently returning all zeros.
    if mode not in ('post', 'pre'):
        raise ValueError(f"mode must be 'post' or 'pre', got {mode!r}")

    # default=0 keeps an empty `texts` from crashing max().
    max_len = max((len(txt) for txt in texts), default=0)
    data = np.zeros(shape=(len(texts), max_len, vocab_size))

    for i, txt in enumerate(texts):
        # Step at which this text starts: 0 for 'post', shifted right for 'pre'.
        offset = max_len - len(txt) if mode == 'pre' else 0
        for t, ch in enumerate(txt):
            data[i, offset + t, mapping[ch]] = 1

    return data

In [0]:
# Vocabulary sizes set the width of the one-hot axis on each side.
enc_len = len(inp_chars)
dec_len = len(target_chars)

# Source sentences are right-aligned (padding first); target sentences use the
# 'post' layout (padding last).
enc_inp = create_data(texts=inp_texts, vocab_size=enc_len, mapping=enc_ch2ix, mode='pre')
dec_inp = create_data(texts=target_texts, vocab_size=dec_len, mapping=dec_ch2ix, mode='post')

# The training target is the decoder input moved one step earlier in time;
# the final time step is left as all zeros.
dec_target = np.zeros_like(dec_inp)
dec_target[:, :-1] = dec_inp[:, 1:]

## Training model

In [0]:
# Encoder: consumes one-hot source sequences of any length (time axis is None)
# and exposes the LSTM's hidden and cell state via return_state=True.
encoder_inputs = Input(shape=(None, enc_len))
encoder = LSTM(hidden_dim, return_state=True)
encoder_outputs, encoder_state_h, encoder_state_c = encoder(encoder_inputs)
# Only the two states are passed on; encoder_outputs is not referenced again in the cells shown here.
encoder_states = [encoder_state_h, encoder_state_c]

# Decoder: a second LSTM started from the encoder states. It returns the full
# output sequence, and a softmax Dense layer maps every step to a distribution
# over the dec_len target characters. The decoder's own states are discarded here.
decoder_inputs = Input(shape=(None, dec_len))
decoder = LSTM(hidden_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(dec_len, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (source sequence, decoder input sequence) -> per-step character distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(optimizer = 'rmsprop', loss='categorical_crossentropy')

In [88]:
# Train on (source, decoder input) -> decoder target, holding out the last 20% of
# the samples for validation.
# NOTE: the epoch count is given literally; the `epochs` constant from the
# configuration cell is not used by this call.
model.fit(
    x=[enc_inp, dec_inp],
    y=dec_target,
    batch_size=batch_size,
    epochs=10,
    validation_split=0.2,
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3a0ca70ac8>

## Inference Model

In [0]:
# Inference encoder: maps a source sequence to the [h, c] states of the trained encoder LSTM.
encoder_model = Model(inputs=encoder_inputs, outputs = encoder_states)

# At inference time the decoder states are fed in explicitly on every call,
# so they get their own Input placeholders of size hidden_dim.
decoder_input_state_h = Input(shape=(hidden_dim,))
decoder_input_state_c = Input(shape=(hidden_dim,))
decoder_state_inputs = [decoder_input_state_h, decoder_input_state_c]

# Reuse the trained `decoder` LSTM and `decoder_dense` layer objects, this time keeping
# the returned states so they can be fed back in on the next call.
# NOTE: this rebinds `decoder_outputs`, which previously held the training graph's output tensor.
decoder_outputs, state_h, state_c = decoder(decoder_inputs, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Inference decoder: (decoder input, h, c) -> (character distribution, new h, new c).
decoder_model = Model(inputs = [decoder_inputs] + decoder_state_inputs, outputs = [decoder_outputs] + decoder_states)

In [0]:
def translate(txt, max_len=None):
    """Greedily decode a translation of `txt`, one character at a time.

    The text is encoded with the encoder model, then the decoder model is
    called repeatedly, each time fed the previously chosen character and the
    previous states, until the end marker '>' is predicted or the length
    limit is reached.

    Args:
        txt: source string; every character must be a key of `enc_ch2ix`.
        max_len: maximum number of characters to emit. Defaults to the
            time dimension of `dec_inp`, i.e. the longest padded target
            sequence (start/end markers included).

    Returns:
        The decoded string, without the '<' / '>' markers.

    Raises:
        KeyError: if `txt` contains a character missing from `enc_ch2ix`.
    """
    if max_len is None:
        # Bound the output by a sequence length. The earlier guard compared
        # against dec_len, which is the target vocabulary size.
        max_len = dec_inp.shape[1]

    enc_txt = create_data([txt], enc_len, enc_ch2ix, mode='pre')

    # The first decoder input is the one-hot start marker.
    target_seq = np.zeros((1, 1, dec_len))
    target_seq[0, 0, dec_ch2ix['<']] = 1

    dec_states = encoder_model.predict(enc_txt)

    decoded_chars = []
    while len(decoded_chars) < max_len:
        # One decoder step: distribution over characters plus updated states.
        dec_out, h, c = decoder_model.predict([target_seq] + dec_states)
        dec_states = [h, c]

        # Greedy choice: the most probable character at this step.
        token = np.argmax(dec_out[0, 0])
        token_ch = dec_ix2ch[token]
        if token_ch == '>':
            break

        # Record the character and feed it back as the next decoder input.
        decoded_chars.append(token_ch)
        target_seq = np.zeros((1, 1, dec_len))
        target_seq[0, 0, token] = 1

    return ''.join(decoded_chars)

In [0]:
def generate_samples(texts = inp_texts, trans_texts = target_texts, n_samples = 10):
    """Print randomly chosen sentences with their reference and predicted translation.

    Args:
        texts: source sentences to sample from.
        trans_texts: reference translations aligned index-by-index with `texts`;
            each is expected to carry one leading and one trailing marker
            character, which are stripped before printing.
        n_samples: number of indices to draw (with replacement).

    Returns:
        None. The results are printed.
    """
    # Sample from the `texts` argument itself, not from the global inp_texts,
    # so that callers passing their own sentences get those sentences back.
    N = len(texts)
    rand_ix = np.random.randint(0, N, n_samples)

    for i in rand_ix:
        txt = texts[i]
        decoded = translate(txt)
        print(f'Text: {txt}')
        print(f'Target: {trans_texts[i][1:-1]}')
        print(f'Prediction: {decoded}')
        print()

In [92]:
# Show 10 (the default n_samples) randomly drawn sentence pairs next to the model's prediction.
# No random seed is set in the cells shown here, so the drawn samples differ between runs.
generate_samples()

Text: They got it.
Target: Ils l'ont eu.
Prediction: Elles ont des noussas.

Text: Who's that boy?
Target: Qui est ce garçon ?
Prediction: Qui est eu vieux ?

Text: I serve no one.
Target: Je ne suis au service de personne.
Prediction: Je vous ai sauvées entrer.

Text: He is an actor.
Target: C'est un acteur.
Prediction: Il est en train de manger.

Text: The girls won.
Target: Les filles gagnèrent.
Prediction: Il est mon tourna.

Text: I'll need this.
Target: Je vais avoir besoin de ceci.
Prediction: Je veux le chancer.

Text: I'll marry you.
Target: Je t'épouserai.
Prediction: Je veux le trander.

Text: They're there.
Target: Ils sont là.
Prediction: Ils sont diventes.

Text: I love my home.
Target: J'adore mon chez-moi.
Prediction: J'adore les ffaires.

Text: Cows give milk.
Target: Les vaches donnent du lait.
Prediction: Les chaches conne nous ai sais de marrire.



### Because this is only a character-level RNN, it already gives reasonably good results after just 10 epochs, but it starts to overfit when trained for more than 10 epochs.