## Preprocess Data

In [0]:
# Read the tab-separated sentence pairs, one pair per line (each entry keeps
# its trailing newline, exactly as the file iterator yields it).
# The encoding is pinned to UTF-8 so the accented French characters decode the
# same way on every platform instead of depending on the locale default.
with open('fra.txt', encoding='utf-8') as f:
  lines=list(f)

In [0]:
# Keep only the first 14000 lines of the file as the working dataset.
lines=lines[:14000]

In [0]:
# Split every line on tabs: field 0 is the English sentence and field 1 is its
# French translation; any further fields on the line are ignored.
fields_per_line=[raw_line.split('\t') for raw_line in lines]
english=[fields[0] for fields in fields_per_line]
french=[fields[1] for fields in fields_per_line]

In [6]:
# Peek at the first three English sentences to confirm the split worked.
english[:3]

['Go.', 'Hi.', 'Hi.']

In [7]:
# Peek at the matching first three French sentences.
french[:3]

['Va !', 'Salut !', 'Salut.']

In [0]:
# Number of (English, French) sentence pairs that were parsed.
# NOTE(review): not referenced by any later visible cell, which call len(english) instead.
num_pairs=len(english)

In [0]:
# Wrap every French sentence in the decoder's control symbols:
# '\t' marks the start of a sequence and '\n' marks its end.
processed_french=["\t{}\n".format(sentence) for sentence in french]

In [10]:
# Check that the start ('\t') and stop ('\n') symbols were attached.
processed_french[:3]

['\tVa !\n', '\tSalut !\n', '\tSalut.\n']

In [11]:
# The English inputs are used as-is: no start/stop symbols are added to them.
english[:2]

['Go.', 'Hi.']

In [0]:
# Character vocabularies: every distinct character that occurs in the English
# sentences, and in the start/stop-wrapped French sentences, respectively.
input_characters={char for sequence in english for char in sequence}
target_characters={char for sequence in processed_french for char in sequence}

In [0]:
# Turn each vocabulary into a sorted list so every character gets a stable position.
input_characters=sorted(input_characters)
target_characters=sorted(target_characters)

# Vocabulary sizes, i.e. the width of the one-hot vectors on each side.
num_encoder_tokens,num_decoder_tokens=len(input_characters),len(target_characters)

# Longest sentence (in characters) on each side; used as the padded sequence length.
max_encoder_seq_length=max(map(len,english))
max_decoder_seq_length=max(map(len,processed_french))

In [14]:
# Report the dataset size, both vocabulary sizes and both maximum sequence lengths.
summary=(
  ('Number of samples: ',len(english)),
  ('Number of unique input tokens: ',num_encoder_tokens),
  ('Number of unique target tokens: ',num_decoder_tokens),
  ('Max sequence length for inputs: ',max_encoder_seq_length),
  ('Max sequence length for targets: ',max_decoder_seq_length),
)
for label,value in summary:
  print(label,value)

Number of samples:  14000
Number of unique input tokens:  73
Number of unique target tokens:  98
Max sequence length for inputs:  17
Max sequence length for targets:  59


In [0]:
# character -> position in the sorted vocabulary (the "hot" column of its one-hot vector)
input_token_index=dict(zip(input_characters,range(len(input_characters))))
target_token_index=dict(zip(target_characters,range(len(target_characters))))

In [0]:
import numpy as np
# All-zero tensor that will hold the character-level one-hot encoding of the
# English sentences: (sentence, character position, character id).
encoder_shape=(len(english),max_encoder_seq_length,num_encoder_tokens)
encoder_input_data=np.zeros(encoder_shape,dtype='float32')

In [17]:
# Sanity check: there must be exactly one French target per English input.
len(english)==len(processed_french)

True

In [0]:
# The decoder input and the decoder target tensors share one shape:
# (sentence, character position, character id).
decoder_shape=(len(processed_french),max_decoder_seq_length,num_decoder_tokens)
# one-hot French sentences that are fed into the decoder
decoder_input_data=np.zeros(decoder_shape,dtype='float32')
# one-hot French sentences that the decoder is trained to predict
decoder_output_data=np.zeros(decoder_shape,dtype='float32')

In [0]:
# Fill the three one-hot tensors sentence by sentence.
# Padding positions are derived from the sentence lengths instead of the loop
# variable `t` left over after each inner loop: for an empty sentence that
# variable would be stale (or undefined) and the padding would start at the
# wrong position.
for i, (input_text,target_text) in enumerate(zip(english,processed_french)):
  for t, char in enumerate(input_text):
    encoder_input_data[i,t,input_token_index[char]] = 1 # filling up one hot encoded vector

  # pad the rest of the encoder sequence with spaces
  encoder_input_data[i,len(input_text):,input_token_index[' ']] = 1

  for t,char in enumerate(target_text):
    decoder_input_data[i,t,target_token_index[char]] = 1
    if t > 0 :
      # the target is the input shifted one step ahead, so it never contains
      # the start character
      decoder_output_data[i,t-1,target_token_index[char]] = 1

  # pad the rest of the decoder input sequence with spaces
  decoder_input_data[i,len(target_text):,target_token_index[' ']] = 1
  # the target holds one character fewer than the input, so its padding starts
  # one position earlier
  decoder_output_data[i,max(len(target_text)-1,0):,target_token_index[' ']] = 1

In [20]:
# One-hot matrix of the first English sentence.
encoder_input_data[0] # the last few rows are padded by spaces

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [21]:
# One-hot vector at position 1 of the first decoder input sequence, i.e. the
# character right after the '\t' start symbol.
decoder_input_data[0][1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [22]:
decoder_output_data[0][0] # the first character of the target sequence is the second
#character of the decoder input sequence (the target is shifted one step ahead)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [0]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

In [0]:
latent_dim = 256 # dimensionality of encoding space (number of units in both LSTM layers)
epochs = 100 # number of training epochs passed to model.fit
# NOTE(review): num_samples is not referenced by any visible cell; the dataset
# is cut to the first 14000 lines during preprocessing instead.
num_samples=10000 # number of samples to use for training

## Defining Model

In [0]:
#Defining ENCODER for training process
encoder_inputs=Input(shape=(None,num_encoder_tokens)) #RNNs can handle a sequence of any length
encoder=LSTM(latent_dim,return_state=True) # return state will return a list 
# where the first entry is a list of of the outputs and the rest are internal states
encoder_outputs, state_h, state_c=encoder(encoder_inputs)
#discarding encoder outputs as we only need the hidden state and internal cell state
encoder_states=[state_h,state_c] # hidden state and internal cell state

In [0]:
#Defining DECODER for training process
decoder_inputs=Input(shape=(None,num_decoder_tokens))
decoder_lstm=LSTM(latent_dim,return_sequences=True,return_state=True)
decoder_outputs, _, _=decoder_lstm(decoder_inputs,initial_state=encoder_states)

In [0]:
# Softmax layer that turns every decoder LSTM output into a probability
# distribution over the num_decoder_tokens target characters.
decoder_dense=Dense(num_decoder_tokens,activation='softmax')
decoder_outputs=decoder_dense(decoder_outputs)

In [0]:
# Training model: inputs are the encoder input and the decoder input, output is
# the per-step character distribution produced by the decoder.
model=Model([encoder_inputs,decoder_inputs],decoder_outputs) # this model is for learning mappings between
#english and french sentences

In [32]:
# Print the layers, output shapes and parameter counts of the training model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 73)]   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None, 98)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 337920      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  363520      input_3[0][0]                    
                                                                 lstm[0][1]                   

In [0]:
# categorical cross-entropy pairs with the softmax output layer and the
# one-hot encoded targets in decoder_output_data
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])

In [35]:
# Train on (encoder input, decoder input) -> decoder target;
# validation_split=0.2 holds out 20% of the samples for validation.
model.fit([encoder_input_data,decoder_input_data],decoder_output_data,epochs=epochs,validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f1e89c61ac8>

In [0]:
# Save the trained training-time model to the file 's2s.h5'.
model.save('s2s.h5')

In [0]:
# ---- Inference encoder ----
# Built from the layers of the trained model: takes a one-hot encoded English
# sequence of shape (None,num_encoder_tokens) and returns the
# [hidden state, cell state] pair that summarises it.
encoder_model=Model(encoder_inputs,encoder_states)

# ---- Inference decoder ----
# Placeholders for the states handed to the decoder LSTM; each has latent_dim
# entries, the same size as the states produced by the encoder.
decoder_state_input_h=Input(shape=(latent_dim,))
decoder_state_input_c=Input(shape=(latent_dim,))
decoder_states_inputs=[decoder_state_input_h, decoder_state_input_c]

# Run the trained decoder LSTM starting from the supplied states. Its returned
# states are kept this time so they can be fed back in on the next step.
decoder_outputs, state_h, state_c=decoder_lstm(decoder_inputs,initial_state=decoder_states_inputs)
decoder_states=[state_h,state_c]

# Softmax over the target characters: the prediction for the next character.
decoder_outputs=decoder_dense(decoder_outputs)

# inputs: [one-hot target sequence, hidden state, cell state]
# outputs: [character distribution, new hidden state, new cell state]
decoder_model=Model([decoder_inputs,*decoder_states_inputs],[decoder_outputs,*decoder_states])

# Reverse lookups (index -> character) for the English and French vocabularies.
reverse_input_char_index={index:char for char,index in input_token_index.items()}
reverse_target_char_index={index:char for char,index in target_token_index.items()}

In [0]:
# index -> character lookups, used to turn predicted indices back into text
# (the same two lookups are also built at the end of the previous cell)
reverse_input_char_index=dict(zip(input_token_index.values(),input_token_index.keys()))
reverse_target_char_index=dict(zip(target_token_index.values(),target_token_index.keys()))

In [0]:
def decode_sequence(input_seq):
  """Greedily decode an encoded input sequence into target text.

  The input is run through `encoder_model` once to obtain the initial states.
  The decoder is then stepped one character at a time, starting from the
  start symbol '\\t'; at every step the most probable character is appended
  and fed back in together with the updated states.

  Args:
    input_seq: encoded input passed straight to `encoder_model.predict`
      (the notebook passes a one-sentence slice of `encoder_input_data`).

  Returns:
    The decoded string. Decoding ends once the stop symbol '\\n' is produced
    (it is included in the result) or once the result is longer than
    `max_decoder_seq_length` characters.
  """
  def one_hot_step(token_index):
    # a batch of one sequence of length one with a single hot entry
    step=np.zeros((1,1,num_decoder_tokens))
    step[0,0,token_index]=1
    return step

  # forward pass through the encoder to get the initial decoder states
  states_value=encoder_model.predict(input_seq)
  # the first decoder input is the start symbol
  target_seq=one_hot_step(target_token_index['\t'])

  decoded_sentence=''
  while True:
    output_tokens, h, c=decoder_model.predict([target_seq]+states_value)
    # greedy choice: the most probable character at the last time step
    sampled_token_index=np.argmax(output_tokens[0,-1,:])
    sampled_char=reverse_target_char_index[sampled_token_index]
    decoded_sentence += sampled_char

    # stop on the stop symbol or when the output exceeds the longest target length
    if sampled_char=='\n' or len(decoded_sentence)> max_decoder_seq_length:
      break

    # feed the predicted character and the new states into the next step
    target_seq=one_hot_step(sampled_token_index)
    states_value=[h,c]

  return decoded_sentence

In [47]:
# Slicing with 0:1 (instead of indexing with 0) keeps the leading batch axis.
encoder_input_data[0:1]

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]], dtype=float32)

In [53]:
# Decode the first 50 training sentences and print each next to its English source.
for seq_index in range(50):
  print('*'*10)
  input_seq=encoder_input_data[seq_index:seq_index+1]# slicing keeps the leading batch dimension (a batch of one sequence)
  decoded_sentence=decode_sequence(input_seq)
  print('Input Sentence: ',english[seq_index])
  print('Decoded Sequence: ', decoded_sentence)

**********
Input Sentence:  Go.
Decoded Sequence:  Va !

**********
Input Sentence:  Hi.
Decoded Sequence:  Salut !

**********
Input Sentence:  Hi.
Decoded Sequence:  Salut !

**********
Input Sentence:  Run!
Decoded Sequence:  Cours !

**********
Input Sentence:  Run!
Decoded Sequence:  Cours !

**********
Input Sentence:  Who?
Decoded Sequence:  Qui ?

**********
Input Sentence:  Wow!
Decoded Sequence:  Ça alors !

**********
Input Sentence:  Fire!
Decoded Sequence:  Au feu !

**********
Input Sentence:  Help!
Decoded Sequence:  À l'aide !

**********
Input Sentence:  Jump.
Decoded Sequence:  Saute.

**********
Input Sentence:  Stop!
Decoded Sequence:  Stop !

**********
Input Sentence:  Stop!
Decoded Sequence:  Stop !

**********
Input Sentence:  Stop!
Decoded Sequence:  Stop !

**********
Input Sentence:  Wait!
Decoded Sequence:  Attends !

**********
Input Sentence:  Wait!
Decoded Sequence:  Attends !

**********
Input Sentence:  Go on.
Decoded Sequence:  Poursuis.

**********
In