In [1]:
!pip install -q kaggle

In [2]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [3]:
! kaggle datasets download -d danielwillgeorge/glove6b100dtxt
! kaggle datasets download -d anindya2906/english-to-french

Downloading glove6b100dtxt.zip to /content
 94% 124M/131M [00:01<00:00, 95.1MB/s]
100% 131M/131M [00:01<00:00, 93.7MB/s]
Downloading english-to-french.zip to /content
 81% 5.00M/6.17M [00:00<00:00, 41.0MB/s]
100% 6.17M/6.17M [00:00<00:00, 39.4MB/s]


In [4]:
!unzip glove6b100dtxt.zip -d glove100
!unzip english-to-french.zip -d eng2french

Archive:  glove6b100dtxt.zip
  inflating: glove100/glove.6B.100d.txt  
Archive:  english-to-french.zip
  inflating: eng2french/_about.txt   
  inflating: eng2french/fra-eng/_about.txt  
  inflating: eng2french/fra-eng/fra.txt  
  inflating: eng2french/fra.txt      


In [32]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

In [33]:
import tensorflow.keras.backend as K

def softmax_over_time(x):
  assert(K.ndim(x) > 2)
  e = K.exp(x - K.max(x, axis=1, keepdims=True))
  s = K.sum(e, axis=1, keepdims=True)
  return e / s

In [34]:
max_seq_len = 100
max_vocab_size = 20000
epochs = 100
batch_size = 64
embedding_dim = 100
validation_split = 0.2
latent_dim = 256
latent_dim_decoder = 256
num_samples = 2500

In [35]:
word_embeddings = {}
with open(os.path.join('/content/glove100/glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word_key = values[0]
        word_vector = np.array(values[1:], dtype='float32')
        
        word_embeddings[word_key] = word_vector

In [36]:
input_txts = []
target_input_maptxts = []
target_txts = []

In [37]:
idx = 0
special = u"\u202f"
for line in open('eng2french/fra-eng/fra.txt'):

  eng, fre = line.strip().replace(special, ' ').split('\t')
  
  input_txts.append(eng)
  target_input_maptxts.append('<sos> '+fre)
  target_txts.append(fre+' <eos>')

  idx += 1
  if idx == num_samples:
    break

In [38]:
tokenizer_eng = tf.keras.preprocessing.text.Tokenizer(num_words=max_vocab_size)
tokenizer_eng.fit_on_texts(input_txts)
input_sequences = tokenizer_eng.texts_to_sequences(input_txts)

In [39]:
word2index_eng = tokenizer_eng.word_index
num_words_eng = len(word2index_eng)
print(num_words_eng)

773


In [40]:
max_input_txt_len = max(len(s) for s in input_txts)
max_input_txt_len

12

In [41]:
tokenizer_fre = tf.keras.preprocessing.text.Tokenizer(num_words=max_vocab_size, filters='')
tokenizer_fre.fit_on_texts(target_input_maptxts+target_txts)
target_sequences = tokenizer_fre.texts_to_sequences(target_txts)
target_input_map_sequences = tokenizer_fre.texts_to_sequences(target_input_maptxts)

In [42]:
max_target_txt_len = max(len(s) for s in target_txts)
max_target_txt_len

43

In [43]:
word2index_fre = tokenizer_fre.word_index
num_words_fre = len(word2index_fre)
print(num_words_fre)

1881


In [44]:
num_words_fre = min(num_words_fre+1, max_vocab_size)
num_words_eng = min(num_words_eng+1, max_vocab_size)

In [45]:
assert '<sos>' in word2index_fre
assert '<eos>' in word2index_fre

In [46]:
target_txts[:20]

['Va ! <eos>',
 'Cours ! <eos>',
 'Courez ! <eos>',
 'Au feu ! <eos>',
 "À l'aide ! <eos>",
 'Saute. <eos>',
 'Ça suffit ! <eos>',
 'Stop ! <eos>',
 'Arrête-toi ! <eos>',
 'Attends ! <eos>',
 'Attendez ! <eos>',
 'Poursuis. <eos>',
 'Continuez. <eos>',
 'Poursuivez. <eos>',
 'Je comprends. <eos>',
 "J'essaye. <eos>",
 "J'ai gagné ! <eos>",
 "Je l'ai emporté ! <eos>",
 'Oh non ! <eos>',
 'Attaque ! <eos>']

In [47]:
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_input_txt_len, padding='pre')
decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(target_input_map_sequences, maxlen=max_target_txt_len, padding='post')
decoder_targets = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, maxlen=max_target_txt_len, padding='post')

In [48]:
num_words_inputs = min(max_vocab_size, len(word2index_eng)+1)
num_words_inputs

774

In [49]:
embedding_matrix = np.zeros((num_words_inputs, embedding_dim))
for word, idx in word2index_eng.items():
    if idx < max_vocab_size:
        embedding_vector = word_embeddings.get(word)
        
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

In [50]:
print(f'Encoder Input Sequences shape: {encoder_inputs.shape}')
print(f'Decoder Input Sequences shape: {decoder_inputs.shape}')
print(f'Deocder Target Sequences shape: {decoder_targets.shape}')

Encoder Input Sequences shape: (2500, 12)
Decoder Input Sequences shape: (2500, 43)
Deocder Target Sequences shape: (2500, 43)


In [51]:
one_hot_targets = np.zeros((len(decoder_targets), max_target_txt_len, num_words_fre))
for i, target_sequence in enumerate(decoder_targets):
  for j, token_id in enumerate(target_sequence):
      if token_id > 0:
        one_hot_targets[i, j, token_id] = 1

one_hot_targets.shape

(2500, 43, 1882)

In [60]:
embedding_layer = tf.keras.layers.Embedding(input_dim=num_words_eng, output_dim=embedding_dim, weights=[embedding_matrix] ,input_length=max_input_txt_len)

input_ = tf.keras.Input(shape=(max_input_txt_len,))
x = embedding_layer(input_)

encoder_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(latent_dim, return_state=True))
encoder_output= encoder_lstm(x)

decoder_input_ = tf.keras.Input(shape=[max_target_txt_len,])
decoder_embedding = tf.keras.layers.Embedding(input_dim=num_words_fre, output_dim=embedding_dim)
decoder_x = decoder_embedding(decoder_input_)

In [61]:
attention_repeat_layer = tf.keras.layers.RepeatVector(max_input_txt_len)
attention_concatenate_layer = tf.keras.layers.Concatenate(axis=-1)
attention_dense_1 = tf.keras.layers.Dense(10, activation='tanh')
attention_dense_2 = tf.keras.layers.Dense(1, activation=softmax_over_time)
attention_dot = tf.keras.layers.Dot(axes=1)

In [62]:
def one_step_attention(h, st_1):

  st_1 = attention_repeat_layer(st_1)
  x = attention_concatenate_layer([h, st_1])
  x = attention_dense_1(x)
  alphas = attention_dense_2(x)
  contexts = attention_dot([h, alphas])

  return contexts

In [63]:
decoder_lstm = tf.keras.layers.LSTM(latent_dim_decoder, return_state=True, return_sequences=True)
decoder_dense = tf.keras.layers.Dense(units=num_words_fre, activation='softmax')

initial_s = tf.keras.Input(shape=(latent_dim_decoder,))
initial_c = tf.keras.Input(shape=(latent_dim_decoder,))

context_last_word_concat_layer = tf.keras.layers.Concatenate(axis=2)

In [64]:
s = initial_s
c = initial_c

outputs = []
for t in range(max_target_txt_len):
  context = one_step_attention(encoder_output, s)
  selector = tf.keras.layers.Lambda(lambda x:x[:, t:t+1])
  x_t = selector(decoder_x)
  decoder_lstm_input = context_last_word_concat_layer([context, x_t])
  o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s,c])

  decoder_ouput = decoder_dense(o)
  outputs.append(decoder_output)

ValueError: ignored

In [58]:
def stack_and_transpose(x):
  x = K.stack(x)
  x = K.permute_dimensions(x, pattern=(1,0,2))
  return x

In [59]:
stacker = tf.keras.layers.Lambda(stack_and_transpose)
outputs = stacker(outputs)

model = tf.keras.models.Model(inputs=[input_, decoder_input_, initial_s, initial_c], outputs=outputs)

InvalidArgumentError: ignored

In [None]:
z = np.zeros(shape=(len(encoder_inputs), latent_dim_decoder ))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x=[encoder_inputs, decoder_inputs,z,z], y=one_hot_targets, batch_size=batch_size, epochs=epochs, validation_split=validation_split)

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.show()

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.show()

In [None]:
encoder_context_model = tf.keras.Model(input_, encoder_states)

encoder_outputs_as_input = tf.keras.Input(shape=(max_input_txt_len, 2*latent_dim_decoder))
context_vector = one_step_attention(encoder_outputs_as_input, initial_s)

decoder_input_single = tf.keras.Input(shape=(1,))
decoder_single_x = decoder_embedding(decoder_input_single)
decoder_lstm_input = context_last_word_concat_layer([context_vector, decoder_single_x])

o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[initial_s, initial_c])
decoder_outputs = decoder_dense(o)

predict_model = tf.keras.models.Model(inputs=[encoder_outputs_as_input, decoder_input_single, initial_s, initial_c], outputs=decoder_outputs)

In [None]:
index2word_eng = {v:k for k,v in word2index_eng.items()}
index2word_fre = {v:k for k,v in word2index_fre.items()}

In [None]:
def decode_english_to_french(eng_seq):

  final_seq = []
  encoder_context_states = encoder_context_model.predict(eng_seq)

  target_seq = np.array([[word2index_fre['<sos>']]])
  eos = word2index_fre['<eos>']

  s = np.zeros((1, latent_dim_decoder))
  c = np.zeros((1, latent_dim_decoder))

  for _ in range(max_target_txt_len):
    decoder_o, s, c = predict_model.predict(encoder_context_states, target_seq, s, c)

    fre_word = np.argmax(decoder_o.flatten())
    if fre_word == eos:
      break
    
    if fre_word > 0:
      final_seq.append(index2word_fre.get(fre_word))
      
    target_seq[0,0] = fre_word
    encoder_context_states = [h, c]

  return ' '.join(final_seq)

In [None]:
while True:
  # Do some test translations
  i = np.random.choice(len(input_txts))
  input_seq = encoder_inputs[i:i+1]
  translation = decode_english_to_french(input_seq)
  print('-')
  print('Input:', input_txts[i])
  print('Translation:', translation)
  print('Actual translation:', target_sequences[i])

  ans = input("Continue? [Y/n]")
  if ans and ans.lower().startswith('n'):
    break