In [1]:
import helper
import re
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Activation, LSTM, Dense, Bidirectional
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7576487632198033310
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 15728113774827464196
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 17265137480124615653
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3866165248
locality {
  bus_id: 1
  links {
  }
}
incarnation: 15473157447889525017
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


### Load dataset

In [3]:
# Load English data
english_sentences = helper.load_data('data/small_vocab_en')
# Load French data
french_sentences = helper.load_data('data/small_vocab_fr')

print('Dataset Loaded')

Dataset Loaded


### Normalize the dataset

In [4]:
for sentence in english_sentences:
    if re.findall(r'[A-Z]', sentence):
        print(sentence)

#All sentences are normalized

### Tokenize and create oe hot encoded vector

In [5]:
def tokenize(sentences):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    
    return np.array(tokenizer.texts_to_sequences(sentences)), tokenizer

eng_tokenized_id, en_tokenizer = tokenize(english_sentences)
fre_tokenized_id, fr_tokenizer = tokenize(french_sentences)

### Count no. of unique words

In [6]:
unique_en_count = len(en_tokenizer.word_counts)
unique_fr_count = len(fr_tokenizer.word_counts)

### Padding all sentences to same length

In [7]:
def apply_padding(sentences, length=None):
    return pad_sequences(sentences, maxlen=length, padding='post')

eng_tokenized_id = apply_padding(eng_tokenized_id, 21)
fre_tokenized_id = apply_padding(fre_tokenized_id)

In [8]:
# eng_tokenized_id = np.reshape(eng_tokenized_id, newshape=(-1, eng_tokenized_id.shape[1], 1))
fre_tokenized_id = np.reshape(fre_tokenized_id, newshape=(-1, fre_tokenized_id.shape[1], 1))

### Model

In [9]:
model = Sequential()
model.add(Embedding(unique_en_count, unique_en_count, input_shape=eng_tokenized_id.shape[1:]))
model.add(Bidirectional(LSTM(256, return_sequences=True, activation='tanh')))
model.add(Dense(16, activation='relu'))
model.add(Dense(unique_fr_count, activation='softmax'))
model.summary()
model.compile(loss=sparse_categorical_crossentropy, optimizer=Adam(0.001), metrics=['accuracy'])
model.fit(eng_tokenized_id, fre_tokenized_id, batch_size=64, epochs=10, validation_split=0.25)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 21, 199)           39601     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 21, 512)           933888    
_________________________________________________________________
dense_1 (Dense)              (None, 21, 16)            8208      
_________________________________________________________________
dense_2 (Dense)              (None, 21, 344)           5848      
Total params: 987,545
Trainable params: 987,545
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 103395 samples, validate on 34466 samples
Epoch 1/10
Epoch 2/

<keras.callbacks.History at 0x7ff0b9ed7710>

In [10]:
model.predict(eng_tokenized_id[:1]).shape

(1, 21, 344)

### Translating output to sentences

In [11]:
def id_to_text(logits=None):
    index_to_words = {id: word for word, id in fr_tokenizer.word_index.items()}
    index_to_words[0] = ""
    for i in range(logits.shape[0]):
        print("\n", i, "/ English sentence -> ", sep="")
        print("  ", english_sentences[i])
        print("\n   Actual translation ->")
        print("  ", french_sentences[i])
        print("\n   Predicted translation ->")
        print("  ", " ".join(index_to_words[prediction] for prediction in np.argmax(logits, 2)[i]))
        
        print("\n   Predicted word indexes ->")
        print("   ", end="")
        for prediction in np.argmax(logits, 2)[i]:
            print(prediction, end=" ")
        print("\n   Actual word indexes ->")
        print("  ", " ".join(np.reshape(fre_tokenized_id[i], newshape=(1, 21)).astype(str)[0]))

print("Checking some outputs ----")
id_to_text(model.predict(eng_tokenized_id[0:3]))
    

Checking some outputs ----

0/ English sentence -> 
   new jersey is sometimes quiet during autumn , and it is snowy in april .

   Actual translation ->
   new jersey est parfois calme pendant l' automne , et il est neigeux en avril .

   Predicted translation ->
   new jersey est parfois calme pendant l' automne et il est neigeux avril avril       

   Predicted word indexes ->
   35 34 1 8 67 37 11 24 6 3 1 112 50 50 0 0 0 0 0 0 0 
   Actual word indexes ->
   35 34 1 8 67 37 11 24 6 3 1 112 2 50 0 0 0 0 0 0 0

1/ English sentence -> 
   the united states is usually chilly during july , and it is usually freezing in november .

   Actual translation ->
   les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .

   Predicted translation ->
   les états unis est généralement froid en juillet et il gèle habituellement en novembre       

   Predicted word indexes ->
   4 32 31 1 12 19 2 49 6 3 95 69 2 51 0 0 0 0 0 0 0 
   Actual word indexes ->
   4 3