<a href="https://colab.research.google.com/github/rainmaker29/Machine_Translation/blob/master/machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import collections
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU,LSTM,Input,Dense,TimeDistributed,Activation,RepeatVector,Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy


Using TensorFlow backend.


In [2]:
# Print every compute device TensorFlow reports for this runtime, so the
# cell output shows whether a GPU is available before training.
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13888284727533729026
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 13625631050519182016
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 16632098643805252552
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7304675328
locality {
  bus_id: 1
  links {
  }
}
incarnation: 443494970392580986
physical_device_desc: "device: 0, name: Tesla P4, pci bus id: 0000:00:04.0, compute capability: 6.1"
]


In [0]:
import os

def load_data(path):
  """Read a text file and return its content as a list of lines.

  The file is decoded explicitly as UTF-8 so that accented characters are
  read the same way on every platform instead of depending on the locale's
  default encoding.

  Args:
    path: Location of the text file to read.

  Returns:
    A list of strings obtained by splitting the file content on line-feed
    characters. If the file ends with a line feed, the last element is an
    empty string.
  """
  with open(path, "r", encoding="utf-8") as f:
    return f.read().split("\n")

In [0]:
# Locations of the parallel corpora on the mounted Google Drive.
english_corpus_path = "/content/drive/My Drive/machine_translation/small_vocab_en"
french_corpus_path = "/content/drive/My Drive/machine_translation/small_vocab_fr"

english_sentences = load_data(english_corpus_path)
french_sentences = load_data(french_corpus_path)


In [5]:
# Show the first sentence of each corpus as a quick sanity check.
for corpus in (english_sentences, french_sentences):
  print(corpus[0])

new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


In [6]:
# Flatten each corpus into a single list of whitespace-separated tokens.
english_words = [word for sentence in english_sentences
                                     for word in sentence.split()]

french_words = [word for sentence in french_sentences
                                     for word in sentence.split()]

# Token -> number of occurrences, per language.
number_of_eng = collections.Counter(english_words)
number_of_fr = collections.Counter(french_words)

print(f"English words {len(english_words)}")
print(f"French words {len(french_words)}")

print(f"Unique english words {len(number_of_eng)}")
# This line reports the French vocabulary; it was previously labelled "english".
print(f"Unique french words {len(number_of_fr)}")

# Keep only the words from the (word, count) pairs. Unlike zip(*pairs)[0],
# this also works when a counter is empty.
print(f"10 most common english : {tuple(word for word, _ in number_of_eng.most_common(10))}")
print(f"10 most common french: {tuple(word for word, _ in number_of_fr.most_common(10))}")


English words 1823250
French words 1961295
Unique english words 227
Unique english words 355
10 most common english : ('is', ',', '.', 'in', 'it', 'during', 'the', 'but', 'and', 'sometimes')
10 most common french: ('est', '.', ',', 'en', 'il', 'les', 'mais', 'et', 'la', 'parfois')


In [0]:
def tokenize(x):
  """Fit a new Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

  Args:
    x: The texts to fit on and to encode.

  Returns:
    A ``(sequences, tokenizer)`` tuple: the result of
    ``texts_to_sequences(x)`` and the fitted tokenizer that produced it.
  """
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(x)
  sequences = tokenizer.texts_to_sequences(x)
  return sequences, tokenizer

In [0]:
def pad(x,length=None):
  """Pad every sequence in ``x`` at the end so all share one length.

  Args:
    x: Sequences to pad.
    length: Target length. When falsy (``None`` or ``0``), the length of the
      longest sequence in ``x`` is used.

  Returns:
    The result of Keras ``pad_sequences`` with ``padding='post'``.
  """
  target_length = length or max(len(sequence) for sequence in x)
  return pad_sequences(x, maxlen=target_length, padding='post')


In [0]:
def preprocess(x, y):
  """Tokenize and pad the source texts ``x`` and the target texts ``y``.

  Args:
    x: Source-language texts.
    y: Target-language texts.

  Returns:
    A ``(padded_x, padded_y, x_tk, y_tk)`` tuple, where ``padded_y`` carries
    an extra trailing axis of size 1 and ``x_tk`` / ``y_tk`` are the
    tokenizers fitted on ``x`` and ``y``.
  """
  x_ids, x_tk = tokenize(x)
  y_ids, y_tk = tokenize(y)

  padded_x = pad(x_ids)

  # Keras's sparse_categorical_crossentropy function requires the labels to
  # be in 3 dimensions, so a trailing axis of size 1 is appended.
  padded_y = pad(y_ids)[..., None]

  return padded_x, padded_y, x_tk, y_tk

In [10]:
# Tokenize and pad both corpora, keeping the fitted tokenizers.
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(english_sentences, french_sentences)

# Axis 1 of each padded array is the padded sequence length.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]

# Number of distinct words known to each tokenizer.
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
for label, value in (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max French sentence length:", max_french_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
):
  print(label, value)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


In [0]:
def logits_to_text(logits, tokenizer):
  """Turn per-step scores into a space-separated string of words.

  Args:
    logits: 2-D array-like of scores; the arg-max is taken along axis 1, so
      each row yields one word id.
    tokenizer: Object whose ``word_index`` maps words to integer ids.

  Returns:
    The words for the highest-scoring id of each row joined by single
    spaces; id 0 is rendered as ``'<PAD>'``.
  """
  id_to_word = {}
  for word, word_id in tokenizer.word_index.items():
    id_to_word[word_id] = word
  # Id 0 always renders as the padding marker (set last so it takes priority).
  id_to_word[0] = '<PAD>'

  best_ids = np.argmax(logits, 1)
  return ' '.join(id_to_word[best_id] for best_id in best_ids)

In [0]:
def network(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  """Build and compile the encoder-decoder translation model.

  Args:
    input_shape: Shape of the source data; only ``input_shape[1]`` (the
      source sequence length) is read.
    output_sequence_length: Number of time steps the decoder produces.
    english_vocab_size: ``input_dim`` of the embedding layer.
    french_vocab_size: Number of softmax units at every output time step.

  Returns:
    A Keras ``Model`` compiled with sparse categorical cross-entropy loss,
    the Adam optimizer (learning rate 0.005) and an accuracy metric.
  """
  source_length = input_shape[1]
  source_ids = Input(shape=(source_length,))
  embedding = Embedding(input_dim=english_vocab_size,
                        output_dim=100,
                        input_length=source_length)
  embedded = embedding(source_ids)  # (max_length, 100)

  # Encoder: a bidirectional GRU summarises the sentence into one vector,
  # which is projected to 128 units and repeated once per output time step.
  encoder_summary = Bidirectional(GRU(512))(embedded)
  encoder_projection = Dense(128, activation='relu')(encoder_summary)
  decoder_input = RepeatVector(output_sequence_length)(encoder_projection)

  # Decoder: a bidirectional GRU emits one state per step, and a shared
  # softmax layer scores the target vocabulary at every step.
  decoder_states = Bidirectional(GRU(512, return_sequences=True))(decoder_input)
  word_probabilities = TimeDistributed(
      Dense(french_vocab_size, activation='softmax'))(decoder_states)

  translator = Model(inputs=source_ids, outputs=word_probabilities)
  translator.compile(loss=sparse_categorical_crossentropy,
                     optimizer=Adam(0.005),
                     metrics=['accuracy'])
  return translator

In [13]:
def final_predictions(x, y, x_tk, y_tk):
    """Train the translation model and print two sample translations.

    Args:
        x: Padded source-language id sequences; ``x.shape[-1]`` is used as
            the padded source length.
        y: Padded target-language ids; ``y.shape[1]`` is used as the output
            sequence length and each step of ``y[0]`` is read with ``np.max``.
        x_tk: Source tokenizer exposing ``word_index``.
        y_tk: Target tokenizer exposing ``word_index``.

    Raises:
        KeyError: If a word of the hard-coded sample sentence is missing
            from ``x_tk.word_index``.
    """
    # Word ids start at 1 and id 0 is the padding id, so each vocabulary
    # needs len(word_index) + 1 slots. The target size was previously passed
    # without the + 1, leaving no output unit for the highest target id.
    source_vocab_size = len(x_tk.word_index) + 1
    target_vocab_size = len(y_tk.word_index) + 1

    model = network(x.shape, y.shape[1], source_vocab_size, target_vocab_size)
    model.fit(x, y, batch_size=1024, epochs=25, validation_split=0.2)

    # Reverse lookup used to turn predicted ids back into words.
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    # Encode a hand-written sentence the same way as the training data.
    sentence = 'he saw a old yellow truck'
    sentence = [x_tk.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')

    # Predict the hand-written sentence and the first training sentence together.
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[1]]))
    # Reference translation: the target ids of the first training pair.
    print(' '.join([y_id_to_word[np.max(step)] for step in y[0]]))


final_predictions(preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 110288 samples, validate on 27573 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Sample 1:
il a vu un vieux camion jaune <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Il a vu un vieux camion jaune
Sample 2:
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
