In [None]:
# Directory prefix that is joined by plain string concatenation with the
# dataset, pickle and checkpoint file names used later in this notebook,
# which is why it ends with a slash.
project_path = "machine_translation/"

In [None]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,ReduceLROnPlateau

## Load Data

In [None]:
# Read the parallel corpus from english_telugu_data.txt.
# Every line holds "<english>++++$++++<telugu>".
separator = "++++$++++"
english_sentances = []
telugu_sentances = []
with open(project_path+"english_telugu_data.txt", mode='rt', encoding='utf-8') as fp:
    # Iterate the file object lazily instead of materialising every line
    # at once with readlines().
    for line_no, line in enumerate(fp, start=1):
        # Drop the line terminator so it does not end up inside the Telugu text.
        line = line.rstrip("\n")
        if not line:
            # Tolerate blank lines rather than failing with an IndexError.
            continue
        eng_tel = line.split(separator)
        if len(eng_tel) < 2:
            raise ValueError(
                "line {}: separator {!r} not found in {!r}".format(line_no, separator, line))
        english_sentances.append(eng_tel[0])
        telugu_sentances.append(eng_tel[1])

In [None]:
# Put the two parallel sentence lists side by side in one frame.
data = pd.DataFrame(
    {
        "english_sentances": english_sentances,
        "telugu_sentances": telugu_sentances,
    }
)

In [None]:
# Preview the first rows of the raw (not yet cleaned) sentence pairs.
data.head()

Unnamed: 0,english_sentances,telugu_sentances
0,His legs are long.,అతని కాళ్ళు పొడవుగా ఉన్నాయి.\n
1,Who taught Tom how to speak French?,టామ్ ఫ్రెంచ్ మాట్లాడటం ఎలా నేర్పించారు?\n
2,I swim in the sea every day.,నేను ప్రతి రోజు సముద్రంలో ఈత కొడతాను.\n
3,Tom popped into the supermarket on his way hom...,టామ్ కొంచెం పాలు కొనడానికి ఇంటికి వెళ్ళేటప్పుడ...
4,Smoke filled the room.,పొగ గదిని నింపింది.\n


In [None]:
# (rows, columns) of the raw frame.
data.shape

(155798, 2)

## Text Pre-Processing

In [None]:
# English contraction -> expanded form, looked up word by word in clean_eng.
# clean_eng lowercases the text before the lookup, so the capitalised "I..."
# keys cannot match there; their lowercase "i..." twins are the ones that apply.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [None]:
# clean english sentences
def clean_eng(text):
    """Normalise one English sentence before it is tokenised.

    The text is lowercased, contractions are expanded through
    ``contraction_mapping``, ASCII punctuation (apostrophes included) and
    ASCII digits are removed, and every run of whitespace is collapsed to a
    single space.

    Args:
        text: raw sentence (str).

    Returns:
        The cleaned sentence: words separated by exactly one space, with no
        leading or trailing whitespace.
    """
    text = text.lower()

    expanded = []
    for word in text.split():
        # A contraction at the end of a clause arrives as e.g. "don't." and
        # misses the exact lookup. Retry without the trailing punctuation so
        # it is expanded instead of degrading to "dont" once the apostrophe
        # is stripped below.
        bare = word.rstrip(string.punctuation)
        expanded.append(contraction_mapping.get(word, contraction_mapping.get(bare, word)))
    text = ' '.join(expanded)

    # string.punctuation already contains the apostrophe, so a single
    # translate pass removes quotes, special characters and digits together.
    text = text.translate(str.maketrans('', '', string.punctuation + digits))

    # Deleting characters can leave doubled spaces behind; collapse them so
    # that splitting on ' ' and splitting on whitespace agree.
    return ' '.join(text.split())

In [None]:
# clean telugu sentences
def clean_tel(text):
    """Normalise one Telugu sentence and wrap it in decoder markers.

    The text is lowercased (this only affects embedded Latin letters), ASCII
    punctuation, ASCII digits and Telugu digits are removed, every run of
    whitespace is collapsed to a single space, and the result is wrapped as
    ``'START_ <text> _END'``.

    Args:
        text: raw sentence (str).

    Returns:
        The cleaned sentence surrounded by the START_ / _END tokens.
    """
    text = text.lower()

    # One translate pass removes quotes and other punctuation, ASCII digits
    # and the Telugu digits.
    unwanted = string.punctuation + digits + '౦౧౨౩౪౫౬౭౮౯'
    text = text.translate(str.maketrans('', '', unwanted))

    # Deleting characters can leave doubled spaces behind; collapse them so
    # that splitting on ' ' and splitting on whitespace agree.
    text = ' '.join(text.split())

    return 'START_ ' + text + ' _END'

In [None]:
# Clean both columns on a copy, so the raw `data` frame stays as loaded.
data_df = data.copy()
for column, cleaner in (("english_sentances", clean_eng),
                        ("telugu_sentances", clean_tel)):
    data_df[column] = data_df[column].map(cleaner)

In [None]:
# English vocabulary: every distinct whitespace-separated token.
all_eng_words = set()
for eng in data_df.english_sentances:
    all_eng_words.update(eng.split())

# Telugu vocabulary, collected the same way.
all_telugu_words = set()
for tel in data_df.telugu_sentances:
    all_telugu_words.update(tel.split())

In [None]:
# Max Length of source sequence
# Tokens are counted with split(), the same tokenisation used to build the
# vocabulary and to fill the batches. split(' ') would also count the empty
# strings between consecutive spaces and overstate the length.
lenght_list = [len(sentence.split()) for sentence in data_df.english_sentances]
max_length_src = int(np.max(lenght_list))
max_length_src

101

In [None]:
# Max Length of target sequence
# Tokens are counted with split(), the same tokenisation used to build the
# vocabulary and to fill the batches. split(' ') would also count the empty
# strings between consecutive spaces and overstate the length.
lenght_list = [len(sentence.split()) for sentence in data_df.telugu_sentances]
max_length_tar = int(np.max(lenght_list))
max_length_tar

30

In [None]:
# Sorting fixes the position of every word, which the id tables rely on.
input_words = sorted(all_eng_words)
target_words = sorted(all_telugu_words)
num_encoder_tokens, num_decoder_tokens = len(input_words), len(target_words)
num_encoder_tokens, num_decoder_tokens

(13909, 38724)

In [None]:
# One extra slot for the zero-padding index.
# Computed from the vocabulary rather than with `+= 1`, so re-running this
# cell cannot keep inflating the count.
num_decoder_tokens = len(all_telugu_words) + 1
num_decoder_tokens

38725

In [None]:
# word -> integer id; ids start at 1, leaving 0 unused by any word.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

In [None]:
# Inverse lookups: integer id -> word.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

In [None]:
# Shuffle with a fixed seed so the row order is the same on every
# Restart & Run All.
data_df = shuffle(data_df, random_state=42)
data_df.head(10)

Unnamed: 0,english_sentances,telugu_sentances
125402,we are not cowards,START_ మేము పిరికివాళ్ళు కాదు _END
133209,she decided to marry him even though her paren...,START_ ఆమె తల్లిదండ్రులు ఆమెను కోరుకోనప్పటికీ ...
153771,this is so cool,START_ ఇది చాలా బాగుంది _END
148623,she is not a good person,START_ ఆమె మంచి వ్యక్తి కాదు _END
54287,tom will not catch us,START_ టామ్ మమ్మల్ని పట్టుకోడు _END
18169,give me the coordinates,START_ నాకు కోఆర్డినేట్లు ఇవ్వండి _END
119299,your plan failed,START_ మీ ప్రణాళిక విఫలమైంది _END
61905,there were candles everywhere,START_ ప్రతిచోటా కొవ్వొత్తులు ఉన్నాయి _END
74122,i assure you tom will be perfectly safe,START_ టామ్ సంపూర్ణంగా సురక్షితంగా ఉంటాడని నేన...
60761,it looks like tom is having fun,START_ టామ్ సరదాగా ఉన్నట్లు కనిపిస్తోంది _END


In [None]:
# Train - Test Split
# A fixed random_state makes the 90/10 partition repeatable across runs.
X, y = data_df.english_sentances, data_df.telugu_sentances
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)
X_train.shape, X_test.shape

((140218,), (15580,))

Save the train and test splits so that the results can be reproduced later; the rows are shuffled randomly, so the split cannot otherwise be recovered.

In [None]:
# Persist the inputs and the targets of both splits; the inputs alone are not
# enough to re-evaluate the model later.
X_train.to_pickle(project_path+'X_train.pkl')
X_test.to_pickle(project_path+'X_test.pkl')
y_train.to_pickle(project_path+'y_train.pkl')
y_test.to_pickle(project_path+'y_test.pkl')

In [None]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Endlessly yield training batches for the encoder-decoder model.

    Each pass walks over X / y in slices of `batch_size` and starts again
    from the beginning when the data is exhausted.

    Args:
        X: source sentences (sliceable sequence of cleaned strings).
        y: target sentences aligned with X, wrapped in START_ / _END.
        batch_size: maximum number of sentence pairs per batch.

    Yields:
        ([encoder_input_data, decoder_input_data], decoder_target_data) where
        the inputs hold token ids padded with 0 and the target is the one-hot
        encoded decoder sequence shifted one step ahead of the decoder input.
    '''
    while True:
        for j in range(0, len(X), batch_size):
            batch_inputs = X[j:j+batch_size]
            batch_targets = y[j:j+batch_size]
            # Size the arrays to the rows actually present. The last slice of
            # a pass is usually shorter than batch_size, and padding it up
            # would feed all-zero phantom samples to the model.
            n_rows = len(batch_inputs)
            encoder_input_data = np.zeros((n_rows, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((n_rows, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((n_rows, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(batch_inputs, batch_targets)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                # Tokenise the target once instead of once per token.
                target_tokens = target_text.split()
                last_position = len(target_tokens) - 1
                for t, word in enumerate(target_tokens):
                    token_id = target_token_index[word]
                    if t < last_position:
                        # decoder input seq: everything except the final token
                        decoder_input_data[i, t] = token_id
                    if t > 0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, token_id] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

## Encoder - Decoder Model Architecture

![encoder_decoder.png](https://github.com/scionoftech/Neural_Machine_Translation_English_Telugu/blob/master/encoder_decoder.png?raw=1)

In [None]:
# Size used for both the embedding vectors and the LSTM state, in the encoder
# and in the decoder.
latent_dim = 50

In [None]:
# Encoder
encoder_inputs = Input(shape=(None,))
# Word ids run from 1 to num_encoder_tokens and 0 is the padding value, so the
# embedding table needs num_encoder_tokens + 1 rows. With only
# num_encoder_tokens rows the highest word id would be out of range.
enc_emb =  Embedding(num_encoder_tokens + 1, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [None]:
# Decoder: reads the target sequence, starting from the encoder's final states.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(input_dim=num_decoder_tokens, output_dim=latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
# The LSTM returns the full output sequence plus its final states. Training
# only needs the sequence (element 0); the states are used when the same
# layer is reused for step-by-step inference.
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs = decoder_lstm(dec_emb, initial_state=encoder_states)[0]
# Softmax over the target vocabulary at every time step.
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model:
# [`encoder_input_data`, `decoder_input_data`] -> `decoder_target_data`
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

In [None]:
# Configure optimiser, loss and the accuracy metric for training.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# from IPython.display import Image
# Image(retina=True, filename='train_model.png')

In [None]:
batch_size = 128
epochs = 30
# Ceiling division: generate_batch also yields the final, shorter slice of
# every pass over the data. Floor division would leave that slice out of the
# epoch, and each later epoch would start one batch further into the data.
train_samples_steps = (len(X_train) + batch_size - 1) // batch_size
val_samples_steps = (len(X_test) + batch_size - 1) // batch_size

In [None]:
# Endless batch generators for the training and the validation split.
train_gen = generate_batch(X=X_train, y=y_train, batch_size=batch_size)
test_gen = generate_batch(X=X_test, y=y_test, batch_size=batch_size)

In [None]:
# Save the model after every epoch in which the validation loss improves.
filepath = project_path+'NMT_model_enc_dec.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Cut the learning rate to 20% whenever the validation loss plateaus.
# min_lr has to be below the optimiser's starting rate ('rmsprop' defaults to
# 0.001); with min_lr = 0.001 the rate could never be lowered and this
# callback would do nothing.
reduce_alpha = ReduceLROnPlateau(monitor ='val_loss', factor = 0.2,patience = 1, min_lr = 1e-5)
# Stop training once the validation loss has not improved for 2 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)
callbacks = [checkpoint, es, reduce_alpha]

In [None]:
# Train from the generators; the validation generator is evaluated after
# every epoch and drives the callbacks.
model.fit_generator(
    generator=train_gen,
    steps_per_epoch=train_samples_steps,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=test_gen,
    validation_steps=val_samples_steps,
)

Epoch 1/30
 121/1095 [==>...........................] - ETA: 6:10 - loss: 0.9905 - acc: 0.2983
Epoch 00001: val_loss improved from inf to 0.99052, saving model to /content/drive/My Drive/DLCP/openwork/machine_translation/NMT_model_enc_dec.h5
Epoch 2/30
 121/1095 [==>...........................] - ETA: 6:07 - loss: 0.8931 - acc: 0.3642
Epoch 00002: val_loss improved from 0.99052 to 0.89315, saving model to /content/drive/My Drive/DLCP/openwork/machine_translation/NMT_model_enc_dec.h5
Epoch 3/30
 121/1095 [==>...........................] - ETA: 6:04 - loss: 0.8322 - acc: 0.4001
Epoch 00003: val_loss improved from 0.89315 to 0.83222, saving model to /content/drive/My Drive/DLCP/openwork/machine_translation/NMT_model_enc_dec.h5
Epoch 4/30
 121/1095 [==>...........................] - ETA: 6:06 - loss: 0.7822 - acc: 0.4357
Epoch 00004: val_loss improved from 0.83222 to 0.78216, saving model to /content/drive/My Drive/DLCP/openwork/machine_translation/NMT_model_enc_dec.h5
Epoch 5/30
 121/1095

<tensorflow.python.keras.callbacks.History at 0x7f1d0cf76f98>

In [None]:
# Inference encoder: source sentence -> final LSTM states ("thought vectors").
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference decoder. It is driven one step at a time, so the states of the
# previous time step are fed in through explicit inputs.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained embedding, LSTM and softmax layers on the decoder input.
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2,
    initial_state=decoder_states_inputs,
)
decoder_states2 = [state_h2, state_c2]
# Probability distribution over the target vocabulary.
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model: (token, previous states) -> (distribution, new states)
decoder_model = Model(
    inputs=[decoder_inputs] + decoder_states_inputs,
    outputs=[decoder_outputs2] + decoder_states2,
)

## Decode sample sequences

In [None]:
def decode_sequence(input_seq, max_decoded_tokens=None):
    """Greedily decode the translation of one encoded source sentence.

    Args:
        input_seq: encoder input for a single sentence (batch of size 1).
        max_decoded_tokens: upper bound on the number of generated tokens.
            Defaults to ``max_length_tar``.

    Returns:
        The generated tokens as one string, each token preceded by a space.
        The string ends with ' _END' when the decoder emitted the end marker,
        and without it when the token limit stopped the loop.
    """
    if max_decoded_tokens is None:
        max_decoded_tokens = max_length_tar

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Start the target sequence (length 1) with the start token.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of size 1. The limit counts tokens, not
    # characters, so long words do not cut the translation short.
    decoded_tokens = []
    while len(decoded_tokens) < max_decoded_tokens:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable token of the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_char_index.get(sampled_token_index)
        if sampled_word is None:
            # Index 0 (padding) has no word; stop instead of raising KeyError.
            break
        decoded_tokens.append(sampled_word)

        if sampled_word == '_END':
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + token for token in decoded_tokens)

In [None]:
# Translate the first ten test sentences one at a time.
# NOTE: this rebinds `test_gen` to a batch-size-1 generator; re-run the cell
# that creates the generators before training again.
test_gen = generate_batch(X_test, y_test, batch_size = 1)
for k in range(10):
    (input_seq, actual_output), _ = next(test_gen)
    decoded_sentence = decode_sequence(input_seq)
    # Cut the end marker only when it is really there. If decoding stopped on
    # the length limit, a blind [:-4] would chop real characters.
    if decoded_sentence.endswith('_END'):
        predicted = decoded_sentence[:-4]
    else:
        predicted = decoded_sentence
    print('Input English sentence:', X_test[k:k+1].values[0])
    # [6:-4] removes the leading 'START_' and the trailing '_END'.
    print('Actual Telugu Translation:', y_test[k:k+1].values[0][6:-4])
    print('Predicted Telugu Translation:', predicted)
    print("\n")

Input English sentence: count to thirty
Actual Telugu Translation:  ముప్పైకి లెక్కించండి 
Predicted Telugu Translation:  ముప్పై సంవత్సరాలు 


Input English sentence: tom is trying to confuse you
Actual Telugu Translation:  టామ్ మిమ్మల్ని కలవరపెట్టడానికి ప్రయత్నిస్తున్నాడు 
Predicted Telugu Translation:  టామ్ మిమ్మల్ని మీకు ఒక నిమిషాలు ఉన్నాడు 


Input English sentence: i would like to send a telegram
Actual Telugu Translation:  నేను టెలిగ్రామ్ పంపాలనుకుంటున్నాను 
Predicted Telugu Translation:  నేను ఒక విధంగా మేరీ నాకు ఇష్టమైన నుండి ఒక రెండు 


Input English sentence: i cannot remember my password
Actual Telugu Translation:  నా పాస్‌వర్డ్ నాకు గుర్తులేదు 
Predicted Telugu Translation:  నా పేరు నాకు గుర్తులేదు 


Input English sentence: i do only what i want to do
Actual Telugu Translation:  నేను చేయాలనుకున్నది మాత్రమే చేస్తాను 
Predicted Telugu Translation:  నేను ఏమి చేయాలో నాకు ఇష్టం 


Input English sentence: tom might cry
Actual Telugu Translation:  టామ్ ఏడుపు ఉండవచ్చు 
Predicted Tel