# Homework 4: Build A Seq2Seq Model For Machine Translation
### Name: Ravi Patel | CWID: 10432313 | Date: 4/29/2019
### Task: Translate English to Hindi

## 1. Data Preparation
### 1.1 Load And Clean Text

In [0]:
import re, string, numpy
from unicodedata import normalize

def load_doc(filename):
    '''
        Load a whole text document into memory.

        filename: path of a UTF-8 encoded text file.
        return: the complete file content as one string.
    '''
    # the context manager closes the file even when read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

def to_pairs(doc):
    '''
        Split a loaded document into sentence pairs.

        doc: text with one pair per line, the fields separated by tabs.
        return: list with one inner list of fields per non-blank line.
    '''
    # split on real newlines (not on the letter 'n')
    lines = doc.strip().split('\n')
    # blank lines are skipped so they do not turn into one-element "pairs"
    pairs = [line.split('\t') for line in lines if line.strip()]
    return pairs

def clean_data(lines):
    '''
        Clean every sentence of a list of sentence pairs.

        Each sentence is folded to ASCII, split on whitespace, lower-cased,
        stripped of punctuation and non-printable characters, reduced to its
        purely alphabetic words and re-joined with single spaces.

        lines: iterable of pairs (sequences of strings), e.g. from to_pairs.
        return: numpy array of strings with one row per pair.

        NOTE(review): the ASCII folding below drops every non-ASCII
        character, so sentences written in a non-Latin script come out
        empty -- confirm this is acceptable for the target language.
    '''
    cleaned = list()
    # matches every character that is not in string.printable
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes all ASCII punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # decompose accented characters, then drop whatever is not ASCII
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            words = line.split()
            words = [word.lower() for word in words]
            words = [word.translate(table) for word in words]
            words = [re_print.sub('', word) for word in words]
            # drop tokens containing digits and tokens emptied by the steps above
            words = [word for word in words if word.isalpha()]
            clean_pair.append(' '.join(words))
        # one cleaned row per input pair
        cleaned.append(clean_pair)
    return numpy.array(cleaned)

In [0]:
# tab-separated bilingual sentence file read by load_doc
filename = "./Data/hin.txt"

n_train = 20000 # number of sentence pairs to use for training

In [0]:
# read the raw text, split it into sentence pairs, clean them and keep
# only the first n_train rows
doc = load_doc(filename)
pairs = to_pairs(doc)
clean_pairs = clean_data(pairs)[0:n_train, :]

In [0]:
# preview ten cleaned pairs as "[source] => [target]"
for i in range(3000, 3010):
    print('[%s] => [%s]' % (clean_pairs[i, 0], clean_pairs[i, 1]))

In [0]:
# column 0 of clean_pairs holds the source sentences, column 1 the targets
input_texts = clean_pairs[:, 0]
# every target sentence is wrapped in a start marker (tab) and an end
# marker (newline) for the decoder
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]

# target_texts is a plain list, so its size comes from len() rather than .shape
print('Length of input_texts: ' + str(len(input_texts)))
print('Length of target_texts: ' + str(len(target_texts)))

In [0]:
# longest source / target sentence, counted in characters; the target
# length is taken over target_texts as built earlier in the notebook
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

## 2. Text Processing
### 2.1 Convert Texts To Sequences

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def text2sequences(max_len, lines):
    '''
        Turn texts into padded sequences of character indices.

        max_len: length every sequence is padded to (zeros are appended).
        lines: the texts to fit the character-level tokenizer on and to convert.
        return: (padded index sequences, the tokenizer's word_index dict).
    '''
    char_tokenizer = Tokenizer(char_level=True, filters='')
    char_tokenizer.fit_on_texts(lines)
    padded = pad_sequences(
        char_tokenizer.texts_to_sequences(lines),
        maxlen=max_len,
        padding='post',
    )
    return padded, char_tokenizer.word_index

# source sentences are padded to the longest source sentence ...
encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, input_texts)
# ... and target sentences to the longest *target* sentence, which is the
# length the decoder one-hot arrays are built with
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, target_texts)

print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

In [0]:
# one more class than there are dictionary entries
# NOTE(review): presumably the extra class is index 0, the value
# pad_sequences fills with -- confirm against the Keras Tokenizer docs
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_token: ' + str(num_encoder_tokens))
print('num_decoder_token: ' + str(num_decoder_tokens))

In [0]:
# inspect one target sentence
target_texts[100]

In [0]:
# inspect the padded index sequence of row 100
decoder_input_seq[100,:]

### 2.2 One-Hot Encode

In [0]:
from keras.utils import to_categorical

def onehot_encode(sequence, max_len, vocab_size):
    '''
        One-hot encode a batch of padded index sequences.

        sequence: array-like of shape (n, max_len) holding token indices;
                  float arrays with integral values are accepted.
        max_len: length of every padded sequence.
        vocab_size: number of classes, i.e. size of the one-hot axis.
        return: numpy float array of shape (n, max_len, vocab_size).
        raises: ValueError if sequence is not of shape (n, max_len),
                IndexError if an index is >= vocab_size.
    '''
    # indices must be integers to be usable as array positions
    indices = numpy.asarray(sequence).astype(int)
    n = len(indices)
    data = numpy.zeros((n, max_len, vocab_size))
    if n == 0:
        return data
    if indices.shape != (n, max_len):
        raise ValueError('expected sequences of shape (%d, %d), got %s'
                         % (n, max_len, indices.shape))
    # set data[i, t, indices[i, t]] = 1 for every sample i and time step t
    rows = numpy.arange(n)[:, None]
    steps = numpy.arange(max_len)[None, :]
    data[rows, steps, indices] = 1
    return data

# one-hot versions of the encoder and decoder input sequences
encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)

# the decoder target is the decoder input shifted one step to the left;
# the last position stays 0
decoder_target_seq = numpy.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:,1:]
decoder_target_data = onehot_encode(decoder_target_seq, max_decoder_seq_length, num_decoder_tokens)

print(encoder_input_data.shape)
print(decoder_input_data.shape)

## 3. Build The Network For Training
### 3.1 Encoder Network

In [0]:
from keras.layers import Input, LSTM
from keras.models import Model

# number of units of the encoder and decoder LSTMs
latent_dim = 256

# one-hot encoded source characters; None leaves the sequence length open
encoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_inputs')

# layer names must be unique within a model, so the LSTM must not reuse
# the name of the input layer
encoder_lstm = LSTM(latent_dim, return_state=True,
                    dropout=0.5, name='encoder_lstm')

# only the final hidden and cell states are kept; the LSTM output is discarded
_, state_h, state_c = encoder_lstm(encoder_inputs)

encoder_model = Model(inputs=encoder_inputs, outputs=[state_h, state_c], name='encoder')

In [0]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

# render the encoder graph as SVG
# NOTE(review): the SVG object is neither assigned nor the last expression
# of the cell, so it is presumably not displayed -- confirm
SVG(model_to_dot(encoder_model, show_shapes=False).create(prog='dot', format='svg'))

# write the same graph to a PDF file
plot_model(model=encoder_model,
          show_shapes=False,
          to_file='encoder.pdf')

encoder_model.summary()
    

### 3.2 Decoder Network

In [0]:
from keras.layers import Input, LSTM, Dense
from keras.models import Model

# the stand-alone decoder takes its initial hidden/cell state and the
# one-hot encoded target characters as explicit inputs
decoder_input_h = Input(shape=(latent_dim,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# returns the output of every time step plus the final states
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.5, name='decode_lstm')

decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, initial_state=[decoder_input_h, decoder_input_c])

# softmax over the target vocabulary, applied to the LSTM outputs
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# the model also outputs the new states, which decode_sequence feeds back in
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c], outputs=[decoder_outputs,state_h, state_c],
                     name='decoder')

In [0]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

# render the decoder graph as SVG
# NOTE(review): the SVG object is neither assigned nor the last expression
# of the cell, so it is presumably not displayed -- confirm
SVG(model_to_dot(decoder_model, show_shapes=False).create(prog='dot', format='svg'))

# write the same graph to a PDF file
plot_model(model=decoder_model, show_shapes=False, to_file='decoder.pdf')

decoder_model.summary()

### 3.3 Connect The Encoder And Decoder

In [0]:
# fresh inputs for the combined training model
# (decoder_input_x rebinds the name used when the stand-alone decoder was built)
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# the encoder's final states initialise the decoder LSTM; decoder_lstm and
# decoder_dense are the same layer objects the stand-alone decoder uses
encoder_final_states = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

model = Model(inputs=[encoder_input_x, decoder_input_x],
             outputs = decoder_pred,
             name='model_training')

In [0]:
# debug output: state_h as last assigned in the notebook, and the decoder's
# hidden-state input
print(state_h)
print(decoder_input_h)

In [0]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

# render the training model graph as SVG ('svg' is the output format name)
SVG(model_to_dot(model, show_shapes=False).create(prog='dot', format='svg'))

# write the same graph to a PDF file
plot_model( model=model, show_shapes=False,
          to_file='model_training.pdf')

model.summary()

### 3.5 Fit The Model On The Bilingual Dataset

In [0]:
# report the shapes of the three arrays used for training
print('shape of encoder_input_data%s' % (encoder_input_data.shape,))
print('shape of decoder_input_data%s' % (decoder_input_data.shape,))
print('shape of decoder_target_data%s' % (decoder_target_data.shape,))

In [0]:
# categorical cross-entropy matches the one-hot encoded targets
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# inputs: source sentences and target sentences; targets: the target
# sentences shifted by one step; 20% of the data is held out for validation
model.fit([encoder_input_data, decoder_input_data], 
          decoder_target_data,
         batch_size=64,
         epochs=50,
         validation_split=0.2)

# store the trained model on disk
model.save('seq2seq.h5')

## 4. Make Prediction
### 4.1 Translate English to Hindi

In [0]:
# inverse lookups: token index -> character
reverse_input_char_index = {index: char for char, index in input_token_index.items()}
reverse_target_char_index = {index: char for char, index in target_token_index.items()}

In [0]:
def decode_sequence(input_seq):
    '''
        Greedily decode one encoded source sentence into target text.

        input_seq: one-hot encoder input holding a single sentence, as
                   accepted by encoder_model.predict.
        return: the decoded characters as a string; it ends with the newline
                end marker unless the length limit stopped the decoding.
    '''
    # the encoder's final [h, c] states seed the decoder
    states_value = encoder_model.predict(input_seq)

    # the first decoder input is the one-hot encoded start marker (tab)
    target_seq = numpy.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1

    decoded_chars = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + list(states_value))

        # greedy choice: most probable token of the last time step
        sampled_token_index = int(numpy.argmax(output_tokens[0, -1, :]))

        # an index without a character (the padding class) is treated as the
        # end marker instead of raising a KeyError
        sampled_char = reverse_target_char_index.get(sampled_token_index, '\n')
        decoded_chars.append(sampled_char)

        # stop at the end marker or once the output exceeds the longest
        # target sentence
        if sampled_char == '\n' or len(decoded_chars) > max_decoder_seq_length:
            break

        # feed the sampled token and the new states into the next step
        target_seq = numpy.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1
        states_value = [h, c]

    return ''.join(decoded_chars)

In [0]:
# translate twenty training sentences and print source, reference and prediction
for seq_index in range(2100, 2120):
    # slicing (not indexing) keeps the leading batch axis
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English: \t ', input_texts[seq_index])
    # [1:-1] drops the first and last character of the stored target text
    print(' (true): \t ', target_texts[seq_index][1:-1])
    # [0:-1] drops the last decoded character
    print(' (pred): \t ', decoded_sentence[0:-1])
   

### 4.2 Translate An English Sentence To The Target Language

In [0]:
input_sentence = 'why is that'

# tokenization: map every (lower-cased) character to its index in the fitted
# source vocabulary; characters not in the vocabulary are skipped
input_ids = [input_token_index[char] for char in input_sentence.lower()
             if char in input_token_index]
# pad with zeros at the end; an over-long sentence keeps its beginning
input_seq = pad_sequences([input_ids], maxlen=max_encoder_seq_length,
                          padding='post', truncating='post')

# one-hot encode into the (1, max_encoder_seq_length, num_encoder_tokens)
# shape the encoder expects
input_x = onehot_encode(input_seq, max_encoder_seq_length, num_encoder_tokens)

# translation; strip() removes the trailing end marker
translated_sentence = decode_sequence(input_x).strip()

print('source sentence is: ' + input_sentence)
print('translated sentence is: ' + translated_sentence)