<h1>Neural Machine Translation for French to English</h1>

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import gzip
import codecs as cds
import re
import time
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from tensorflow.python.layers.core import Dense
from tensorflow.contrib.seq2seq import TrainingHelper, GreedyEmbeddingHelper, BasicDecoder, dynamic_decode
from tensorflow.contrib.seq2seq import BahdanauAttention, AttentionWrapper, sequence_loss
from tensorflow.contrib.rnn import GRUCell, DropoutWrapper
# Special vocabulary tokens; each receives its own id in build_word2id_mapping.
# Start-of-sequence marker: prepended to decoder inputs and used as the greedy decoder's start token.
TOKEN_GO = '<GO>'
# End-of-sequence marker: appended to sentences when convert_sentence_to_ids is called with eos=True.
TOKEN_EOS = '<EOS>'
# Filler used by sents_pad to bring every sentence in a batch to the same length.
TOKEN_PAD = '<PAD>'
# Stand-in id for words that are missing from the word-to-id mapping.
TOKEN_UNK = '<UNK>'

In [None]:
# Load the parallel corpus; the two files are combined row by row into one DataFrame.
# The encoding is stated explicitly so accented French characters decode the same way
# on every platform instead of depending on the locale default.
# NOTE(review): assumes both files are UTF-8 encoded - confirm against the data source.
with open('data/train_fr_lines.txt', encoding='utf-8') as frfile:
    frdata = list(frfile)
with open('data/train_en_lines.txt', encoding='utf-8') as enfile:
    endata = list(enfile)

# Each entry keeps its trailing newline, exactly as read from the file.
mtdata = pd.DataFrame({'FR':frdata,'EN':endata})
# Sentence lengths measured as the number of single-space-separated pieces.
mtdata['FR_len'] = mtdata['FR'].apply(lambda x: len(x.split(' ')))
mtdata['EN_len'] = mtdata['EN'].apply(lambda x: len(x.split(' ')))

In [None]:
# Peek at the first two raw sentences of each language (French first, then English).
for column in ('FR', 'EN'):
    print(mtdata[column].head(2).values)

In [None]:
# Materialise each DataFrame column as a plain Python list of sentences.
mtdata_fr = [sentence for sentence in mtdata.FR]
mtdata_en = [sentence for sentence in mtdata.EN]

In [None]:
def count_words(words_dict, text):
    """Add whitespace-separated word frequencies from ``text`` into ``words_dict``.

    Args:
        words_dict: dict mapping word -> count; updated in place.
        text: iterable of sentence strings.

    Returns:
        None. The result is accumulated in ``words_dict``.
    """
    for sentence in text:
        for word in sentence.split():
            words_dict[word] = words_dict.get(word, 0) + 1

In [None]:
# Tally word frequencies separately for each language.
word_counts_dict_fr, word_counts_dict_en = {}, {}
for counts, sentences in ((word_counts_dict_fr, mtdata_fr),
                          (word_counts_dict_en, mtdata_en)):
    count_words(counts, sentences)

print("Total French words in Vocabulary:", len(word_counts_dict_fr))
print("Total English words in Vocabulary", len(word_counts_dict_en))

In [None]:
def build_wd_vector_matrix(vect_f):
    """Read a whitespace-separated word-vector text file into a dict.

    Every kept line contributes ``first field -> float32 array of the remaining
    fields``. Lines with fewer than 26 whitespace-separated fields are skipped.

    Args:
        vect_f: path of the UTF-8 encoded vector file.

    Returns:
        dict mapping each word to its numpy float32 vector.
    """
    vectors = {}
    with cds.open(vect_f, 'r', 'utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if len(fields) >= 26:
                vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return vectors
# Word -> vector lookup read from the GloVe file; build_word2id_mapping and
# build_embeddings consult it as a notebook-level global.
embs_index = build_wd_vector_matrix(vect_f='../../temp/glove.6B.50d.txt')

In [None]:
def build_word2id_mapping(word_counts_dict, count_threshold=20):
    """Assign a dense integer id to every kept word and to the special tokens.

    A word is kept when it occurs at least ``count_threshold`` times or when it
    has an entry in the notebook-level ``embs_index``. The special tokens
    (UNK, PAD, EOS, GO) are then given the next free ids.

    Args:
        word_counts_dict: dict mapping word -> occurrence count.
        count_threshold: minimum count for a word without a pre-trained vector
            to be kept. Defaults to 20, the value previously hard-coded.

    Returns:
        Tuple ``(word2int, int2word)`` of mutually inverse dicts.
    """
    word2int = {}
    for word, count in word_counts_dict.items():
        if count >= count_threshold or word in embs_index:
            word2int[word] = len(word2int)

    # A special token may already be in the vocabulary because the corpus
    # contains it literally. Re-assigning it ``len(word2int)`` would not grow
    # the dict, so the next special token would receive the very same id.
    # Keeping the existing id keeps the id space dense and collision-free.
    for code in (TOKEN_UNK, TOKEN_PAD, TOKEN_EOS, TOKEN_GO):
        if code not in word2int:
            word2int[code] = len(word2int)

    int2word = {value: word for word, value in word2int.items()}
    return word2int, int2word

In [None]:
def build_embeddings(word2int):
    """Build a float32 embedding matrix whose row ``i`` belongs to word id ``i``.

    Words found in the notebook-level ``embs_index`` take their stored vector;
    every other word gets a vector drawn uniformly from [-1.0, 1.0).

    Args:
        word2int: dict mapping word -> row index.

    Returns:
        numpy array of shape ``(len(word2int), 50)`` and dtype float32.
    """
    embedding_dim = 50
    word_emb_matrix = np.zeros((len(word2int), embedding_dim), dtype=np.float32)
    for word, row in word2int.items():
        if word not in embs_index:
            # No pre-trained vector available: start from random values.
            word_emb_matrix[row] = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
            continue
        word_emb_matrix[row] = embs_index[word]
    return word_emb_matrix

In [None]:
# For each language: build the id mappings, then an embedding matrix whose
# rows line up with those ids (French first, then English).
fr_word2int, fr_int2word = build_word2id_mapping(word_counts_dict_fr)
en_word2int, en_int2word = build_word2id_mapping(word_counts_dict_en)

fr_embs_mat = build_embeddings(word2int=fr_word2int)
en_embs_mat = build_embeddings(word2int=en_word2int)

print("Length of french word embeddings: ", len(fr_embs_mat))
print("Length of english word embeddings: ", len(en_embs_mat))

In [None]:
def convert_sentence_to_ids(text, word2int, eos=False):
    """Translate sentences into lists of word ids.

    Words absent from ``word2int`` are replaced by the id of ``TOKEN_UNK``.

    Args:
        text: iterable of sentence strings.
        word2int: dict mapping word -> id.
        eos: when true, the id of ``TOKEN_EOS`` is appended to every sentence.

    Returns:
        Tuple ``(wordints, word_count)``: the list of id lists (one per
        sentence) and the total number of words seen, EOS markers excluded.
    """
    encoded = []
    total_words = 0
    for sentence in text:
        tokens = sentence.split()
        total_words += len(tokens)
        ids = [word2int[tok] if tok in word2int else word2int[TOKEN_UNK]
               for tok in tokens]
        if eos:
            ids.append(word2int[TOKEN_EOS])
        encoded.append(ids)
    return encoded, total_words

In [None]:
# Encode both corpora as id sequences; only the English side gets a trailing EOS id.
id_fr, word_count_fr = convert_sentence_to_ids(text=mtdata_fr, word2int=fr_word2int)
id_en, word_count_en = convert_sentence_to_ids(text=mtdata_en, word2int=en_word2int, eos=True)

In [None]:
def unknown_tokens(sentence, word2int):
    """Return how many ids in ``sentence`` equal the id of ``TOKEN_UNK``.

    Args:
        sentence: iterable of word ids.
        word2int: dict mapping word -> id; must contain ``TOKEN_UNK`` when
            ``sentence`` is non-empty.
    """
    return sum(1 for word_id in sentence if word_id == word2int[TOKEN_UNK])

In [None]:
# Corpus-wide maximum lengths and the thresholds used to discard sentence pairs.
max_en_length = int(mtdata.EN_len.max())
max_fr_length = int(mtdata.FR_len.max())
min_length = 4
unknown_token_en_limit = 10
unknown_token_fr_limit = 10

en_filtered = []
fr_filtered = []
# Keep a pair only when both sides are long enough and neither side contains
# more unknown-word ids than its limit. English lengths include the EOS id.
for en_ids, fr_ids in zip(id_en, id_fr):
    too_many_unknown = (unknown_tokens(en_ids, en_word2int) > unknown_token_en_limit
                        or unknown_tokens(fr_ids, fr_word2int) > unknown_token_fr_limit)
    too_short = len(en_ids) < min_length or len(fr_ids) < min_length
    if not (too_many_unknown or too_short):
        fr_filtered.append(fr_ids)
        en_filtered.append(en_ids)
print("Length of filtered french/english sentences: ", len(fr_filtered), len(en_filtered) )

In [None]:
def model_inputs():
    """Create the graph's input placeholders.

    Returns:
        Tuple ``(dat_inps, targets, rt_lr, drp_rt, en_len, max_en_len, fr_len)``:
        two int32 2-D id placeholders, two float32 placeholders (learning rate
        and the value later used as the cells' ``input_keep_prob``), the int32
        1-D English length placeholder, its ``reduce_max``, and the int32 1-D
        French length placeholder. Tensor names match the returned order's
        original names so they can be looked up after restoring a checkpoint.
    """
    source_ids = tf.placeholder(tf.int32, [None, None], name='dat_inps')
    target_ids = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='rt_lr')
    keep_prob = tf.placeholder(tf.float32, name='drp_rt')
    target_lengths = tf.placeholder(tf.int32, (None,), name='en_len')
    # Longest target in the fed batch; used to bound the decoding loops.
    longest_target = tf.reduce_max(target_lengths, name='max_en_len')
    source_lengths = tf.placeholder(tf.int32, (None,), name='fr_len')
    return (source_ids, target_ids, learning_rate, keep_prob,
            target_lengths, longest_target, source_lengths)

In [None]:
def process_encoding_input(target_data, word2int, batch_size):
    """Build the decoder input from the target batch.

    The last column of ``target_data`` is dropped and a column holding the id
    of ``TOKEN_GO`` is prepended, so the result has the same width as the input.

    Args:
        target_data: 2-D int tensor of target ids.
        word2int: dict mapping word -> id; must contain ``TOKEN_GO``.
        batch_size: number of rows in the batch.

    Returns:
        2-D int tensor of decoder input ids.
    """
    without_last = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    go_column = tf.fill([batch_size, 1], word2int[TOKEN_GO])
    return tf.concat([go_column, without_last], 1)

In [None]:
def get_recnet_cell(recnet_cell_sz,drp_rt):
    """Return a GRU cell of ``recnet_cell_sz`` units wrapped with input dropout.

    ``drp_rt`` is handed to the wrapper as ``input_keep_prob``, i.e. it is the
    probability of keeping an input, not of dropping it.
    """
    return DropoutWrapper(GRUCell(recnet_cell_sz), input_keep_prob=drp_rt)

def encoding_layer(recnet_cell_sz, len_seq, n_layers, recnet_inp, drp_rt):
    """Run a stack of bidirectional GRU layers over the embedded input.

    Each layer consumes the concatenated forward/backward outputs of the layer
    below it. Previously every layer was fed ``recnet_inp`` directly, so only
    the last layer influenced the result and the others were wasted work.

    Args:
        recnet_cell_sz: number of units in each directional GRU cell.
        len_seq: 1-D int tensor of sequence lengths for the batch.
        n_layers: number of stacked bidirectional layers; must be >= 1.
        recnet_inp: embedded input tensor fed to the first layer.
        drp_rt: value used as ``input_keep_prob`` for every cell.

    Returns:
        Tuple ``(op_enc, st_enc)``: the last layer's forward and backward
        outputs concatenated on axis 2, and that layer's
        ``(forward_state, backward_state)`` pair.

    Raises:
        ValueError: if ``n_layers`` is smaller than 1.
    """
    if n_layers < 1:
        raise ValueError("n_layers must be at least 1")

    layer_inp = recnet_inp
    for l in range(n_layers):
        with tf.variable_scope('encoding_l_{}'.format(l)):
            fw_recnet = get_recnet_cell(recnet_cell_sz, drp_rt)
            bw_recnet = get_recnet_cell(recnet_cell_sz, drp_rt)
            op_enc, st_enc = tf.nn.bidirectional_dynamic_rnn(fw_recnet, bw_recnet,
                                                             layer_inp,
                                                             len_seq,
                                                             dtype=tf.float32)
            # Join both directions; this is the input of the next layer and,
            # after the last iteration, the encoder output.
            layer_inp = tf.concat(op_enc, 2)
    return layer_inp, st_enc

In [None]:
def training_decoding_layer(inp_dec_emb, en_len, c_dec, st_init, op_layer, 
                            v_sz, max_en_len):
    """Decode with teacher forcing and return the decoder's final outputs.

    Args:
        inp_dec_emb: embedded decoder inputs (batch-major).
        en_len: 1-D int tensor of target sequence lengths.
        c_dec: decoder RNN cell.
        st_init: initial decoder state.
        op_layer: layer applied to the cell outputs.
        v_sz: accepted for signature compatibility; not used in this function.
        max_en_len: upper bound on the number of decoding steps.

    Returns:
        The first element returned by ``dynamic_decode`` (the final outputs).
    """
    train_helper = TrainingHelper(inputs=inp_dec_emb,
                                  sequence_length=en_len,
                                  time_major=False)
    train_decoder = BasicDecoder(c_dec, train_helper, st_init, op_layer)
    final_outputs, _final_state, _final_lengths = dynamic_decode(
        train_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_en_len)
    return final_outputs

In [None]:
def inference_decoding_layer(embs, tk_st, toks_en, c_dec, initial_state, lyr_op,
                             max_en_len, bt_sz):
    """Decode greedily and return the decoder's final outputs.

    Args:
        embs: embedding matrix used to embed each predicted id.
        tk_st: id of the start token fed at the first step.
        toks_en: id of the token that ends a sequence.
        c_dec: decoder RNN cell.
        initial_state: initial decoder state.
        lyr_op: layer applied to the cell outputs.
        max_en_len: upper bound on the number of decoding steps.
        bt_sz: batch size; the start token is tiled this many times.

    Returns:
        The first element returned by ``dynamic_decode`` (the final outputs).
    """
    start_ids = tf.tile(tf.constant([tk_st], dtype=tf.int32), [bt_sz], name='toks_st')
    greedy_helper = GreedyEmbeddingHelper(embs, start_ids, toks_en)
    greedy_decoder = BasicDecoder(c_dec, greedy_helper, initial_state, lyr_op)
    final_outputs, _final_state, _final_lengths = dynamic_decode(
        greedy_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_en_len)
    return final_outputs

In [None]:
def decoding_layer(dec_emb_inp, embs, enc_op, st_enc, v_size, fr_len, 
                   en_len,max_en_len, recnet_sz, word2int, drp_rt, bt_sz, lyr_n):
    """Build the attention decoder and return its training and inference outputs.

    The cell and attention sizes now come from the ``recnet_sz`` argument; the
    previous code read the notebook-level ``rnn_len`` global, which only worked
    when a later cell had already been executed.

    Args:
        dec_emb_inp: embedded decoder inputs used for teacher forcing.
        embs: embedding matrix used to embed predictions during inference.
        enc_op: encoder outputs that the attention mechanism attends over.
        st_enc: encoder final state; its first element seeds the decoder.
        v_size: number of output classes of the projection layer.
        fr_len: 1-D int tensor of source lengths (attention memory lengths).
        en_len: 1-D int tensor of target lengths.
        max_en_len: upper bound on the number of decoding steps.
        recnet_sz: GRU size, also used as attention depth and attention layer size.
        word2int: target vocabulary; must contain ``TOKEN_GO`` and ``TOKEN_EOS``.
        drp_rt: value used as the cell's ``input_keep_prob``.
        bt_sz: batch size.
        lyr_n: number of loop iterations that create a cell; must be >= 1.

    Returns:
        Tuple ``(lgits_tr, lgits_inf)`` of training and inference decoder outputs.

    Raises:
        ValueError: if ``lyr_n`` is smaller than 1.
    """
    if lyr_n < 1:
        raise ValueError("lyr_n must be at least 1")

    # NOTE(review): every iteration rebinds c_dec, so only the cell created in
    # the last iteration is used below; the decoder has a single GRU layer
    # whatever lyr_n is.
    for l in range(lyr_n):
        with tf.variable_scope('dec_rnn_layer_{}'.format(l)):
            gru = tf.contrib.rnn.GRUCell(recnet_sz)
            c_dec = tf.contrib.rnn.DropoutWrapper(gru, input_keep_prob=drp_rt)
    out_l = Dense(v_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    attention = BahdanauAttention(recnet_sz, enc_op, fr_len,
                                  normalize=False,
                                  name='BahdanauAttention')
    c_dec = AttentionWrapper(c_dec, attention, recnet_sz)
    st_attn_zero = c_dec.zero_state(bt_sz, tf.float32)
    # Seed the wrapped cell with st_enc[0], the first (forward-direction)
    # state of the pair returned by the bidirectional encoder.
    st_attn_zero = st_attn_zero.clone(cell_state=st_enc[0])
    with tf.variable_scope("decoding_layer"):
        lgits_tr = training_decoding_layer(dec_emb_inp,
                                           en_len,
                                           c_dec,
                                           st_attn_zero,
                                           out_l,
                                           v_size,
                                           max_en_len)
    # Same scope with reuse=True so inference shares the training weights.
    with tf.variable_scope("decoding_layer", reuse=True):
        lgits_inf = inference_decoding_layer(embs, word2int[TOKEN_GO],
                                             word2int[TOKEN_EOS],
                                             c_dec,
                                             st_attn_zero,
                                             out_l,
                                             max_en_len,
                                             bt_sz)

    return lgits_tr, lgits_inf

In [None]:
def model_seqtoseq(dat_inp, target_en_data, drp_rt, fr_len, en_len, max_en_len, 
                  v_size, recnet_sz, lyr_n, word2int_en, bt_sz):
    """Wire the encoder and the attention decoder into one model.

    The source ids are embedded through a variable initialised from the
    notebook-level ``fr_embs_mat``; decoder inputs and greedy predictions are
    embedded with the notebook-level ``en_embs_mat`` array directly.

    Args:
        dat_inp: 2-D int tensor of source ids.
        target_en_data: 2-D int tensor of target ids.
        drp_rt: value used as ``input_keep_prob`` in every cell.
        fr_len: 1-D int tensor of source lengths.
        en_len: 1-D int tensor of target lengths.
        max_en_len: upper bound on the number of decoding steps.
        v_size: number of output classes.
        recnet_sz: GRU size.
        lyr_n: layer count passed to the encoder and decoder builders.
        word2int_en: target vocabulary mapping.
        bt_sz: batch size.

    Returns:
        Tuple of (training decoder outputs, inference decoder outputs).
    """
    # Encoder side.
    source_embeddings = tf.Variable(fr_embs_mat, name="inp_wd_embs")
    embedded_source = tf.nn.embedding_lookup(source_embeddings, dat_inp)
    encoder_outputs, encoder_state = encoding_layer(
        recnet_sz, fr_len, lyr_n, embedded_source, drp_rt)

    # Decoder side: shift the targets right behind a GO id, then embed them.
    decoder_ids = process_encoding_input(target_en_data, word2int_en, bt_sz)
    embedded_decoder_ids = tf.nn.embedding_lookup(en_embs_mat, decoder_ids)

    return decoding_layer(embedded_decoder_ids, en_embs_mat,
                          encoder_outputs, encoder_state,
                          v_size, fr_len, en_len, max_en_len,
                          recnet_sz, word2int_en, drp_rt, bt_sz, lyr_n)

In [None]:
def sents_pad(sentences_batch,word2int):
    """Right-pad every sentence with the ``TOKEN_PAD`` id to the batch's longest length.

    Args:
        sentences_batch: non-empty list of id lists.
        word2int: dict mapping word -> id; must contain ``TOKEN_PAD``.

    Returns:
        New list of new id lists, all of equal length; inputs are not modified.
    """
    longest = max(len(sentence) for sentence in sentences_batch)
    pad_id = word2int[TOKEN_PAD]
    padded = []
    for sentence in sentences_batch:
        missing = longest - len(sentence)
        padded.append(sentence + [pad_id] * missing)
    return padded

In [None]:
def get_batches(en_text, fr_text, bt_sz):
    """Yield consecutive padded batches of English and French id sequences.

    Only full batches are produced (``len(fr_text) // bt_sz`` of them). Padding
    uses the notebook-level ``en_word2int`` / ``fr_word2int`` mappings.

    Args:
        en_text: list of English id lists.
        fr_text: list of French id lists, aligned with ``en_text``.
        bt_sz: number of sentence pairs per batch.

    Yields:
        Tuple ``(pad_en_bt, pad_fr_bt, pad_en_lens, pad_fr_lens)``: the two
        padded batches as numpy arrays plus, for each, the list of row lengths.
        The lengths are measured on the padded rows, so within a batch they
        all equal that batch's longest sentence.
    """
    n_batches = len(fr_text) // bt_sz
    for bt_idx in range(n_batches):
        window = slice(bt_idx * bt_sz, (bt_idx + 1) * bt_sz)
        pad_en_bt = np.array(sents_pad(en_text[window], en_word2int))
        pad_fr_bt = np.array(sents_pad(fr_text[window], fr_word2int))

        pad_en_lens = [len(row) for row in pad_en_bt]
        pad_fr_lens = [len(row) for row in pad_fr_bt]

        yield pad_en_bt, pad_fr_bt, pad_en_lens, pad_fr_lens

In [None]:
# Training hyper-parameters read by the graph-construction and training cells.
# Upper bound on the number of passes over the training subset (training may stop early).
epochs = 20
# Number of sentence pairs per batch; also baked into the graph as the decoder batch size.
bt_sz = 64
# GRU size passed to the model as its recurrent cell size.
rnn_len = 256
# Layer count passed to the encoder and decoder builders.
n_layers = 2
# Learning rate fed to the Adam optimizer on every training step.
lr = 0.005
# Fed to the graph as the cells' input_keep_prob, so this is a keep probability, not a drop rate.
dr_prob = 0.75
# Directory that receives the TensorBoard summaries and the model checkpoint.
logs_path='/tmp/models/'

In [None]:
# Build the training/inference graph.
train_graph = tf.Graph()
with train_graph.as_default():

    dat_inp, targets, lr_rt, drp_rt, en_len, max_en_len, fr_len = model_inputs()

    # The source ids are reversed along the time axis before encoding.
    # The output layer has exactly one class per English vocabulary id: the
    # decoder embedding matrix has len(en_word2int) rows, so an extra class
    # would let greedy decoding emit an id that has neither an embedding row
    # nor an en_int2word entry.
    lgits_tr, lgits_inf = model_seqtoseq(tf.reverse(dat_inp, [-1]),
                                         targets,
                                         drp_rt,
                                         fr_len,
                                         en_len,
                                         max_en_len,
                                         len(en_word2int),
                                         rnn_len,
                                         n_layers,
                                         en_word2int,
                                         bt_sz)

    # Named tensors so they can be fetched by name after restoring a checkpoint.
    lgits_tr = tf.identity(lgits_tr.rnn_output, 'lgits_tr')
    lgits_inf = tf.identity(lgits_inf.sample_id, name='predictions')

    # Weights for the loss: 1.0 for positions below each fed target length, 0.0 elsewhere.
    seq_masks = tf.sequence_mask(en_len, max_en_len, dtype=tf.float32, name='masks')

    with tf.name_scope("optimizer"):
        tr_cost = sequence_loss(lgits_tr, targets, seq_masks)
        optimizer = tf.train.AdamOptimizer(lr_rt)
        grds = optimizer.compute_gradients(tr_cost)
        # Clip every gradient element to [-5, 5]; variables without a gradient are skipped.
        cap_grds = [(tf.clip_by_value(gr, -5., 5.), var) for gr, var in grds
                    if gr is not None]
        train_op = optimizer.apply_gradients(cap_grds)
    tf.summary.scalar("cost", tr_cost)
print("Graph created.")

In [None]:
# Training-loop settings.
# NOTE(review): min_learning_rate is not read anywhere in this cell.
min_learning_rate = 0.0006
display_step = 20          # print the running batch loss every this many batches
stop_early_count = 0       # consecutive checks without improvement so far
stop_early_max_count = 3   # stop after this many consecutive checks without improvement
per_epoch = 10             # used to derive how often the loss is checked within an epoch


update_loss = 0
bt_loss = 0
s_upd_loss = []            # loss sum recorded at every check

# Train on (at most) the first 30000 filtered sentence pairs.
en_tr = en_filtered[0:30000]
fr_tr = fr_filtered[0:30000]
n_batches = len(fr_tr) // bt_sz
# Never let the check interval reach zero (modulo by zero) or go negative on small data.
update_check = max(1, (n_batches // per_epoch) - 1)
checkpoint = logs_path + 'best_so_far_model.ckpt'
with tf.Session(graph=train_graph) as sess:
    s_writer = tf.summary.FileWriter(logs_path, graph=train_graph)
    op_summ_merged = tf.summary.merge_all()
    # One Saver for the whole run; building a new one at every checkpoint
    # would keep adding save/restore ops to the graph.
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    try:
        for epoch_i in range(1, epochs+1):
            update_loss = 0
            bt_loss = 0
            for bt_i, (en_batch, fr_batch, en_text_len, fr_text_len) in enumerate(
                    get_batches(en_tr, fr_tr, bt_sz)):
                before = time.time()
                _, loss, res_summ = sess.run(
                    [train_op, tr_cost, op_summ_merged],
                    {dat_inp: fr_batch,
                     targets: en_batch,
                     lr_rt: lr,
                     en_len: en_text_len,
                     fr_len: fr_text_len,
                     drp_rt: dr_prob})
                bt_loss += loss
                update_loss += loss
                after = time.time()
                batch_time = after - before
                # Global step that increases monotonically across epochs.
                s_writer.add_summary(res_summ, (epoch_i - 1) * n_batches + bt_i)
                if bt_i % display_step == 0 and bt_i > 0:
                    print('** Epoch {:>3}/{} Batch {:>4}/{} - Batch Loss: {:>6.3f}, seconds: {:>4.2f}'
                          .format(epoch_i,
                                  epochs,
                                  bt_i,
                                  n_batches,
                                  bt_loss / display_step,
                                  batch_time*display_step))
                    bt_loss = 0

                if bt_i % update_check == 0 and bt_i > 0:
                    print("Average loss:", round(update_loss/update_check,3))
                    s_upd_loss.append(update_loss)

                    # Checkpoint only when this is the lowest loss sum seen so far.
                    if update_loss <= min(s_upd_loss):
                        print('Saving model')
                        stop_early_count = 0
                        saver.save(sess, checkpoint)

                    else:
                        print("No Improvement.")
                        stop_early_count += 1
                        if stop_early_count == stop_early_max_count:
                            break
                    update_loss = 0

            if stop_early_count == stop_early_max_count:
                print("Stopping Training.")
                break
    finally:
        # Flush and release the summary file even if training is interrupted.
        s_writer.close()

In [None]:
# Pick a random sentence pair among the first 3000 filtered pairs. The bound is
# clamped to the corpus size so a smaller corpus cannot yield an out-of-range index.
random = np.random.randint(0, min(3000, len(fr_filtered)))
fr_text = fr_filtered[random]

checkpoint = logs_path + 'best_so_far_model.ckpt'

# Restore the saved graph and weights into a fresh graph and look the tensors up by name.
g_load = tf.Graph()
with tf.Session(graph=g_load) as sess:
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    dat_inp = g_load.get_tensor_by_name('dat_inps:0')
    lgits = g_load.get_tensor_by_name('predictions:0')
    fr_length = g_load.get_tensor_by_name('fr_len:0')
    en_length = g_load.get_tensor_by_name('en_len:0')
    dropout_prob = g_load.get_tensor_by_name('drp_rt:0')
    # The graph was built for batches of bt_sz rows, so the sentence is repeated
    # bt_sz times and only the first result row is kept. The value fed as
    # en_length caps the number of decoding steps at the source length.
    result_lgits = sess.run(lgits, {dat_inp: [fr_text]*bt_sz,
                                    en_length: [len(fr_text)],
                                    fr_length: [len(fr_text)]*bt_sz,
                                    dropout_prob: 1.0})[0]

pad = en_word2int[TOKEN_PAD]

print('\nFrench Text')
print('  Word Ids:    {}'.format([i for i in fr_text]))
print('  Input Words: {}'.format(" ".join([fr_int2word[i] for i in fr_text])))

print('\nEnglish Text')
print('  Word Ids:       {}'.format([i for i in result_lgits if i != pad]))
print('  Response Words: {}'.format(" ".join([en_int2word[i] for i in result_lgits if i != pad])))
print(' Ground Truth: {}'.format(" ".join([en_int2word[i] for i in en_filtered[random]])))