In [1]:
# dependencies
import tensorflow as tf
import numpy as np
from sklearn.cross_validation import train_test_split
import time
import matplotlib.pyplot as plt
import pickle
import codecs

In [2]:
def read_dataset(filepath):
    """Load and return the object pickled in the file at ``filepath``.

    The file is opened in binary mode and is closed again before the
    function returns.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(filepath, mode='rb') as pickle_file:
        dataset = pickle.load(pickle_file)
    return dataset

In [3]:
# Unpack the eight objects stored in the pickled dataset file.
dataset_location = "./data.p"
(
    X, Y,
    l1_word2idx, l1_idx2word, l1_vocab,
    l2_word2idx, l2_idx2word, l2_vocab,
) = read_dataset(dataset_location)

In [4]:
# Number of encoder / decoder time steps used when building the graph.
input_seq_len, output_seq_len = 20, 22

# Vocabulary sizes, enlarged by the special tokens listed on each line.
l1_vocab_size = 2 + len(l1_vocab)  # <pad>, <ukn>
l2_vocab_size = 4 + len(l2_vocab)  # <pad>, <ukn>, <eos>, <go>

In [5]:
def decode_sentence_string(sentences, idx2word):
    """Turn index-encoded sentences into space-separated strings.

    Indices 1, 2 and 3 are dropped; every other index is looked up in
    ``idx2word`` and followed by a single space, so each non-empty result
    ends with a trailing space.

    Returns a list holding one string per input sentence.
    """
    # presumably the ids of special tokens -- confirm against the vocabulary
    skipped = (1, 2, 3)
    return [
        "".join(idx2word[token] + " " for token in sentence if token not in skipped)
        for sentence in sentences
    ]

In [6]:
# let's define some helper functions

# simple softmax function
def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The maximum of ``x`` is subtracted before exponentiating so that large
    inputs do not overflow.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    total = exps.sum()
    return exps / total

# feed data into placeholders
def feed_dict(x, y, batch_size = 64):
    """Build a feed dictionary for one randomly sampled batch.

    ``batch_size`` distinct row indices are drawn from ``range(len(x))``
    (without replacement) and used to index both ``x`` and ``y``.

    The returned dict is keyed by placeholder ``.name`` and contains:
      * one array per encoder step (``input_seq_len`` entries),
      * one array per decoder step (``output_seq_len`` entries),
      * an all-``<pad>`` array for the last entry of ``targets``,
      * one float32 weight array per decoder step: 0.0 where the *next*
        decoder input is ``<pad>``, 1.0 otherwise; the final step's
        weights are all zero.

    Relies on the module-level names ``encoder_inputs``, ``decoder_inputs``,
    ``targets``, ``target_weights`` and ``l2_word2idx``.
    NOTE(review): ``targets`` and ``target_weights`` are not defined in the
    cells visible here -- confirm they exist before calling this helper.
    """
    feed = {}

    rows = np.random.choice(len(x), size = batch_size, replace = False)

    # One column of token ids per encoder / decoder time step.
    for step in range(input_seq_len):
        feed[encoder_inputs[step].name] = np.array([x[row][step] for row in rows])

    for step in range(output_seq_len):
        feed[decoder_inputs[step].name] = np.array([y[row][step] for row in rows])

    pad_id = l2_word2idx['<pad>']
    feed[targets[-1].name] = np.full(shape = [batch_size], fill_value = pad_id)

    # Weight of step t depends on the decoder input at step t + 1.
    for step in range(output_seq_len - 1):
        next_tokens = feed[decoder_inputs[step + 1].name]
        feed[target_weights[step].name] = np.array(
            [0.0 if token == pad_id else 1.0 for token in next_tokens],
            dtype = np.float32)

    last_step = output_seq_len - 1
    feed[target_weights[last_step].name] = np.zeros(batch_size, dtype = np.float32)

    return feed

# decode output sequence
def decode_output(output_seq):
    """Map the first ``output_seq_len`` entries of ``output_seq`` to words.

    For each step the entry is passed through ``softmax`` and the index of
    the largest resulting value is looked up in ``l2_idx2word``.

    Returns the list of looked-up words, one per step.
    """
    return [
        l2_idx2word[np.argmax(softmax(output_seq[step]))]
        for step in range(output_seq_len)
    ]

In [7]:
# Load the encoded test set through the shared helper so the file handle is
# closed deterministically (a bare ``pickle.load(open(...))`` never closes it).
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
whole_test_data = read_dataset("./encoded_test.p")
l1_test_data = whole_test_data['X']
l2_test_data = whole_test_data['Y']

In [8]:
# For whole test data
#
# Builds the inference graph once, restores the latest checkpoint once, then
# translates the test set in fixed-size slices, writing one result file per
# slice to ./test_results/test_result_<n>.txt.

test_batch_size = 1000   # sentences per result file
num_test_batches = 10    # result files test_result_1.txt .. test_result_10.txt

with tf.Graph().as_default():

    # placeholders: one int32 vector per encoder / decoder time step
    encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
    decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]

    # output projection
    size = 512
    w_t = tf.get_variable('proj_w', [l2_vocab_size, size], tf.float32)
    b = tf.get_variable('proj_b', [l2_vocab_size], tf.float32)
    w = tf.transpose(w_t)
    output_projection = (w, b)

    # change the model so that output at time t can be fed as input at time t+1
    outputs, states = tf.nn.seq2seq.embedding_attention_seq2seq(
                                                encoder_inputs,
                                                decoder_inputs,
                                                tf.nn.rnn_cell.BasicLSTMCell(size),
                                                num_encoder_symbols = l1_vocab_size,
                                                num_decoder_symbols = l2_vocab_size,
                                                embedding_size = 80,
                                                feed_previous = True, # <-----this is changed----->
                                                output_projection = output_projection,
                                                dtype = tf.float32)

    # ops for projecting outputs
    outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]

    # restore all variables - use the last checkpoint saved.
    # The saver, checkpoint path and session do not depend on the batch, so
    # they are created once instead of once per loop iteration.
    saver = tf.train.Saver()
    path = tf.train.latest_checkpoint('./checkpoints/')
    if path is None:
        # Fail before any result file is created.
        raise ValueError("No checkpoint found in './checkpoints/'")

    with tf.Session() as sess:
        # restore
        saver.restore(sess, path)

        for idx in range(1, num_test_batches + 1):
            start, stop = (idx-1)*test_batch_size, idx*test_batch_size
            l1_sentences_encoded = l1_test_data[start:stop]
            l2_sentences_encoded = l2_test_data[start:stop]

            l1_sentences = decode_sentence_string(l1_sentences_encoded, l1_idx2word)
            l2_sentences = decode_sentence_string(l2_sentences_encoded, l2_idx2word)

            # feed data into placeholders
            feed = {}
            for i in range(input_seq_len):
                feed[encoder_inputs[i].name] = np.array([sentence[i] for sentence in l1_sentences_encoded])

            # Only the first decoder input (<go>) is supplied.
            feed[decoder_inputs[0].name] = np.array([l2_word2idx['<go>']] * len(l1_sentences_encoded))

            # translate
            output_sequences = sess.run(outputs_proj, feed_dict = feed)

            # The context manager closes the result file even if decoding
            # or writing raises.
            with codecs.open("./test_results/test_result_"+str(idx)+".txt", encoding="utf-8", mode="w") as fp:
                # decode seq.
                for i in range(len(l1_sentences_encoded)):
                    fp.write('\n')
                    fp.write('{}.\n--------------------------------'.format(i+1))
                    fp.write('\n')
                    output_seq = [output_sequences[j][i] for j in range(output_seq_len)]

                    # decode output sequence
                    words = decode_output(output_seq)

                    fp.write('Input\t\t - ')
                    fp.write(l1_sentences[i])
                    fp.write('\n')

                    fp.write('Actual\t\t - ')
                    fp.write(l2_sentences[i])
                    fp.write('\n')

                    fp.write('Predicted\t\t - ')
                    # A separate loop name keeps the sentence index `i` intact.
                    for word in words:
                        if word not in ['<eos>', '<pad>', '<go>']:
                            fp.write(word+" ")
                    fp.write('\n--------------------------------')
                    fp.write('\n')

            # Report only after the batch has actually been written.
            print("Batch - ",idx, " Processed")
print("DONE")

Batch -  1  Processed
Batch -  2  Processed
Batch -  3  Processed
Batch -  4  Processed
Batch -  5  Processed
Batch -  6  Processed
Batch -  7  Processed
Batch -  8  Processed
Batch -  9  Processed
Batch -  10  Processed
DONE
