### Import the required libraries

In [99]:
import numpy as np
import pandas as pd
import os
import io
import pickle
import copy
import tensorflow as tf

### Exploring the dataset (creating dataframe using pandas)

https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0023-6260-A/README.txt?sequence=1&isAllowed=y

The details of the dataset (the knowledge of the values present in the dataset) were found from this link. 

In [26]:
df = pd.read_csv('Dataset RNN/hindencorp05.plaintext', sep = '\t', names = ['source', 'alignment', 'alignment_type', 'english', 'hindi'])

In [27]:
df.head()

Unnamed: 0,source,alignment,alignment_type,english,hindi
0,wikiner2013inflected,1-1,1.000,Sharaabi,शराबी
1,ted,1-1,1.0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
2,ted,1-1,1.0,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,indic2012,1-1,manual,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
4,quote-name,1-1,1.0,- John Collins,- जॉन कॉलिन्स


### Create English to Hindi Translation table from the dataset

In [28]:
translations = df[['english', 'hindi']]
translations.head()

Unnamed: 0,english,hindi
0,Sharaabi,शराबी
1,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
2,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
4,- John Collins,- जॉन कॉलिन्स


In [29]:
no_of_samples = translations.shape[0]
print(no_of_samples)

273885


### Separate out different sources

In [32]:
sources_english = {} # data from different sources as dictionary
targets_hindi = {} # corresponding translation to hindi 

for i in range(0, no_of_samples):
    source = df['source'][i]
    if source in sources_english:
        sources_english[source].append(df['english'][i])
        targets_hindi[source].append(df['hindi'][i])
    else:
        sources_english[source] = []
        targets_hindi[source] = []

In [53]:
for source in sources_english:
    print(source + ": ", len(sources_english[source]))
    en_file = open('Dataset RNN/en_' + source, 'w', encoding = 'utf-8')
    hi_file = open('Dataset RNN/hi_' + source, 'w', encoding = 'utf-8')
    for i in range(0, len(sources_english[source])):
        en_file.write(str(sources_english[source][i]) + '\n')
        hi_file.write(str(targets_hindi[source][i]) + '\n')
    en_file.close()
    hi_file.close()

wikiner2013inflected:  24562
ted:  39880
indic2012:  37725
quote-name:  908
launchpad:  66730
agro-hunaligned:  293
wikiner2013:  20573
tides:  49999
danielpipes:  6590
intercorp:  7495
words-word:  2843
wikiner2011:  852
emille:  8970
acl2005:  3440
words-example:  1263
quote-sent:  1438
agro-exact:  307


### Preprocessing the data

### Text to Word Ids
For RNN, turn the text into a number. In the function **text_to_ids()**, turn **source_text** and **target_text** from words to ids.

Need to add the <EOS> word id at the end of each sentence from **target_text**. This will help the neural network predict when the sentence should end.

Get word ids using **source_vocab_to_int** and **target_vocab_to_int**.

In [54]:
# source_text: String that contains all the source text.
# target_text: String that contains all the target text.
# source_vocab_to_int: Dictionary to go from the source words to an id
# target_vocab_to_int: Dictionary to go from the target words to an id
# The function returns a tuple of lists (source_id_text, target_id_text)

def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    
    sentences = source_text.split('\n')
    source_id_text = [[source_vocab_to_int[word] for word in line.split()] for line in sentences]
    
    sentences = target_text.split('\n')
    target_id_text = [[target_vocab_to_int[word] for word in line.split()]+[target_vocab_to_int['<EOS>']] for line in sentences]

    return source_id_text, target_id_text

In [93]:
# function to load dataset from a file
def load_data(path):
    input_file = os.path.join(path)
    with io.open(input_file, 'r', encoding='utf-8') as f:
        data = f.read()
    return data

In [101]:
# give codes to padding, end of sentences, unknown, and start of sentence
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

In [100]:
# function to create lookup tables for dictionary
def create_lookup_tables(text):
    
    # create a set of words by splitting through spaces
    vocab = set(text.split())
    
    # copy the pre-existing codes (shallow copy)
    vocab_to_int = copy.copy(CODES)

    # starting from the length of codes, assign the numebers to words
    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i

    # reverse mapping from integers to the words
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

In [102]:
# function to preprocess the text data and save to a file
def preprocess_and_save_data(source_path, target_path, text_to_ids, savefilename):
    # Preprocess
    # load the data from source and target files
    source_text = load_data(source_path)
    target_text = load_data(target_path)
    
    # convert text to lower cases
    source_text = source_text.lower()
    target_text = target_text.lower()

    # create lookup tables for source and target
    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text)
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text)

    # convert English sentences and corresponding target sentences into their integer ids using table created above
    source_text, target_text = text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int)
    
    # Save Data
    pickle.dump((
        (source_text, target_text),
        (source_vocab_to_int, target_vocab_to_int),
        (source_int_to_vocab, target_int_to_vocab)), open(savefilename, 'wb'))

In [129]:
# consider the source with largest no. of sentences, i.e. launchad
preprocess_and_save_data("Dataset RNN/en_words-example", "Dataset RNN/hi_words-example", text_to_ids, "Dataset RNN/preprocessed_data_small")

In [130]:
# Load the Preprocessed Training data and return 
def load_preprocess(savefilename):
    return pickle.load(open(savefilename, mode='rb'))

In [131]:
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = load_preprocess("Dataset RNN/preprocessed_data")

### Building the Recurrent Neural Network

Build the components necessary to build a Sequence-to-Sequence model by implementing the following functions below:

- model_inputs
- process_encoding_input
- encoding_layer
- decoding_layer_train
- decoding_layer_infer
- decoding_layer
- seq2seq_model

In [148]:
def model_inputs():
    """
    Create TF Placeholders for input, targets, and learning rate.
    :return: Tuple (input, targets, learning rate, keep probability)
    """
    # TODO: Implement Function
    input_data = tf.placeholder(tf.int32, [None, None], name="input")
    targets = tf.placeholder(tf.int32, [None, None])
    lr = tf.placeholder(tf.float32)
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    return input_data, targets, lr, keep_prob

In [149]:
def process_decoding_input(target_data, target_vocab_to_int, batch_size):
    """
    Preprocess target data for dencoding
    :param target_data: Target Placehoder
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param batch_size: Batch Size
    :return: Preprocessed target data
    """
    # TODO: Implement Function
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], target_vocab_to_int['<GO>']), ending], 1)

    return dec_input

In [150]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob):
    """
    Create encoding layer
    :param rnn_inputs: Inputs for the RNN
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param keep_prob: Dropout keep probability
    :return: RNN state
    """
    # TODO: Implement Function
    
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)

    drop = tf.contrib.rnn.DropoutWrapper(lstm , output_keep_prob = keep_prob)

    enc_cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)

    _, enc_state = tf.nn.dynamic_rnn(enc_cell, rnn_inputs, dtype=tf.float32)

    return enc_state


In [151]:
def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length, decoding_scope,
                         output_fn, keep_prob):
    """
    Create a decoding layer for training
    :param encoder_state: Encoder State
    :param dec_cell: Decoder RNN Cell
    :param dec_embed_input: Decoder embedded input
    :param sequence_length: Sequence Length
    :param decoding_scope: TenorFlow Variable Scope for decoding
    :param output_fn: Function to apply the output layer
    :param keep_prob: Dropout keep probability
    :return: Train Logits
    """
    # TODO: Implement Function
    train_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_train(encoder_state)
    
    drop = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob = keep_prob)
    
    train_pred, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(drop, train_decoder_fn, dec_embed_input, sequence_length, scope=decoding_scope)
    
    # Apply output function
    train_logits =  output_fn(train_pred)
    
    return train_logits


In [152]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, end_of_sequence_id,
                         maximum_length, vocab_size, decoding_scope, output_fn, keep_prob):
    """
    Create a decoding layer for inference
    :param encoder_state: Encoder state
    :param dec_cell: Decoder RNN Cell
    :param dec_embeddings: Decoder embeddings
    :param start_of_sequence_id: GO ID
    :param end_of_sequence_id: EOS Id
    :param maximum_length: Maximum length of 
    :param vocab_size: Size of vocabulary
    :param decoding_scope: TensorFlow Variable Scope for decoding
    :param output_fn: Function to apply the output layer
    :param keep_prob: Dropout keep probability
    :return: Inference Logits
    """
    # TODO: Implement Function
    infer_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_inference( \
        output_fn, encoder_state, dec_embeddings, start_of_sequence_id, end_of_sequence_id, \
        maximum_length - 1, vocab_size)
    inference_logits, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(dec_cell, infer_decoder_fn, scope=decoding_scope)
    return inference_logits

In [153]:
def decoding_layer(dec_embed_input, dec_embeddings, encoder_state, vocab_size, sequence_length, rnn_size,
                   num_layers, target_vocab_to_int, keep_prob):
    """
    Create decoding layer
    :param dec_embed_input: Decoder embedded input
    :param dec_embeddings: Decoder embeddings
    :param encoder_state: The encoded state
    :param vocab_size: Size of vocabulary
    :param sequence_length: Sequence Length
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param keep_prob: Dropout keep probability
    :return: Tuple of (Training Logits, Inference Logits)
    """
    # TODO: Implement Function

    # Decoder RNNs
    dec_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(rnn_size)] * num_layers)

    with tf.variable_scope("decoding") as decoding_scope:
        output_fn = lambda x: tf.contrib.layers.fully_connected(x, vocab_size, None, scope=decoding_scope)
        train_logits = decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length, \
                                            decoding_scope, output_fn, keep_prob)

    with tf.variable_scope("decoding", reuse=True) as decoding_scope:
        inference_logits = decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, \
                                                target_vocab_to_int['<GO>'], target_vocab_to_int['<EOS>'], \
                                                sequence_length, vocab_size, decoding_scope, output_fn, keep_prob)

    return train_logits, inference_logits

In [154]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size, sequence_length, source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size, rnn_size, num_layers, target_vocab_to_int):
    """
    Build the Sequence-to-Sequence part of the neural network
    :param input_data: Input placeholder
    :param target_data: Target placeholder
    :param keep_prob: Dropout keep probability placeholder
    :param batch_size: Batch Size
    :param sequence_length: Sequence Length
    :param source_vocab_size: Source vocabulary size
    :param target_vocab_size: Target vocabulary size
    :param enc_embedding_size: Decoder embedding size
    :param dec_embedding_size: Encoder embedding size
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :return: Tuple of (Training Logits, Inference Logits)
    """
    # TODO: Implement Function
    
    # Encoder embedding
    enc_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, enc_embedding_size)
    
    enc_state = encoding_layer(enc_embed_input, rnn_size, num_layers, keep_prob)
    
    dec_input = process_decoding_input(target_data, target_vocab_to_int, batch_size)
    
    # Decoder Embedding
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, dec_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    train_logits, inference_logits = decoding_layer(dec_embed_input, dec_embeddings, enc_state, target_vocab_size, sequence_length, rnn_size, \
                   num_layers, target_vocab_to_int, keep_prob)


    return train_logits, inference_logits

In [155]:
# Number of Epochs
epochs = 20
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 50
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 50
decoding_embedding_size = 50
# Learning Rate
learning_rate = 0.001
# Dropout Keep Probability
keep_probability = 0.7

In [156]:
print(len(source_vocab_to_int))
print(len(target_vocab_to_int))
max_target_sentence_length = max([len(sentence) for sentence in source_int_text])

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob = model_inputs()
    sequence_length = tf.placeholder_with_default(max_target_sentence_length, None, name='sequence_length')
    input_shape = tf.shape(input_data)
    
    train_logits, inference_logits = seq2seq_model(
        tf.reverse(input_data, [-1]), targets, keep_prob, batch_size, sequence_length, len(source_vocab_to_int), len(target_vocab_to_int),
        encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers, target_vocab_to_int)

    tf.identity(inference_logits, 'logits')
    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            train_logits,
            targets,
            tf.ones([input_shape[0], sequence_length]))

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

28044
35690


AttributeError: module 'tensorflow.contrib.seq2seq' has no attribute 'simple_decoder_fn_train'