### Import the required libraries

In [99]:
import numpy as np
import pandas as pd
import os
import io
import pickle
import copy
import tensorflow as tf

### Exploring the dataset (creating dataframe using pandas)

https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0023-6260-A/README.txt?sequence=1&isAllowed=y

The details of the dataset (the knowledge of the values present in the dataset) were found from this link. 

In [26]:
df = pd.read_csv('Dataset RNN/hindencorp05.plaintext', sep = '\t', names = ['source', 'alignment', 'alignment_type', 'english', 'hindi'])

In [27]:
df.head()

Unnamed: 0,source,alignment,alignment_type,english,hindi
0,wikiner2013inflected,1-1,1.000,Sharaabi,शराबी
1,ted,1-1,1.0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
2,ted,1-1,1.0,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,indic2012,1-1,manual,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
4,quote-name,1-1,1.0,- John Collins,- जॉन कॉलिन्स


### Create English to Hindi Translation table from the dataset

In [28]:
translations = df[['english', 'hindi']]
translations.head()

Unnamed: 0,english,hindi
0,Sharaabi,शराबी
1,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
2,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
4,- John Collins,- जॉन कॉलिन्स


In [29]:
no_of_samples = translations.shape[0]
print(no_of_samples)

273885


### Separate out different sources

In [32]:
sources_english = {} # data from different sources as dictionary
targets_hindi = {} # corresponding translation to hindi 

for i in range(0, no_of_samples):
    source = df['source'][i]
    if source in sources_english:
        sources_english[source].append(df['english'][i])
        targets_hindi[source].append(df['hindi'][i])
    else:
        sources_english[source] = []
        targets_hindi[source] = []

In [53]:
for source in sources_english:
    print(source + ": ", len(sources_english[source]))
    en_file = open('Dataset RNN/en_' + source, 'w', encoding = 'utf-8')
    hi_file = open('Dataset RNN/hi_' + source, 'w', encoding = 'utf-8')
    for i in range(0, len(sources_english[source])):
        en_file.write(str(sources_english[source][i]) + '\n')
        hi_file.write(str(targets_hindi[source][i]) + '\n')
    en_file.close()
    hi_file.close()

wikiner2013inflected:  24562
ted:  39880
indic2012:  37725
quote-name:  908
launchpad:  66730
agro-hunaligned:  293
wikiner2013:  20573
tides:  49999
danielpipes:  6590
intercorp:  7495
words-word:  2843
wikiner2011:  852
emille:  8970
acl2005:  3440
words-example:  1263
quote-sent:  1438
agro-exact:  307


### Preprocessing the data

### Text to Word Ids
For RNN, turn the text into a number. In the function **text_to_ids()**, turn **source_text** and **target_text** from words to ids.

Need to add the <EOS> word id at the end of each sentence from **target_text**. This will help the neural network predict when the sentence should end.

Get word ids using **source_vocab_to_int** and **target_vocab_to_int**.

In [54]:
# source_text: String that contains all the source text.
# target_text: String that contains all the target text.
# source_vocab_to_int: Dictionary to go from the source words to an id
# target_vocab_to_int: Dictionary to go from the target words to an id
# The function returns a tuple of lists (source_id_text, target_id_text)

def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    
    sentences = source_text.split('\n')
    source_id_text = [[source_vocab_to_int[word] for word in line.split()] for line in sentences]
    
    sentences = target_text.split('\n')
    target_id_text = [[target_vocab_to_int[word] for word in line.split()]+[target_vocab_to_int['<EOS>']] for line in sentences]

    return source_id_text, target_id_text

In [93]:
# function to load dataset from a file
def load_data(path):
    input_file = os.path.join(path)
    with io.open(input_file, 'r', encoding='utf-8') as f:
        data = f.read()
    return data

In [101]:
# give codes to padding, end of sentences, unknown, and start of sentence
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

In [100]:
# function to create lookup tables for dictionary
def create_lookup_tables(text):
    
    # create a set of words by splitting through spaces
    vocab = set(text.split())
    
    # copy the pre-existing codes (shallow copy)
    vocab_to_int = copy.copy(CODES)

    # starting from the length of codes, assign the numebers to words
    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i

    # reverse mapping from integers to the words
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

In [102]:
# function to preprocess the text data and save to a file
def preprocess_and_save_data(source_path, target_path, text_to_ids, savefilename):
    # Preprocess
    # load the data from source and target files
    source_text = load_data(source_path)
    target_text = load_data(target_path)
    
    # convert text to lower cases
    source_text = source_text.lower()
    target_text = target_text.lower()

    # create lookup tables for source and target
    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text)
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text)

    # convert English sentences and corresponding target sentences into their integer ids using table created above
    source_text, target_text = text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int)
    
    # Save Data
    pickle.dump((
        (source_text, target_text),
        (source_vocab_to_int, target_vocab_to_int),
        (source_int_to_vocab, target_int_to_vocab)), open(savefilename, 'wb'))

In [103]:
# consider the source with largest no. of sentences, i.e. launchad
preprocess_and_save_data("Dataset RNN/en_launchpad", "Dataset RNN/hi_launchpad", text_to_ids, "Dataset RNN/preprocessed_data")

In [104]:
# Load the Preprocessed Training data and return 
def load_preprocess(savefilename):
    return pickle.load(open(savefilename, mode='rb'))

In [105]:
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = load_preprocess("Dataset RNN/preprocessed_data")