In [8]:
import pickle
from collections import Counter
import codecs

In [9]:
def read_sentences(filepath):
    """Return every line of a UTF-8 text file, lower-cased.

    Lines come back in file order. Nothing is stripped, so each entry keeps
    whatever line terminator it had in the file.

    Args:
        filepath: Path of the text file to read.

    Returns:
        list: One lower-cased string per line of the file.
    """
    with codecs.open(filepath, encoding="utf-8", mode="r") as handle:
        return [line.lower() for line in handle]

In [10]:
def read_sentences_format2(filepath):
    """Read a tab-separated parallel corpus and return two of its columns.

    Every line is split on tab characters. The fourth field (index 3) goes
    into the first returned list and the fifth field (index 4) into the
    second; both are stripped of surrounding whitespace and lower-cased.

    Lines with fewer than five tab-separated fields (for example a blank
    trailing line) carry no usable sentence pair and are skipped rather than
    raising ``IndexError``, so the two lists always stay aligned
    index-for-index.

    Args:
        filepath: Path of the UTF-8 encoded, tab-separated corpus file.

    Returns:
        tuple: ``(X, Y)`` where ``X[i]`` and ``Y[i]`` come from the same line.
    """
    X = []
    Y = []
    with codecs.open(filepath, encoding="utf-8", mode="r") as fp:
        for line in fp:
            fields = line.split("\t")
            if len(fields) < 5:
                # Malformed or blank line: columns 3 and 4 are not both present.
                continue
            X.append(fields[3].strip().lower())
            Y.append(fields[4].strip().lower())
    return X, Y

In [11]:
# Default characters stripped from both ends of every token. The first set is
# the one previously hard-coded for l1 (described in the original notes as the
# Hindi set), the second the one hard-coded for l2 (the English set).
_L1_STRIP_CHARS = ',." ;:)(|][?!<>'
_L2_STRIP_CHARS = ',." ;:)(][?!-\''


def _build_vocab(sentences, strip_chars, max_size):
    """Return up to ``max_size`` distinct tokens, most frequent first."""
    counts = Counter(word.strip(strip_chars)
                     for sentence in sentences
                     for word in sentence.split())
    # sorted() is stable (also with reverse=True), so equally frequent tokens
    # keep the order in which they were first seen.
    return sorted(counts, key=counts.get, reverse=True)[:max_size]


def _encode(sentences, word2idx, strip_chars):
    """Replace each whitespace-separated token by its index (0 when unknown)."""
    return [[word2idx.get(word.strip(strip_chars), 0) for word in sentence.split()]
            for sentence in sentences]


def create_dataset(l1_sentences, l2_sentences,
                   l1_strip_chars=_L1_STRIP_CHARS, l2_strip_chars=_L2_STRIP_CHARS,
                   max_vocab_size=30000, max_sentence_len=20,
                   max_len_diff_ratio=0.3):
    """Build vocabularies for two sentence lists and index-encode sentence pairs.

    Sentences are split on whitespace and each token has ``*_strip_chars``
    removed from both ends. The most frequent tokens of each side form its
    vocabulary. Sentences are then encoded as lists of vocabulary indices and
    paired by position; a pair is kept only if both sides are short enough and
    of similar length.

    Reserved indices:
        l1: ``<ukn>`` = 0, ``<pad>`` = 1; vocabulary words start at 2.
        l2: ``<ukn>`` = 0, ``<go>`` = 1, ``<eos>`` = 2, ``<pad>`` = 3;
            vocabulary words start at 4.
    The ``<go>``/``<eos>``/``<pad>`` markers are only registered in the
    dictionaries; this function does not insert them into the encoded
    sentences.

    NOTE(review): a token made up solely of strip characters (e.g. a lone
    ".") becomes the empty string and is counted and encoded like any other
    word. This matches the previous behaviour; confirm it is intended.

    Args:
        l1_sentences: List of strings for the first language.
        l2_sentences: List of strings for the second language.
        l1_strip_chars: Characters stripped from the ends of l1 tokens.
        l2_strip_chars: Characters stripped from the ends of l2 tokens.
            To swap the roles of the two languages, pass the two strip sets
            in the opposite order instead of editing this function.
        max_vocab_size: Maximum number of words kept per vocabulary
            (``None`` keeps all of them).
        max_sentence_len: Pairs with more tokens than this on either side
            are dropped.
        max_len_diff_ratio: A pair is kept only if the absolute difference in
            token counts is strictly less than this ratio times the shorter
            side's token count.

    Returns:
        tuple: ``(X, Y, l1_word2idx, l1_idx2word, l1_vocab,
        l2_word2idx, l2_idx2word, l2_vocab)`` where ``X`` and ``Y`` are the
        retained encoded sentences, aligned index-for-index.
    """
    l1_vocab = _build_vocab(l1_sentences, l1_strip_chars, max_vocab_size)
    l2_vocab = _build_vocab(l2_sentences, l2_strip_chars, max_vocab_size)

    # l1 word <-> index maps; indices 0 and 1 are reserved for special tokens.
    l1_word2idx = dict((word, idx + 2) for idx, word in enumerate(l1_vocab))
    l1_word2idx['<ukn>'] = 0  # Unknown words
    l1_word2idx['<pad>'] = 1  # Padding word
    l1_idx2word = dict((idx, word) for word, idx in l1_word2idx.items())

    # l2 word <-> index maps; indices 0-3 are reserved for special tokens.
    l2_word2idx = dict((word, idx + 4) for idx, word in enumerate(l2_vocab))
    l2_word2idx['<ukn>'] = 0  # Unknown
    l2_word2idx['<go>'] = 1
    l2_word2idx['<eos>'] = 2  # End of sentence
    l2_word2idx['<pad>'] = 3  # Padding
    l2_idx2word = dict((idx, word) for word, idx in l2_word2idx.items())

    # Encode words in sentences by their index in the vocabulary.
    x = _encode(l1_sentences, l1_word2idx, l1_strip_chars)
    y = _encode(l2_sentences, l2_word2idx, l2_strip_chars)

    X = []
    Y = []
    # zip pairs sentences by position and stops at the shorter list, so a
    # length mismatch between the inputs cannot raise IndexError.
    for l1_ids, l2_ids in zip(x, y):
        n1 = len(l1_ids)
        n2 = len(l2_ids)
        similar_length = abs(n1 - n2) < max_len_diff_ratio * min(n1, n2)
        if similar_length and n1 <= max_sentence_len and n2 <= max_sentence_len:
            X.append(l1_ids)
            Y.append(l2_ids)

    return X, Y, l1_word2idx, l1_idx2word, l1_vocab, l2_word2idx, l2_idx2word, l2_vocab

In [12]:
def save_dataset(filepath, obj):
    """Pickle ``obj`` into ``filepath`` with the highest available protocol.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten.
    """
    with open(filepath, 'wb') as sink:
        pickle.dump(obj, sink, pickle.HIGHEST_PROTOCOL)

In [13]:
def read_dataset(filepath):
    """Load and return the object pickled in ``filepath``.

    Unpickling can execute arbitrary code, so only use this on files you
    trust (such as ones written by ``save_dataset``).
    """
    with open(filepath, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [14]:
def main():
    """Parse the corpus file, build the encoded dataset and pickle it.

    The corpus is read from "./hindencorp05.plaintext" and the result of
    ``create_dataset`` is written to "./data.p".
    """
    corpus_path = "./hindencorp05.plaintext"
    output_path = "./data.p"

    # The first returned list holds the English sentences, the second the
    # Hindi ones.
    english, hindi = read_sentences_format2(corpus_path)

    # Hindi is passed as l1 and English as l2. For English to Hindi, pass
    # the two lists in the opposite order.
    dataset = create_dataset(hindi, english)
    save_dataset(output_path, dataset)


# Run the pipeline when this code is executed as the main module.
if __name__ == '__main__':
    main()

In [25]:
# Sanity check: show the first two entries of each parsed column.
X, Y = read_sentences_format2("./hindencorp05.plaintext")
print(X[:2], Y[:2], sep="\n")

['sharaabi', 'politicians do not have permission to do what needs to be done.']
['शराबी', 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .']
