In [14]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention
from tensorflow.keras.models import Model
import tensorflow as tf
import keras

df = pd.read_csv('/teamspace/studios/this_studio/Translator/data/Hindi_English_Truncated_Corpus (1).csv')


In [15]:
df = df.dropna()

In [16]:
def calculate_word_ratio(df, english_column, hindi_column):
    english_words = df[english_column].apply(lambda x: len(x.split()))
    hindi_words = df[hindi_column].apply(lambda x: len(x.split()))

    return english_words.mean() / hindi_words.mean()

calculate_word_ratio(df, 'english_sentence', 'hindi_sentence')

0.8996152071804627

In [17]:
df['english_count'] = df['english_sentence'].apply(lambda x: len(x.split()))
df['hindi_count'] = df['hindi_sentence'].apply(lambda x: len(x.split()))

print("Average English Sequence Length:", df['english_count'].mean())
print("Average Hindi Sequence Length:", df['hindi_count'].mean())

Average English Sequence Length: 16.091814584068022
Average Hindi Sequence Length: 17.887441714666352


In [18]:
def create_buckets(df, seq_length, tolerance_ratio=1.2):
    """
    Create buckets of sequences of constant size for machine translation.

    Args:
        df (pd.DataFrame): Input dataframe with columns 'english_sentence' and 'hindi_sentence'.
        seq_length (int): Desired sequence length for each bucket.
        tolerance_ratio (float): Tolerance ratio for the difference in sequence length between English and Hindi.

    Returns:
        pd.DataFrame: Modified dataframe with buckets of the specified sequence length.
    """
    english_buckets = []
    hindi_buckets = []

    # Iterate over the dataframe rows
    for idx, row in df.iterrows():
        english_tokens = row['english_sentence'].split()
        hindi_tokens = row['hindi_sentence'].split()

        english_len = len(english_tokens)
        hindi_len = len(hindi_tokens)

        i = 0
        while i < max(english_len, hindi_len):
            english_bucket = english_tokens[i:i+seq_length]
            hindi_bucket = hindi_tokens[i:i+int(seq_length * tolerance_ratio)]

            # Append the bucket to the list
            english_buckets.append(' '.join(english_bucket))
            hindi_buckets.append(' '.join(hindi_bucket))

            # Move to the next bucket
            i += seq_length

    # Create the new dataframe
    bucketed_df = pd.DataFrame({
        'english_sentence': english_buckets,
        'hindi_sentence': hindi_buckets
    })

    return bucketed_df

df = create_buckets(df, 18)
df

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
...,...,...
196207,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
196208,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
196209,", etc . sulphates were limited , and the produ...",सल्फेट आदि की आवश्यकता सीमित थी और युद्धोपरांत...
196210,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [19]:
import re

def preprocess_english(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'\d+', '', sentence)
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
    sentence = sentence.strip()
    return sentence

def preprocess_hindi(sentence):
    sentence = '<start> ' + sentence + ' <end>'
    return sentence


df['english_sentence'] = df['english_sentence'].apply(preprocess_english)
df['hindi_sentence'] = df['hindi_sentence'].apply(preprocess_hindi)

In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

english_sentences = df['english_sentence'].tolist()
hindi_sentences = df['hindi_sentence'].tolist()


In [21]:
english_vocab_mapping = dict()
hindi_vocab_mapping = dict()

def generate_vocab_map(d, sentences):
    index = 1
    for sentence in sentences:
        for word in sentence.split():
            if word not in d:
                d[word] = index
                index += 1

generate_vocab_map(english_vocab_mapping, english_sentences)
generate_vocab_map(hindi_vocab_mapping, hindi_sentences)

In [22]:
print("English vocabulary size:", len(english_vocab_mapping))
print("Hindi vocabulary size:", len(hindi_vocab_mapping))

English vocabulary size: 70352
Hindi vocabulary size: 93921


In [23]:
english_tokens = []
hindi_tokens = []

def generate_tokens(tokens_list, sentences, vocab_map):
    for sentence in sentences:
        sentence_tokens = []
        for word in sentence.split():
            if word in vocab_map:
                sentence_tokens.append(vocab_map[word])

        tokens_list.append(sentence_tokens)

generate_tokens(english_tokens, english_sentences, english_vocab_mapping)
generate_tokens(hindi_tokens, hindi_sentences, hindi_vocab_mapping)

In [24]:
def convert_int_to_text(int_sequences, vocab_map):
    # Create a reverse mapping from index to word
    reverse_vocab_map = {index: word for word, index in vocab_map.items()}

    text_sequences = []
    for int_sequence in int_sequences:
        text_sequence = []
        for token in int_sequence:
            if token in reverse_vocab_map:
                text_sequence.append(reverse_vocab_map[token])

        text_sequences.append(' '.join(text_sequence))

    return text_sequences

int_to_text = convert_int_to_text(english_tokens, english_vocab_mapping)

In [25]:
sequence_length = 18

def generate_padding_tokens(tokens_list, sequence_length):
    padded_tokens_list = []
    for tokens in tokens_list:
        if len(tokens) < sequence_length:
            tokens = tokens + [0] * (sequence_length - len(tokens))
        else:
            tokens = tokens[:sequence_length]

        padded_tokens_list.append(tokens)

    return padded_tokens_list

padded_english_tokens = generate_padding_tokens(english_tokens, sequence_length)
padded_hindi_tokens = generate_padding_tokens(hindi_tokens, sequence_length)

In [26]:
def generate_decoder_targets(padded_tokens_list, sequence_length):
    decoder_targets = []
    for tokens in padded_tokens_list:
        if len(tokens) < sequence_length:
            shifted_tokens = tokens[1:] + [0]
        else:
            shifted_tokens = tokens[1:sequence_length] + [0]
        decoder_targets.append(shifted_tokens)
    return decoder_targets

In [27]:
src = np.array(padded_english_tokens)
tgt = np.array(padded_hindi_tokens)
labels = np.array(generate_decoder_targets(padded_hindi_tokens, sequence_length))

In [28]:
src

array([[    1,     2,     3, ...,     0,     0,     0],
       [   11,    12,     6, ...,     0,     0,     0],
       [   19,    20,    21, ...,     0,     0,     0],
       ...,
       [ 2657, 70352,   119, ...,     0,     0,     0],
       [   74, 15678,  5136, ...,     0,     0,     0],
       [ 8045,  2171,  4055, ...,     0,     0,     0]])

In [29]:
tgt

array([[    1,     2,     3, ...,    16,     0,     0],
       [    1,    17,    18, ...,     0,     0,     0],
       [    1,    27,    28, ...,     0,     0,     0],
       ...,
       [    1, 10399,   739, ...,    57,    15,    16],
       [    1, 93920,   142, ...,     0,     0,     0],
       [    1,    88,    20, ...,    16,     0,     0]])

In [30]:
labels

array([[    2,     3,     4, ...,     0,     0,     0],
       [   17,    18,    19, ...,     0,     0,     0],
       [   27,    28,    29, ...,     0,     0,     0],
       ...,
       [10399,   739,    61, ...,    15,    16,     0],
       [93920,   142,  1153, ...,     0,     0,     0],
       [   88,    20,    24, ...,     0,     0,     0]])

In [32]:
np.savez('/teamspace/studios/this_studio/Translator/data/dataset.npz', src = src, tgt=tgt, labels = labels)

In [34]:
import pickle 


with open('/teamspace/studios/this_studio/Translator/data/src_vocab.pkl', 'wb') as f:
    pickle.dump(english_vocab_mapping, f)

with open('/teamspace/studios/this_studio/Translator/data/tgt_vocab.pkl', 'wb') as f:
    pickle.dump(hindi_vocab_mapping, f)