In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('Hindi_English_Truncated_Corpus.csv')

In [3]:
df['hindi_sentence'][6]

'इसमें तुमसे पूर्व गुज़रे हुए लोगों के हालात हैं।'

In [4]:
df.isna().any()

source              False
english_sentence     True
hindi_sentence      False
dtype: bool

In [5]:
df = df.dropna()

In [6]:
df.isna().any()

source              False
english_sentence    False
hindi_sentence      False
dtype: bool

In [7]:
def calculate_word_ratio(df, english_column, hindi_column):
    """Function to calculate the ratio between the english and hindi sentences"""
    english_words = df[english_column].apply(lambda x: len(x.split()))
    hindi_words = df[hindi_column].apply(lambda x: len(x.split()))

    return english_words.mean() / hindi_words.mean()

calculate_word_ratio(df, 'english_sentence', 'hindi_sentence')

0.8996152071804627

In [8]:
df['english_count'] = df['english_sentence'].apply(lambda x: len(x.split()))
df['hindi_count'] = df['hindi_sentence'].apply(lambda x: len(x.split()))

print("Average English Sequence Length:", df['english_count'].mean())
print("Average Hindi Sequence Length:", df['hindi_count'].mean())

Average English Sequence Length: 16.091814584068022
Average Hindi Sequence Length: 17.887441714666352


In [9]:
def create_buckets(df, seq_length, tolerance_ratio=1.2):
    """
    Create buckets of sequences of constant size for machine translation.

    Args:
        df (pd.DataFrame): Input dataframe with columns 'english_sentence' and 'hindi_sentence'.
        seq_length (int): Desired sequence length for each bucket.
        tolerance_ratio (float): Tolerance ratio for the difference in sequence length between English and Hindi.

    Returns:
        pd.DataFrame: Modified dataframe with buckets of the specified sequence length.
    """
    english_buckets = []
    hindi_buckets = []

    # Iterate over the dataframe rows
    for idx, row in df.iterrows():
        english_tokens = row['english_sentence'].split()
        hindi_tokens = row['hindi_sentence'].split()

        english_len = len(english_tokens)
        hindi_len = len(hindi_tokens)

        i = 0
        while i < max(english_len, hindi_len):
            english_bucket = english_tokens[i:i+seq_length]
            hindi_bucket = hindi_tokens[i:i+int(seq_length * tolerance_ratio)]

            # Append the bucket to the list
            english_buckets.append(' '.join(english_bucket))
            hindi_buckets.append(' '.join(hindi_bucket))

            # Move to the next bucket
            i += seq_length

    # Create the new dataframe
    bucketed_df = pd.DataFrame({
        'english_sentence': english_buckets,
        'hindi_sentence': hindi_buckets
    })

    return bucketed_df

df = create_buckets(df, 18)
df

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
...,...,...
196207,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
196208,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
196209,", etc . sulphates were limited , and the produ...",सल्फेट आदि की आवश्यकता सीमित थी और युद्धोपरांत...
196210,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [10]:
df['english_sentence'][10]

'effect on human being.'

In [11]:
import re

def preprocess_english(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'\d+', '', sentence)
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
    sentence = sentence.strip()
    return sentence

def preprocess_hindi(sentence):
    """Add start and end tokens"""
    sentence = '<start> ' + sentence + ' <end>'
    return sentence


df['english_sentence'] = df['english_sentence'].apply(preprocess_english)
df['hindi_sentence'] = df['hindi_sentence'].apply(preprocess_hindi)

In [12]:
df

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,<start> राजनीतिज्ञों के पास जो कार्य करना चाहि...
1,id like to tell you about one such child,<start> मई आपको ऐसे ही एक बच्चे के बारे में बत...
2,this percentage is even greater than the perce...,<start> यह प्रतिशत भारत में हिन्दुओं प्रतिशत स...
3,what we really mean is that theyre bad at not ...,<start> हम ये नहीं कहना चाहते कि वो ध्यान नहीं...
4,the ending portion of these vedas is called up...,<start> इन्हीं वेदों का अंतिम भाग उपनिषद कहलात...
...,...,...
196207,and put it in our cheeks,<start> और अपने गालों में डाल लेते हैं। <end>
196208,as for the other derivatives of sulphur the c...,<start> जहां तक गंधक के अन्य उत्पादों का प्रश्...
196209,etc sulphates were limited and the productio...,<start> सल्फेट आदि की आवश्यकता सीमित थी और युद...
196210,its complicated functioning is defined thus in...,<start> Zरचना-प्रकिया को उसने एक पहेली में यों...


In [14]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

english_sentences = df['english_sentence'].tolist()
hindi_sentences = df['hindi_sentence'].tolist()

In [16]:

# Generating Vocabulary for English and Hindi tokens

english_vocab_mapping = dict()
hindi_vocab_mapping = dict()

def generate_vocab_map(d, sentences):
    index = 1
    for sentence in sentences:
        for word in sentence.split():
            if word not in d:
                d[word] = index
                index += 1

generate_vocab_map(english_vocab_mapping, english_sentences)
generate_vocab_map(hindi_vocab_mapping, hindi_sentences)

In [17]:
english_tokens = []
hindi_tokens = []

# Generating Sequence of integer tokens

def generate_tokens(tokens_list, sentences, vocab_map):
    for sentence in sentences:
        sentence_tokens = []
        for word in sentence.split():
            if word in vocab_map:
                sentence_tokens.append(vocab_map[word])

        tokens_list.append(sentence_tokens)

generate_tokens(english_tokens, english_sentences, english_vocab_mapping)
generate_tokens(hindi_tokens, hindi_sentences, hindi_vocab_mapping)

In [18]:
def convert_int_to_text(int_sequences, vocab_map):
    """ Function to convert tokens back to text """
    # Create a reverse mapping from index to word
    reverse_vocab_map = {index: word for word, index in vocab_map.items()}

    text_sequences = []
    for int_sequence in int_sequences:
        text_sequence = []
        for token in int_sequence:
            if token in reverse_vocab_map:
                text_sequence.append(reverse_vocab_map[token])

        text_sequences.append(' '.join(text_sequence))

    return text_sequences

int_to_text = convert_int_to_text(english_tokens, english_vocab_mapping)
int_to_text

['politicians do not have permission to do what needs to be done',
 'id like to tell you about one such child',
 'this percentage is even greater than the percentage in india',
 'what we really mean is that theyre bad at not paying attention',
 'the ending portion of these vedas is called upanishad',
 'the then governor of kashmir resisted transfer but was finally reduced to subjection with the aid of',
 'british',
 'in this lies the circumstances of people before you',
 'and who are we to say even that they are wrong',
 'global warming refer to warming caused in recent decades and probability of its continual presence and its indirect',
 'effect on human being',
 'you may want your child to go to a school that is not run by the lea',
 'a nonmaintained special school or an independent school that can meet your child s needs',
 '',
 'please ensure that you use the appropriate form',
 'category religious text',
 'this period summarily is pepped up with devotion',
 'so there is some sort 

In [19]:
sequence_length = 18

def generate_padding_tokens(tokens_list, sequence_length):
    """Function to pad with zero to maintain sequence length"""
    padded_tokens_list = []
    for tokens in tokens_list:
        if len(tokens) < sequence_length:
            tokens = tokens + [0] * (sequence_length - len(tokens))
        else:
            tokens = tokens[:sequence_length]

        padded_tokens_list.append(tokens)

    return padded_tokens_list

padded_english_tokens = generate_padding_tokens(english_tokens, sequence_length)
padded_hindi_tokens = generate_padding_tokens(hindi_tokens, sequence_length)

In [20]:
encoder_inputs = np.array(padded_english_tokens)
decoder_inputs = np.array(padded_hindi_tokens)

In [21]:
def generate_decoder_targets(padded_tokens_list, sequence_length):
    decoder_targets = []
    for tokens in padded_tokens_list:
        if len(tokens) < sequence_length:
            shifted_tokens = tokens[1:] + [0]
        else:
            shifted_tokens = tokens[1:sequence_length] + [0]
        decoder_targets.append(shifted_tokens)
    return decoder_targets

decoder_targets = generate_decoder_targets(padded_hindi_tokens, sequence_length)

In [22]:
decoder_targets = np.array(decoder_targets)

In [23]:
decoder_targets

array([[    2,     3,     4, ...,     0,     0,     0],
       [   17,    18,    19, ...,     0,     0,     0],
       [   27,    28,    29, ...,     0,     0,     0],
       ...,
       [10399,   739,    61, ...,    15,    16,     0],
       [93920,   142,  1153, ...,     0,     0,     0],
       [   88,    20,    24, ...,     0,     0,     0]])

In [24]:
decoder_inputs[0]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  0,
        0])

In [25]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention
from tensorflow.keras.models import Model
import tensorflow as tf

sequence_length = 18
embedding_dim = 64
latent_dim = 128

encoder_input = Input(shape=(sequence_length,), name = 'encoder_inputs')
encoder_embedding = Embedding(input_dim = len(english_vocab_mapping) + 1, output_dim = embedding_dim, input_length = sequence_length, trainable = True)(encoder_input)
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state = True, return_sequences=True, name ='encoder_lstm')(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_input = Input(shape = (sequence_length,), name = 'decoder_inputs')
decoder_embedding = Embedding(input_dim = len(hindi_vocab_mapping) + 1,
                              output_dim = embedding_dim,
                              input_length = sequence_length,
                              trainable = True)(decoder_input)


decoder_lstm = LSTM(latent_dim, return_sequences = True, return_state=True, name = 'decoder_lstm')
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state = encoder_states)

attention = Attention(name = 'attention_layer')
attention_out = attention([decoder_lstm_outputs, encoder_lstm])

decoder_concat_input = tf.concat([decoder_lstm_outputs, attention_out], axis = -1)

decoder_dense = Dense(len(hindi_vocab_mapping) + 1, activation = 'softmax', name = 'decoder_dense')
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_input, decoder_input], decoder_outputs)

model.compile(optimizer = 'adam', loss='sparse_categorical_crossentropy', metrics = ['accuracy'])

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, 18)]                 0         []                            
 )                                                                                                
                                                                                                  
 decoder_inputs (InputLayer  [(None, 18)]                 0         []                            
 )                                                                                                
                                                                                                  
 embedding (Embedding)       (None, 18, 64)               4502592   ['encoder_inputs[0][0]']      
                                                                                              

In [None]:
model.fit([encoder_inputs, decoder_inputs], decoder_targets, batch_size = 16, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ccf74c68670>

In [30]:
def translate(source_text, sequence_length = 18):
    int_tokens = []
    padded_tokens = []
    for word in source_text.split():
        int_tokens.append(english_vocab_mapping[word.lower()])
    if len(int_tokens) < sequence_length:
        int_tokens = int_tokens + [0] * (sequence_length - len(int_tokens))
    else:
        int_tokens = int_tokens[:sequence_length]

    encoder_inputs = np.array([int_tokens])
    decoder_input_sequence = np.zeros((1, sequence_length))
    decoder_input_sequence[0, 0] = hindi_vocab_mapping['<start>']

    translated_tokens = []

    for i in range(1, sequence_length):
        predictions = model.predict([encoder_inputs, decoder_input_sequence])
        predicted_index = np.argmax(predictions[0, i - 1, :])
        translated_tokens.append(predicted_index)
        if predicted_index == hindi_vocab_mapping['<end>']:
            break

        decoder_input_sequence[0, i] = predicted_index
    return ''.join(convert_int_to_text([translated_tokens], hindi_vocab_mapping))

translate("what is th purpose of life")



'जीवन का लक्ष्य क्या है? <end>'

In [31]:
while True:
    input_text = input("Enter source text:")
    translation = translate(input_text)
    print(translation)
    if input_text == 'q':
        break

Enter source text:what is happening
क्या हो रहा है ? <end>
Enter source text:how are you doing
क्या आप कैसे कर सकते हैं?” <end>
Enter source text:this is great
यह बहुत ही महत्वपूर्ण है. <end>
Enter source text:you are nice
आप को एक अच्छा परिप्रेक्ष्य दिया. <end>
Enter source text:what is the purpose of life
जीवन की क्या है <end>
Enter source text:do you know that english is a popular language in the world
क्या आप जानते हैं कि दुनिया में सबसे बड़ा सबसे बड़ा लोकतंत्र है। <end>
Enter source text:english is good
अंग्रेजी में <end>


KeyboardInterrupt: Interrupted by user

In [38]:
model.save('/content/drive/MyDrive/NLP/LSTM_Translator.keras')