# Language Translation with Machine Learning

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model




In [2]:
# Read the CSV file into a DataFrame
lines = pd.read_csv("Hindi_English_Truncated_Corpus.csv", encoding='utf-8')

# Print the first few rows of the DataFrame
print(lines.head())

      source                                   english_sentence  \
0        ted  politicians do not have permission to do what ...   
1        ted         I'd like to tell you about one such child,   
2  indic2012  This percentage is even greater than the perce...   
3        ted  what we really mean is that they're bad at not...   
4  indic2012  .The ending portion of these Vedas is called U...   

                                      hindi_sentence  
0  राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर...  
1  मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...  
2   यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।  
3     हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते  
4        इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।  


In [3]:
# Filter the dataset to include only rows where the source is 'ted'
lines = lines[lines['source'] == 'ted']

# Remove rows with null English sentences
lines = lines[~pd.isnull(lines['english_sentence'])]

# Drop duplicate rows
lines.drop_duplicates(inplace=True)

# Randomly sample 25,000 rows from the dataset
lines = lines.sample(n=25000, random_state=42)

# Check the shape of the resulting dataset
lines.shape

(25000, 3)

In [4]:
# Convert English sentences to lowercase
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.lower())

# Convert Hindi sentences to lowercase
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: x.lower())

In [5]:
# Remove single quotes from English and Hindi sentences
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [6]:
# Set of all special characters
exclude = set(string.punctuation)

# Remove all the special characters from English and Hindi sentences
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [7]:
# Remove digits from English and Hindi sentences
remove_digits = str.maketrans('', '', digits)
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

# Remove specific Hindi characters
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces and add special tokens
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x.strip()))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x.strip()))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: 'START_ ' + x + ' _END')

In [8]:
# Get English and Hindi vocabularies
all_eng_words = set(word for eng in lines['english_sentence'] for word in eng.split())
all_hindi_words = set(word for hin in lines['hindi_sentence'] for word in hin.split())

# Calculate the length of each English and Hindi sentence
lines['length_eng_sentence'] = lines['english_sentence'].apply(lambda x: len(x.split()))
lines['length_hin_sentence'] = lines['hindi_sentence'].apply(lambda x: len(x.split()))

In [9]:
# Filter out sentences longer than 20 words
lines = lines[lines['length_eng_sentence'] <= 20]
lines = lines[lines['length_hin_sentence'] <= 20]

# Get the maximum length of source and target sentences
max_length_src = max(lines['length_hin_sentence'])
max_length_tar = max(lines['length_eng_sentence'])

# Get sorted lists of unique words for both English and Hindi
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))

# Get the number of tokens for both encoder and decoder
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)

# Increment decoder tokens for zero padding
num_decoder_tokens += 1  

# Create token indexes for both languages
input_token_index = dict([(word, i + 1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i + 1) for i, word in enumerate(target_words)])

# Create reverse indexes for tokenization
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

# Shuffle the dataset
lines = shuffle(lines)

## Training Model to Translate English to Hindi

In [10]:
X, y = lines['english_sentence'], lines['hindi_sentence']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the training and testing sets as pickle files
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

In [11]:
def generate_batch(X, y, batch_size):
    '''
    Generate a batch of data
    Args:
    - X: Input data
    - y: Target data
    - batch_size: Size of the batch
    '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word]  # Encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t < len(target_text.split()) - 1:
                        decoder_input_data[i, t] = target_token_index[word]  # Decoder input seq
                    if t > 0:
                        # Decoder target sequence (one hot encoded)
                        # Does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield ([encoder_input_data, decoder_input_data], decoder_target_data)

# Define model parameters
latent_dim = 300
batch_size = 128

# Define encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]  # Keep the states

# Define decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

# Train parameters
train_samples = len(X_train)
val_samples = len(X_test)
epochs = 100



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 300)            4209000   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 300)            5262300   ['input_2[0][0]']             
                                                                                            

In [None]:
# Train the model using the generator for both training and validation data
model.fit_generator(generator=generate_batch(X_train, y_train, batch_size=batch_size),
                    steps_per_epoch=train_samples//batch_size,
                    epochs=epochs,
                    validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
                    validation_steps=val_samples//batch_size)

# Save the trained model weights
model.save_weights('nmt_weights.h5')

In [14]:
import tensorflow as tf

# Check for and handle GPU memory growth
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Enable GPU memory growth to prevent allocation errors
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as e:
        print(e)

# Train the model with error handling
try:
    model.fit_generator(generator=generate_batch(X_train, y_train, batch_size=batch_size),
                        steps_per_epoch=train_samples//batch_size,
                        epochs=epochs,
                        validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
                        validation_steps=val_samples//batch_size)
    # Save the trained model weights
    model.save_weights('nmt_weights.h5')
except tf.errors.InvalidArgumentError as e:
    # Handle the InvalidArgumentError
    print("InvalidArgumentError:", e)

  model.fit_generator(generator=generate_batch(X_train, y_train, batch_size=batch_size),


Epoch 1/100
  3/154 [..............................] - ETA: 4:30 - loss: 9.7518InvalidArgumentError: Graph execution error:

Detected at node model/embedding/embedding_lookup defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\USER\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "C:\Users\USER\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance

  File "C:\Users\USER\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 736, in start

  File "C:\Users\USER\anaconda3\Lib\site-packages\tornado\platform\asyncio.py", line 195, in start

  File "C:\Users\USER\anaconda3\Lib\asyncio\base_events.py", line 607, in run_forever

  File "C:\Users\USER\anaconda3\Lib\asyncio\base_events.py", line 1922, in _run_once

  File "C:\Users\USER\anaconda3\Lib\asyncio\events.py", line 80, in _run

  File "C:\Users\USER\anaconda

In [15]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)  

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start character
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1)
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' ' + sampled_char

        # Exit condition: either hit max length or find stop character
        if (sampled_char == '_END' or len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

# Initialize training data generator
train_gen = generate_batch(X_train, y_train, batch_size=1)

k = -1  # Assuming this is used as a counter/index variable

In [16]:
k += 1  # Increment k
(input_seq, actual_output), _ = next(train_gen)  # Generate a batch of data
decoded_sentence = decode_sequence(input_seq)  # Decode the input sequence

# Print the input English sentence, actual Hindi translation, and predicted Hindi translation
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])  # Exclude 'START_' and '_END' tokens
print('Predicted Hindi Translation:', decoded_sentence[:-4])  # Exclude '_END' token

Input English sentence: another aha moment
Actual Hindi Translation:  एक और अहा क्षण 
Predicted Hindi Translation:  
