# **Group 6: Language Translation using Seq2Seq**

In [1]:
import numpy as np
import pandas as pd

import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

In [2]:
lines=pd.read_csv("../input/newest11/language_data.csv",encoding='utf-8')
lines.head()

Unnamed: 0,English,Marathi
0,Go.,जा.
1,Run!,पळ!
2,Run!,धाव!
3,Run!,पळा!
4,Run!,धावा!


In [3]:
lines.drop_duplicates(inplace=True)

lines=lines.sample(n=25000,random_state=42)
lines.shape
lines.head()

Unnamed: 0,English,Marathi
17968,What a beautiful night!,काय सुंदर रात्र होती!
5034,Where's the map?,नकाशा कुठेय?
1697,Unbelievable!,काहीही काय!
17538,The classroom is empty.,वर्ग रिकामा आहे.
37087,Those who live by the sword die by the sword.,जे तलवारीने जगतात ते तलवारीनेच मरतात.


In [4]:
lines.English=lines.English.apply(lambda x: x.lower())
lines.Marathi=lines.Marathi.apply(lambda x: x.lower())

# Lowercase all characters
lines['English']=lines['English'].apply(lambda x: x.lower())
lines['Marathi']=lines['Marathi'].apply(lambda x: x.lower())

lines.head()

Unnamed: 0,English,Marathi
17968,what a beautiful night!,काय सुंदर रात्र होती!
5034,where's the map?,नकाशा कुठेय?
1697,unbelievable!,काहीही काय!
17538,the classroom is empty.,वर्ग रिकामा आहे.
37087,those who live by the sword die by the sword.,जे तलवारीने जगतात ते तलवारीनेच मरतात.


In [5]:
lines['English']=lines['English'].apply(lambda x: re.sub("'", '', x))
lines['Marathi']=lines['Marathi'].apply(lambda x: re.sub("'", '', x))
exclude = set(string.punctuation)

lines['English']=lines['English'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['Marathi']=lines['Marathi'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [6]:
remove_digits = str.maketrans('', '', digits)
lines['English']=lines['English'].apply(lambda x: x.translate(remove_digits))
lines['Marathi']=lines['Marathi'].apply(lambda x: x.translate(remove_digits))

lines['Marathi'] = lines['Marathi'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

lines['English']=lines['English'].apply(lambda x: x.strip())
lines['Marathi']=lines['Marathi'].apply(lambda x: x.strip())

lines['English']=lines['English'].apply(lambda x: re.sub(" +", " ", x))
lines['Marathi']=lines['Marathi'].apply(lambda x: re.sub(" +", " ", x))

In [7]:
lines['Marathi'] = lines['Marathi'].apply(lambda x : 'START_ '+ x + ' _END')
lines.head()

Unnamed: 0,English,Marathi
17968,what a beautiful night,START_ काय सुंदर रात्र होती _END
5034,wheres the map,START_ नकाशा कुठेय _END
1697,unbelievable,START_ काहीही काय _END
17538,the classroom is empty,START_ वर्ग रिकामा आहे _END
37087,those who live by the sword die by the sword,START_ जे तलवारीने जगतात ते तलवारीनेच मरतात _END


In [8]:
all_eng_words=set()
for eng in lines['English']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_mar_words=set()
for mar in lines['Marathi']:
    for word in mar.split():
        if word not in all_mar_words:
            all_mar_words.add(word)

In [9]:
lines['length_eng_sentence']=lines['English'].apply(lambda x:len(x.split(" ")))
lines['length_mar_sentence']=lines['Marathi'].apply(lambda x:len(x.split(" ")))

lines.head()

Unnamed: 0,English,Marathi,length_eng_sentence,length_mar_sentence
17968,what a beautiful night,START_ काय सुंदर रात्र होती _END,4,6
5034,wheres the map,START_ नकाशा कुठेय _END,3,4
1697,unbelievable,START_ काहीही काय _END,1,4
17538,the classroom is empty,START_ वर्ग रिकामा आहे _END,4,5
37087,those who live by the sword die by the sword,START_ जे तलवारीने जगतात ते तलवारीनेच मरतात _END,10,8


In [10]:
lines[lines['length_eng_sentence']>30].shape

(1, 4)

In [11]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_mar_sentence']<=20]

print("maximum length of Marathi Sentence ",max(lines['length_mar_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Marathi Sentence  20
maximum length of English Sentence  19


In [12]:
max_length_src=max(lines['length_mar_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [13]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_mar_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_mar_words)
num_encoder_tokens, num_decoder_tokens

(4778, 10853)

In [14]:
num_decoder_tokens += 1
num_encoder_tokens += 1 
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,English,Marathi,length_eng_sentence,length_mar_sentence
20826,let me take a look at it,START_ मला बघू द्या _END,7,5
7150,i like my friends,START_ मला माझ्या मैत्रिणी आवडतात _END,4,6
33292,i went to disneyland with my mother,START_ मी आईबरोबर डिज्नीलँडला गेले _END,7,6
35867,let me know in advance if you are coming,START_ येणार असशील तर मला आधीच कळव _END,9,8
35341,herbert hoover won the election of,START_ हर्बर्ट हूव्हर ची निवडणूक जिंकला _END,6,7
34569,their names were erased from the list,START_ त्यांची नावं यादीतून पुसून टाकण्यात आली...,7,8
32486,hey what are you doing in my room,START_ अरे तुम्ही माझ्या खोलीत काय करताहात _END,8,8
7112,i have many discs,START_ माझ्याकडे भरपूर डिस्क आहेत _END,4,6
15661,the party was my idea,START_ पार्टीची आयडिया माझी होती _END,5,6
36337,we shouldve bought three bottles of wine,START_ आपण वाईनच्या तीन बाटल्या आणायला हव्या ह...,7,9


In [15]:
X, y = lines['English'], lines['Marathi']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((19994,), (4999,))

In [16]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

In [17]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield((encoder_input_data, decoder_input_data), decoder_target_data)

In [18]:
latent_dim=300
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, use_cudnn=False)

encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

In [19]:
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, use_cudnn=False)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [20]:
model.summary()

In [21]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50
model.fit(
    x = generate_batch(X_train, y_train, batch_size = batch_size),
    steps_per_epoch = train_samples//batch_size,
    epochs=epochs,
    validation_data = generate_batch(X_test, y_test, batch_size=batch_size),
    validation_steps = val_samples//batch_size
)

Epoch 1/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 176ms/step - loss: 7.4576 - val_loss: 5.8037
Epoch 2/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 167ms/step - loss: 5.7387 - val_loss: 5.6807
Epoch 3/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 165ms/step - loss: 5.6300 - val_loss: 5.6219
Epoch 4/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 166ms/step - loss: 5.5465 - val_loss: 5.5289
Epoch 5/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 167ms/step - loss: 5.4313 - val_loss: 5.3919
Epoch 6/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 167ms/step - loss: 5.2943 - val_loss: 5.2803
Epoch 7/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 166ms/step - loss: 5.1467 - val_loss: 5.1257
Epoch 8/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 166ms/step - loss: 4.9881 - val_loss: 5.0083
Epoch 9/50
[1m1

<keras.src.callbacks.history.History at 0x7b305b61c820>

In [22]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [23]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_token_index['START_']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]


    return decoded_sentence

In [29]:
model.save('translation_model.h5')

In [31]:
from keras.preprocessing.sequence import pad_sequences

while True:
    print("\nEnter a sentence to translate or type '0' to exit:")
    input_sentence = input()
    
    if input_sentence == "0":
        print("Exiting the program. Goodbye!")
        break
    
    # Process the input sentence
    input_sequence = [input_token_index.get(word, 0) for word in input_sentence.split()]
    input_sequence = pad_sequences([input_sequence], maxlen=max_length_src, padding='post')

    # Decode the input sequence
    decoded_sentence = decode_sequence(input_sequence)

    # Clean up the decoded sentence if it ends with '_END'
    if decoded_sentence.endswith(" _END"):
        decoded_sentence = decoded_sentence[:-5].strip()

    # Display results
    print("Input Sentence:", input_sentence)
    print("Predicted Marathi Translation:", decoded_sentence)



Enter a sentence to translate or type '0' to exit:


 i want to speak with you


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Input Sentence: i want to speak with you
Predicted Marathi Translation: मला तुझ्याशी बोलायचं आहे

Enter a sentence to translate or type '0' to exit:


 lets play the game


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Input Sentence: lets play the game
Predicted Marathi Translation: खेळ खेळू या

Enter a sentence to translate or type '0' to exit:


 what a beautiful night


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Input Sentence: what a beautiful night
Predicted Marathi Translation: काय सुंदर रात्र होती

Enter a sentence to translate or type '0' to exit:


 turn off the tv


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Input Sentence: turn off the tv
Predicted Marathi Translation: टीव्ही बंद कर

Enter a sentence to translate or type '0' to exit:


 i knew it


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Input Sentence: i knew it
Predicted Marathi Translation: मला माहीत होतं

Enter a sentence to translate or type '0' to exit:


 0


Exiting the program. Goodbye!
