# Spanish to English Translator

In [1]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import os,io
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.losses import sparse_categorical_crossentropy
from nltk.translate.bleu_score import corpus_bleu

In [2]:
# Downloading the file
zip_file = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

file_path = os.path.dirname(zip_file)+"/spa-eng/spa.txt"

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [4]:
def load_data(path, size=None):
    text = io.open(file_path, encoding='UTF-8').read()
    lines = text.splitlines()
    pairs = [line.split('\t') for line in lines]
    source = np.array([source for target, source in pairs])  # extract source text into a numpy array
    target = np.array([target for target, source in pairs])  # extract target text into a numpy array
    return source, target


In [5]:
src_sentences, tgt_sentences = load_data(file_path)
print("Original Sentence:",src_sentences[40])
print("Translated Sentence:",tgt_sentences[40])

Original Sentence: Abrázame.
Translated Sentence: Hug me.


In [6]:
src_sentences, tgt_sentences = load_data(file_path)
print("Original Sentence:",src_sentences[50])
print("Translated Sentence:",tgt_sentences[50])

Original Sentence: Estoy levantado.
Translated Sentence: I'm up.


In [7]:
src_sentences, tgt_sentences = load_data(file_path)
print("Original Sentence:",src_sentences[100])
print("Translated Sentence:",tgt_sentences[100])

Original Sentence: ¡Salga de aquí!
Translated Sentence: Go away!


In [8]:
print(src_sentences.shape)
print(tgt_sentences.shape)

(118964,)
(118964,)


In [13]:
#Create a dataframe
df = pd.DataFrame(zip(src_sentences, tgt_sentences), columns=['source_sentence', 'target_sentence'])

In [14]:
df.head(5)

Unnamed: 0,source_sentence,target_sentence
0,Ve.,Go.
1,Vete.,Go.
2,Vaya.,Go.
3,Váyase.,Go.
4,Hola.,Hi.


In [15]:
# Lowercase all characters
df['source_sentence']=df['source_sentence'].apply(lambda x: x.lower())
df['target_sentence']=df['target_sentence'].apply(lambda x: x.lower())

In [16]:
# Remove quotes
df['source_sentence']=df['source_sentence'].apply(lambda x: re.sub("'", '', x))
df['target_sentence']=df['target_sentence'].apply(lambda x: re.sub("'", '', x))

In [20]:
import string
from string import digits

In [22]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
df['source_sentence']=df['source_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['target_sentence']=df['target_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [23]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
df['source_sentence']=df['source_sentence'].apply(lambda x: x.translate(remove_digits))
df['target_sentence']=df['target_sentence'].apply(lambda x: x.translate(remove_digits))

df['target_sentence'] = df['target_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
df['source_sentence']=df['source_sentence'].apply(lambda x: x.strip())
df['target_sentence']=df['target_sentence'].apply(lambda x: x.strip())
df['source_sentence']=df['source_sentence'].apply(lambda x: re.sub(" +", " ", x))
df['target_sentence']=df['target_sentence'].apply(lambda x: re.sub(" +", " ", x))

In [24]:
s = df['source_sentence']
t = df['target_sentence']

In [25]:
source_sentences = s[500:15000]
target_sentences = t[500:15000]

In [26]:
print(source_sentences.shape)
print(target_sentences.shape)

(14500,)
(14500,)


In [28]:
type(target_sentences)

pandas.core.series.Series

In [29]:
# Dataset Preparation

# Tokenization and one-hot encoding
tokenizer_source = Tokenizer(oov_token="<OOV>")
tokenizer_source.fit_on_texts(source_sentences)
source_sequences = tokenizer_source.texts_to_sequences(source_sentences)
source_sequences_padded = pad_sequences(source_sequences)

tokenizer_target = Tokenizer(oov_token="<OOV>")
tokenizer_target.fit_on_texts(target_sentences)
target_sequences = tokenizer_target.texts_to_sequences(target_sentences)
target_sequences_padded = pad_sequences(target_sequences)

In [30]:
# Ensure <start> and <end> tokens are in the vocabulary
if '<start>' not in tokenizer_target.word_index:
    tokenizer_target.word_index['<start>'] = len(tokenizer_target.word_index) + 1
if '<end>' not in tokenizer_target.word_index:
    tokenizer_target.word_index['<end>'] = len(tokenizer_target.word_index) + 1


In [31]:
# Building the Encoder-Decoder Model
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(len(tokenizer_source.word_index) + 1, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]


In [32]:
# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(len(tokenizer_target.word_index) + 1, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(len(tokenizer_target.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [33]:
# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Create encoder model
encoder_model = Model(encoder_inputs, encoder_states)

In [34]:
# Create decoder model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_input = [decoder_state_input_h, decoder_state_input_c]

In [35]:
# Fix: Apply the embedding directly to the decoder inputs
decoder_embedding_inference = Embedding(len(tokenizer_target.word_index) + 1, latent_dim, mask_zero=True)(decoder_inputs)
decoder_outputs_inference, state_h, state_c = decoder_lstm(
    decoder_embedding_inference, initial_state=decoder_states_input
)
decoder_states_inference = [state_h, state_c]
decoder_outputs_inference = decoder_dense(decoder_outputs_inference)

decoder_model = Model(
    [decoder_inputs] + decoder_states_input,
    [decoder_outputs_inference] + decoder_states_inference
)



In [54]:
# Model Training
model.compile(optimizer='adam', loss=sparse_categorical_crossentropy, metrics=['accuracy'])
model.fit([source_sequences_padded, target_sequences_padded[:, :-1]], target_sequences_padded[:, 1:], epochs=100, batch_size=124)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7d6867ab83d0>

In [55]:
# Translation Function
def translate_sentence(input_sentence):
    input_sequence = tokenizer_source.texts_to_sequences([input_sentence])
    input_sequence_padded = pad_sequences(input_sequence)

    states_value = encoder_model.predict(input_sequence_padded)

    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = tokenizer_target.word_index['<start>']
    translated_sentence = ''

    while True:
        output_tokens, h, c = decoder_model.predict([target_sequence] + states_value)

        # Get the index of the most probable token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Handle the case where the sampled token is not in the vocabulary
        if sampled_token_index not in tokenizer_target.index_word:
            break

        sampled_word = tokenizer_target.index_word[sampled_token_index]

        if sampled_word == '<end>' or len(translated_sentence.split()) > len(input_sentence.split()) * 2:
            break

        translated_sentence += sampled_word + ' '

        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = sampled_token_index

        states_value = [h, c]

    return translated_sentence.strip()


In [56]:
# Evaluation
# Prepare a held-out test set and evaluate BLEU score
test_source_sentences = ['Tom estaba aterrorizado.']
test_target_sentences = ['Tom was terrified.']

reference_translations = [sent.split() for sent in test_target_sentences]
hypothesis_translations = [translate_sentence(sent).split() for sent in test_source_sentences]

bleu_score = corpus_bleu(reference_translations, test_target_sentences)
print(f'BLEU Score: {bleu_score}')

BLEU Score: 0.667278568794606


In [59]:
# Example usage
input_sentence = "Estoy levantado."  # Replace with your input sentence
predicted_translation = translate_sentence(input_sentence)

print(f"Input Sentence: {input_sentence}")
print(f"Predicted Translation: {predicted_translation}")

Input Sentence: Estoy levantado.
Predicted Translation: up up up up up
