## Proyecto Off-Platform Project: Machine Translation

*PROJECT IN POGRESS AND STILL UNDER CONSTRUCTION*

In [1]:
import os
import re
import numpy as np
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# Parameters & File Paths

DATA_PATH = "ita.txt"  # Change this path to your translation dataset file
NUM_LINES = 500        # Number of lines to process (adjust as needed)

In [11]:
# Hyperparameters

LATENT_DIM = 256       # Dimensionality of the LSTM hidden states
BATCH_SIZE = 80
EPOCHS = 50

### Data Preprocessing Functions

In [5]:
 """ Loads and preprocesses the dataset.
     Returns: 
      input_texts: List of source sentences.
      target_texts: List of target sentences (with start/end tokens).
      input_tokens: Sorted list of unique tokens in the source language.
      target_tokens: Sorted list of unique tokens in the target language.  """
def load_data(data_path: str, num_lines: int):
    if not Path(data_path).exists():
        raise FileNotFoundError(f"Data file {data_path} not found.")

    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    input_texts = []
    target_texts = []
    input_token_set = set()
    target_token_set = set()

    for line in lines[:num_lines]:
        # Expecting each line to contain at least two tab-separated columns
        parts = line.split('\t')
        if len(parts) < 2:
            continue
        input_text, target_text = parts[0], parts[1]
        input_texts.append(input_text)
        
        # Use regex to split punctuation from words
        target_text = " ".join(re.findall(r"[\w']+|[^\s\w]", target_text))
        # Add start and end tokens to target sentence
        target_text = '<START> ' + target_text + ' <END>'
        target_texts.append(target_text)

        # Build token sets from input (using regex tokenization) and target (space-split)
        for token in re.findall(r"[\w']+|[^\s\w]", input_text):
            input_token_set.add(token)
        for token in target_text.split():
            target_token_set.add(token)

    input_tokens = sorted(list(input_token_set))
    target_tokens = sorted(list(target_token_set))
    
    return input_texts, target_texts, input_tokens, target_tokens

In [6]:
""" Converts input and target texts into one-hot encoded 3D numpy arrays.  Returns: encoder_input_data, decoder_input_data, decoder_target_data, 
          input_features_dict, target_features_dict, reverse_target_features_dict, max_encoder_seq_length, max_decoder_seq_length. """
def vectorize_data(input_texts, target_texts, input_tokens, target_tokens):
    num_encoder_tokens = len(input_tokens)
    num_decoder_tokens = len(target_tokens)
    max_encoder_seq_length = max(len(re.findall(r"[\w']+|[^\s\w]", txt)) for txt in input_texts)
    max_decoder_seq_length = max(len(txt.split()) for txt in target_texts)
    
    # Create token-index mappings
    input_features_dict = {token: i for i, token in enumerate(input_tokens)}
    target_features_dict = {token: i for i, token in enumerate(target_tokens)}
    reverse_target_features_dict = {i: token for token, i in target_features_dict.items()}

    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')
    decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
    decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        # One-hot encode input sequence
        for t, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_text)):
            encoder_input_data[i, t, input_features_dict[token]] = 1.
        # One-hot encode decoder input and target sequences
        target_tokens_seq = target_text.split()
        for t, token in enumerate(target_tokens_seq):
            decoder_input_data[i, t, target_features_dict[token]] = 1.
            if t > 0:
                # Decoder target data is ahead by one timestep
                decoder_target_data[i, t - 1, target_features_dict[token]] = 1.
    
    return (encoder_input_data, decoder_input_data, decoder_target_data,
            input_features_dict, target_features_dict, reverse_target_features_dict,
            max_encoder_seq_length, max_decoder_seq_length)


### Model Building Functions

In [7]:
 """ Builds the encoder-decoder training model. Returns: training_model, encoder_inputs, decoder_inputs, decoder_lstm, decoder_dense."""
def build_training_model(num_encoder_tokens, num_decoder_tokens, latent_dim):
    # Define encoder
    encoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_inputs')
    encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
    encoder_states = [state_h, state_c]
    
    # Define decoder
    decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs')
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
    decoder_outputs = decoder_dense(decoder_outputs)
    
    training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return training_model, encoder_inputs, decoder_inputs, decoder_lstm, decoder_dense



### Inference Model Functions

In [8]:
""" Constructs the encoder and decoder models for inference. Returns: encoder_model, decoder_model."""
def build_inference_models(training_model, latent_dim, num_encoder_tokens, num_decoder_tokens):
    # Encoder inference model
    encoder_inputs = training_model.input[0]  # input placeholder for encoder
    encoder_outputs, state_h_enc, state_c_enc = training_model.get_layer('encoder_lstm').output
    encoder_states = [state_h_enc, state_c_enc]
    encoder_model = Model(encoder_inputs, encoder_states)

    # Decoder inference model
    decoder_inputs = training_model.input[1]  # input placeholder for decoder
    decoder_state_input_h = Input(shape=(latent_dim,), name='input_h')
    decoder_state_input_c = Input(shape=(latent_dim,), name='input_c')
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_lstm = training_model.get_layer('decoder_lstm')
    decoder_dense = training_model.get_layer('decoder_dense')
    
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    
    return encoder_model, decoder_model


In [9]:
""" Decodes an input sequence to generate the translated sentence."""
def decode_sequence(test_input, encoder_model, decoder_model, target_features_dict, reverse_target_features_dict, max_decoder_seq_length, num_decoder_tokens, latent_dim):
    # Encode the input sequence
    states_value = encoder_model.predict(test_input)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_features_dict['<START>']] = 1.

    decoded_sentence = ""
    stop_condition = False

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Get token with highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_features_dict[sampled_token_index]
        decoded_sentence += " " + sampled_token

        # Exit if hit <END> or max length reached
        if sampled_token == '<END>' or len(decoded_sentence.split()) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (length 1)
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence.strip()

### Main Execution: Training & Testing

In [10]:
def main():
    # Preprocess the data
    print("Loading and preprocessing data...")
    input_texts, target_texts, input_tokens, target_tokens = load_data(DATA_PATH, NUM_LINES)
    (encoder_input_data, decoder_input_data, decoder_target_data,
     input_features_dict, target_features_dict, reverse_target_features_dict,
     max_encoder_seq_length, max_decoder_seq_length) = vectorize_data(input_texts, target_texts, input_tokens, target_tokens)

    num_encoder_tokens = len(input_tokens)
    num_decoder_tokens = len(target_tokens)
    print(f"Number of samples: {len(input_texts)}")
    print(f"Number of unique input tokens: {num_encoder_tokens}")
    print(f"Number of unique output tokens: {num_decoder_tokens}")
    print(f"Max sequence length for inputs: {max_encoder_seq_length}")
    print(f"Max sequence length for outputs: {max_decoder_seq_length}\n")

    # Build the training model
    print("Building the training model...")
    training_model, encoder_inputs, decoder_inputs, decoder_lstm, decoder_dense = build_training_model(num_encoder_tokens, num_decoder_tokens, LATENT_DIM)
    training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    training_model.summary()
    
    # Setup callbacks for early stopping and saving the best model
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, verbose=1),
        ModelCheckpoint('training_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
    ]
    
    # Train the model
    print("\nTraining the model...")
    training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                       batch_size=BATCH_SIZE,
                       epochs=EPOCHS,
                       validation_split=0.2,
                       callbacks=callbacks)
    
    # Load the best saved model
    print("\nLoading the best model...")
    model = load_model('training_model.h5')
    
    # Build inference models
    encoder_model, decoder_model = build_inference_models(model, LATENT_DIM, num_encoder_tokens, num_decoder_tokens)
    
    # Testing: Decode some sequences and display
    print("\nDecoding test sequences...\n")
    for seq_index in range(min(10, len(input_texts))):
        test_input = encoder_input_data[seq_index: seq_index + 1]
        decoded_sentence = decode_sequence(test_input, encoder_model, decoder_model,
                                           target_features_dict, reverse_target_features_dict,
                                           max_decoder_seq_length, num_decoder_tokens, LATENT_DIM)
        print("Input sentence:", input_texts[seq_index])
        print("Decoded sentence:", decoded_sentence)
        print('-' * 50)

if __name__ == "__main__":
    main()

Loading and preprocessing data...
Number of samples: 500
Number of unique input tokens: 143
Number of unique output tokens: 428
Max sequence length for inputs: 4
Max sequence length for outputs: 8

Building the training model...



Training the model...
Epoch 1/50
[1m4/5[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 38ms/step - accuracy: 0.0444 - loss: 2.8208 
Epoch 1: val_loss improved from inf to 2.93068, saving model to training_model.h5




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 178ms/step - accuracy: 0.0595 - loss: 2.8258 - val_accuracy: 0.1275 - val_loss: 2.9307
Epoch 2/50
[1m4/5[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 40ms/step - accuracy: 0.1280 - loss: 2.7830
Epoch 2: val_loss improved from 2.93068 to 2.87241, saving model to training_model.h5




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.1275 - loss: 2.7871 - val_accuracy: 0.1250 - val_loss: 2.8724
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.1250 - loss: 2.7169
Epoch 3: val_loss improved from 2.87241 to 2.67918, saving model to training_model.h5




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.1250 - loss: 2.7154 - val_accuracy: 0.1250 - val_loss: 2.6792
Epoch 4/50
[1m4/5[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 39ms/step - accuracy: 0.1250 - loss: 2.4938
Epoch 4: val_loss improved from 2.67918 to 2.31089, saving model to training_model.h5




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.1250 - loss: 2.4689 - val_accuracy: 0.1250 - val_loss: 2.3109
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1250 - loss: 2.1587
Epoch 5: val_loss improved from 2.31089 to 2.04619, saving model to training_model.h5




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.1250 - loss: 2.1516 - val_accuracy: 0.1250 - val_loss: 2.0462
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1250 - loss: 1.8986
Epoch 6: val_loss improved from 2.04619 to 1.88626, saving model to training_model.h5




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.1250 - loss: 1.8949 - val_accuracy: 0.1250 - val_loss: 1.8863
Epoch 7/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.1250 - loss: 1.7663
Epoch 7: val_loss did not improve from 1.88626
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.1250 - loss: 1.7682 - val_accuracy: 0.1250 - val_loss: 1.9207
Epoch 8/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1250 - loss: 1.7563
Epoch 8: val_loss improved from 1.88626 to 1.85748, saving model to training_model.h5




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.1250 - loss: 1.7570 - val_accuracy: 0.1250 - val_loss: 1.8575
Epoch 9/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1270 - loss: 1.7215
Epoch 9: val_loss did not improve from 1.85748
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.1279 - loss: 1.7241 - val_accuracy: 0.1700 - val_loss: 1.8604
Epoch 10/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1586 - loss: 1.7106
Epoch 10: val_loss did not improve from 1.85748
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.1586 - loss: 1.7130 - val_accuracy: 0.1700 - val_loss: 1.8591
Epoch 11/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1586 - loss: 1.7007
Epoch 11: val_loss did not improve from 1.85748
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.1586 - loss: 1.6963 - val_accuracy: 0.1700 - val_loss: 1.8470
Epoch 13/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1586 - loss: 1.6855
Epoch 13: val_loss did not improve from 1.84700
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.1586 - loss: 1.6884 - val_accuracy: 0.1700 - val_loss: 1.8663
Epoch 14/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.1586 - loss: 1.6838
Epoch 14: val_loss did not improve from 1.84700
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.1586 - loss: 1.6861 - val_accuracy: 0.1700 - val_loss: 1.8510
Epoch 15/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.1586 - loss: 1.6746
Epoch 15: val_loss did not improve from 1.84700
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s




Decoding test sequences...

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Input sentence: Hi.
Decoded sentence: . . <END>
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Input sentence: Hi.
Decoded sentence: . . <END>
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 