In [None]:
import tensorflow as tf
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
class NeuralMachineTranslation:
    """Word-level LSTM encoder-decoder translator built on Keras.

    The encoder LSTM reads the padded source sentence and its final
    hidden/cell states initialise the decoder LSTM. During training the
    decoder is fed the target sentence and is asked to predict the same
    sentence shifted one step to the left. During translation the decoder
    is fed its own previous predictions, one token at a time.

    Attributes are populated by ``train`` (or by ``prepare_data`` followed
    by ``build_model``):
        input_tokenizer: Keras ``Tokenizer`` fitted on the source sentences.
        output_tokenizer: Keras ``Tokenizer`` fitted on the target sentences.
        model: The compiled Keras training model.
    """

    def __init__(self, max_sequence_length=10):
        """
        Initialize Neural Machine Translation Model

        Args:
            max_sequence_length (int): Maximum length of input/output sequences
        """
        self.max_sequence_length = max_sequence_length
        self.input_tokenizer = None
        self.output_tokenizer = None
        self.model = None

    def _pad(self, sequences):
        """Pad (with trailing zeros) or truncate sequences to max_sequence_length."""
        # NOTE(review): pad_sequences truncates from the FRONT by default
        # (truncating='pre'), so sentences longer than max_sequence_length
        # lose their leading tokens, including any '<start>' marker.
        # Confirm this is intended before changing it, since it alters training.
        return pad_sequences(sequences, maxlen=self.max_sequence_length, padding='post')

    @staticmethod
    def _one_hot_shifted_targets(decoder_input_data, vocab_size):
        """Build one-hot decoder targets: the decoder input shifted left by one step.

        Args:
            decoder_input_data: 2-D array-like of token ids, shape
                (num_sentences, sequence_length); 0 is the padding id.
            vocab_size (int): Size of the one-hot axis.

        Returns:
            numpy.ndarray: float32 array of shape
            (num_sentences, sequence_length, vocab_size). Position ``t``
            holds the one-hot encoding of input token ``t + 1``; padding
            tokens and the final position are left all-zero.
        """
        token_ids = np.asarray(decoder_input_data)
        targets = np.zeros(token_ids.shape + (vocab_size,), dtype='float32')

        # Column t of `shifted` is input column t + 1, i.e. the token the
        # decoder should emit at step t.
        shifted = token_ids[:, 1:]
        rows, steps = np.nonzero(shifted > 0)
        targets[rows, steps, shifted[rows, steps]] = 1.0
        return targets

    def prepare_data(self, input_texts, output_texts):
        """
        Prepare input and output texts for training

        Fits a fresh tokenizer on each side (replacing any previous ones),
        converts the sentences to padded id sequences and builds the
        one-hot, shifted-by-one decoder targets.

        Args:
            input_texts (list): List of input language sentences
            output_texts (list): List of output language sentences

        Returns:
            tuple: (encoder_input_data, decoder_input_data,
            decoder_target_data, input_vocab_size, output_vocab_size)

        Raises:
            ValueError: If the corpora are empty or differ in length.
        """
        # Fail early with a clear message instead of deep inside Keras.
        if len(input_texts) != len(output_texts):
            raise ValueError(
                f"input_texts and output_texts must have the same length "
                f"(got {len(input_texts)} and {len(output_texts)})"
            )
        if len(input_texts) == 0:
            raise ValueError("input_texts and output_texts must not be empty")

        # Tokenize input texts. filters='' and lower=False keep tokens such
        # as '<start>' / '<end>' intact and case-sensitive.
        self.input_tokenizer = Tokenizer(filters='', lower=False)
        self.input_tokenizer.fit_on_texts(input_texts)
        encoder_input_data = self._pad(
            self.input_tokenizer.texts_to_sequences(input_texts))

        # Tokenize output texts
        self.output_tokenizer = Tokenizer(filters='', lower=False)
        self.output_tokenizer.fit_on_texts(output_texts)
        decoder_input_data = self._pad(
            self.output_tokenizer.texts_to_sequences(output_texts))

        # Vocabulary sizes: +1 because id 0 is reserved for padding.
        input_vocab_size = len(self.input_tokenizer.word_index) + 1
        output_vocab_size = len(self.output_tokenizer.word_index) + 1

        # One-hot targets, shifted one time step ahead of the decoder input.
        decoder_target_data = self._one_hot_shifted_targets(
            decoder_input_data, output_vocab_size)

        return (encoder_input_data, decoder_input_data, decoder_target_data,
                input_vocab_size, output_vocab_size)

    def build_model(self, input_vocab_size, output_vocab_size,
                    embedding_dim=256, units=256):
        """
        Build the Neural Machine Translation model

        The compiled model is stored on ``self.model``; nothing is returned.

        Args:
            input_vocab_size (int): Size of input language vocabulary
            output_vocab_size (int): Size of output language vocabulary
            embedding_dim (int): Width of both embedding layers
            units (int): Number of LSTM units. Shared by encoder and decoder
                because the encoder states initialise the decoder.
        """
        layers = tf.keras.layers

        # Encoder: only the final states are used; the per-step outputs are
        # discarded.
        encoder_inputs = layers.Input(shape=(self.max_sequence_length,))
        encoder_embedding = layers.Embedding(
            input_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
        encoder_lstm = layers.LSTM(units, return_sequences=True, return_state=True)
        _, state_h, state_c = encoder_lstm(encoder_embedding)
        encoder_states = [state_h, state_c]

        # Decoder: starts from the encoder's final states.
        decoder_inputs = layers.Input(shape=(self.max_sequence_length,))
        decoder_embedding = layers.Embedding(
            output_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
        decoder_lstm = layers.LSTM(units, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder_lstm(
            decoder_embedding, initial_state=encoder_states)

        # Dense output layer: per-step distribution over the output vocabulary.
        decoder_dense = layers.Dense(output_vocab_size, activation='softmax')
        decoder_outputs = decoder_dense(decoder_outputs)

        # Compile the model
        model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

        self.model = model

    def train(self, input_texts, output_texts, epochs=50, batch_size=32):
        """
        Train the Neural Machine Translation model

        Re-fits the tokenizers and rebuilds the model from scratch on every
        call.

        Args:
            input_texts (list): Training input language sentences
            output_texts (list): Training output language sentences
            epochs (int): Number of training epochs
            batch_size (int): Batch size for training

        Returns:
            The object returned by ``Model.fit`` (the Keras training history).

        Raises:
            ValueError: If the corpora are empty or differ in length.
        """
        # Prepare data
        (encoder_input_data, decoder_input_data,
         decoder_target_data, input_vocab_size,
         output_vocab_size) = self.prepare_data(input_texts, output_texts)

        # Build model
        self.build_model(input_vocab_size, output_vocab_size)

        # Train the model, holding out 20% of the samples for validation.
        return self.model.fit(
            [encoder_input_data, decoder_input_data],
            decoder_target_data,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=0.2
        )

    def save(self, filepath):
        """
        Save the entire model to a specified filepath

        Only the Keras model is written; the tokenizers are not saved here.

        Args:
            filepath (str): Destination path of the HDF5 file

        Raises:
            ValueError: If no model has been built yet.
        """
        if self.model is None:
            raise ValueError("Model must be trained before saving")
        self.model.save(filepath, save_format='h5')

    def translate(self, input_sentence):
        """
        Translate a single sentence

        Decodes greedily: at each step the most probable token is chosen and
        fed back as the next decoder input.

        Args:
            input_sentence (str): Sentence to translate

        Returns:
            str: Translated sentence (space-joined tokens, without '<end>')

        Raises:
            ValueError: If the model or tokenizers are not available.
        """
        if (self.model is None or self.input_tokenizer is None
                or self.output_tokenizer is None):
            raise ValueError("Model must be trained before translation")

        # Tokenize and pad input sentence
        input_seq = self.input_tokenizer.texts_to_sequences([input_sentence])
        input_padded = self._pad(input_seq)

        # Prepare decoder input: start token first, remaining slots are padding.
        decoder_input = np.zeros((1, self.max_sequence_length))
        decoder_input[0, 0] = self.output_tokenizer.word_index.get('<start>', 1)  # Start token

        # Translation process.
        # Only max_sequence_length - 1 steps are decoded: the prediction of
        # step i is written to slot i + 1, so a further step would index past
        # the end of decoder_input. The last output position also never
        # receives a training target (targets are the inputs shifted by one).
        translated_words = []
        for step in range(self.max_sequence_length - 1):
            # Predict next word; verbose=0 avoids one progress bar per token.
            output = self.model.predict([input_padded, decoder_input], verbose=0)

            # Get the index of the word with the highest probability
            predicted_word_index = int(np.argmax(output[0, step, :]))

            # Convert index to word ('' for the padding id 0 / unknown ids)
            predicted_word = self.output_tokenizer.index_word.get(predicted_word_index, '')

            # Stop if no word is predicted or the end marker is reached
            if not predicted_word or predicted_word == '<end>':
                break

            translated_words.append(predicted_word)

            # Feed the prediction back as the next decoder input
            decoder_input[0, step + 1] = predicted_word_index

        return ' '.join(translated_words)

In [None]:
def read_texts_from_file(filename):
    """Load sentences from a UTF-8 text file, one sentence per line.

    Each non-blank line is stripped of surrounding whitespace and wrapped
    as ``'<start> ... <end>'``. Blank lines are skipped.

    Args:
        filename (str): Path of the text file to read

    Returns:
        list: The wrapped sentences in file order, or an empty list if the
        file is missing or any other error occurs while reading (the error
        is printed, not raised).
    """
    wrapped_sentences = []
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                sentence = raw_line.strip()
                if not sentence:
                    # Whitespace-only line: nothing to keep.
                    continue
                wrapped_sentences.append('<start> ' + sentence + ' <end>')
    except FileNotFoundError:
        print(f"Error: File {filename} not found.")
        return []
    except Exception as e:
        # Best-effort reader: report the problem and hand back no data.
        print(f"Error reading file {filename}: {e}")
        return []
    return wrapped_sentences

In [None]:
def main():
    """Train an English-to-French model, save it, and translate one sample.

    Reads the two sentence files, keeps at most the first 10000 sentences
    of each, trains a model, writes the model (HDF5) and both tokenizers
    (pickle) to the working directory, then prints the translation of the
    first training sentence. Returns early with a message when the data is
    missing or the two files do not yield the same number of sentences.
    """
    model_save_path = 'en_fr_model.h5'

    input_texts = read_texts_from_file('datasets/eng_fr.en')[:10000]
    output_texts = read_texts_from_file('datasets/french')[:10000]

    # Validate before training: read_texts_from_file returns [] on any read
    # error, and training on empty or misaligned data fails inside Keras.
    if not input_texts:
        print("No input texts available for translation.")
        return
    if len(input_texts) != len(output_texts):
        print(f"Error: {len(input_texts)} input sentences but "
              f"{len(output_texts)} output sentences; the files must be parallel.")
        return

    # Initialize and train the model
    nmt_model = NeuralMachineTranslation(max_sequence_length=5)
    nmt_model.train(input_texts, output_texts, epochs=100)
    nmt_model.save(model_save_path)

    # The tokenizers are needed to use the saved model later, so persist them too.
    with open("en_fr_input_tokenizer.pkl", "wb") as f:
        pickle.dump(nmt_model.input_tokenizer, f)

    with open("en_fr_output_tokenizer.pkl", "wb") as f:
        pickle.dump(nmt_model.output_tokenizer, f)

    # Translate the first sentence. The encoder was trained on sentences
    # wrapped in '<start>' / '<end>', so the wrapped form is what gets
    # translated; the markers are stripped only for display.
    wrapped_sentence = input_texts[0]
    input_sentence = wrapped_sentence.replace('<start>', '').replace('<end>', '').strip()
    translation = nmt_model.translate(wrapped_sentence)
    print(f"Input: {input_sentence}")
    print(f"Translation: {translation}")

# Run the training/translation demo only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78