In [1]:
#These are the necessary libraries for the program.
import keras, tensorflow # Deep learning frameworks
from keras.models import Model # This is for creating the neural network model
from keras.layers import Input, LSTM, Dense # Neurakl network layer types
import numpy as np # Used for numerical operations
import difflib #Used for finding similar strings
import string # Used for string operations
import tkinter as tk #This is used to implement a GUI for the program.
from tkinter import ttk, scrolledtext # GUI components
from tkinter import messagebox # This is used to display popup messages

In [2]:
#This opens the dataset which and reads it
with open("englishtogerman.txt", 'r', encoding='utf-8') as f:
  lines = f.read().split('\n') #This splits the file into lines

english_texts = [] #This is where English sentences will be stored
german_texts = [] #This is where German sentences will be stored
input_characters = set() #Unique characters in English texts
target_characters = set() #Same as for English, this is unique characters in German tetxs

num_samples = 10000 # The number of translation pairs to use

In [3]:
#This for loop iterates through each line in the dataset, splits the dataset into English and German. It also adds start and end tokens to German text.
#After that it stores the texts in the english_texts.append and german_texts.append lines, an builds character sets once that is done.
for line in lines[: min(num_samples, len(lines) - 1)]:
  english_text, german_text = line.split('\t')
  german_text = '\t' + german_text + '\n'
  english_texts.append(english_text)
  german_texts.append(german_text)
  for char in english_text:
    if char not in input_characters:
      input_characters.add(char)
  for char in german_text:
    if char not in target_characters:
      target_characters.add(char)

#This sequence converts character sets into sorted lists, calculates vocabulary sizes and the maximum sequence lenghts.
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in english_texts])
max_decoder_seq_length = max([len(txt) for txt in german_texts])

In [4]:
#Next, this creates character to index mapping dictionaries
input_token_index = dict(
  [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
  [(char, i) for i, char in enumerate(target_characters)])

#This initializes input matrices for the neural network
encoder_input_data = np.zeros(
  (len(english_texts), max_encoder_seq_length, num_encoder_tokens),
  dtype='float32') # The matrix for english input texts
decoder_input_data = np.zeros(
  (len(english_texts), max_decoder_seq_length, num_decoder_tokens),
  dtype='float32') # Matrix for German input texts
decoder_target_data = np.zeros(
  (len(english_texts), max_decoder_seq_length, num_decoder_tokens),
  dtype='float32') # Matrix for german target texts


# Converts texts to one hot encoded matrices
for i, (english_text, german_text) in enumerate(zip(english_texts, german_texts)):
  for t, char in enumerate(english_text): # One hot encoded english text
    encoder_input_data[i, t, input_token_index[char]] = 1.
  for t, char in enumerate(german_text): # One hot encode german text
    decoder_input_data[i, t, target_token_index[char]] = 1.
    if t > 0:
      # decoder_target_data will be ahead by one timestep
      # and will not include the start character.
      decoder_target_data[i, t - 1, target_token_index[char]] = 1.


In [5]:
#The parameters for the model
batch_size = 64  # batch size for training
epochs = 100  # number of epochs to train for
latent_dim = 256  # latent dimensionality of the encoding space

#Building the encoder model
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

#Building the decoder model
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

#Creating the full model by combining encoder and decoder. This can either be trained and saved or if already trained, it can load the model if it ssaved.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.load_weights("seq2seq_eng-ger.h5")

#creates the inference models
encoder_model = Model(encoder_inputs, encoder_states)

#this sets up decoder inference model inputs
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

#creates the decoder inference model
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
  [decoder_inputs] + decoder_states_inputs,
  [decoder_outputs] + decoder_states)

# Creates reverse lookups for converting indices back to characters
reverse_input_char_index = dict(
  (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
  (i, char) for char, i in target_token_index.items())


AttributeError: module 'ml_dtypes' has no attribute 'float8_e3m4'


In [6]:
#This is a function for text normalization
def normalize_sentence(sentence): #Converts text to lowercase and removes the punctuation.
    return ''.join([char.lower() for char in sentence if char not in string.punctuation]).strip()

#Sequence decoding function (translation)
def decode_sequence(input_seq): # takes an input sequence and then processes it by using the seq2seq model. Then it generates a decoded output sequence step by step.
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True
            
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]
    
    return decoded_sentence


In [7]:
# This is the implementation for the graphical user interface for the program.
class TranslatorGUI:
    def __init__(self, root): # This is used to initialize the GUI window and its components
        self.root = root
        self.root.title("English-German Translator")
        self.root.geometry("800x600")
        
        # Creates main frame
        main_frame = ttk.Frame(root, padding="10")
        main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
        
        # Adds the title to the window
        title_label = ttk.Label(main_frame, text="English-German Translator", 
                              font=('Helvetica', 16, 'bold'))
        title_label.grid(row=0, column=0, columnspan=2, pady=10)
        
        # Input section
        input_frame = ttk.LabelFrame(main_frame, text="Input", padding="5")
        input_frame.grid(row=1, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
        #adds the text input area
        self.english_text = scrolledtext.ScrolledText(input_frame, height=6, width=70, wrap=tk.WORD)
        self.english_text.grid(row=0, column=0, padx=5, pady=5)
        
        # Translate button
        self.translate_button = ttk.Button(main_frame, text="Translate", command=self.translate)
        self.translate_button.grid(row=2, column=0, columnspan=2, pady=10)
        
        # Output section
        output_frame = ttk.LabelFrame(main_frame, text="Translation Results", padding="5")
        output_frame.grid(row=3, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
        
        self.output_text = scrolledtext.ScrolledText(output_frame, height=12, width=70, wrap=tk.WORD)
        self.output_text.grid(row=0, column=0, padx=5, pady=5)
        
        # Status bar
        self.status_var = tk.StringVar()
        self.status_bar = ttk.Label(main_frame, textvariable=self.status_var)
        self.status_bar.grid(row=4, column=0, columnspan=2, pady=5)
        
        # Configure grid weights
        self.root.columnconfigure(0, weight=1)
        self.root.rowconfigure(0, weight=1)
        main_frame.columnconfigure(0, weight=1)
        
    def translate(self): #This handles the translation button.
        # Clear previous output
        self.output_text.delete(1.0, tk.END)
        
        # Gets the input text
        english_text = self.english_text.get(1.0, tk.END).strip()
        
        if not english_text:
            messagebox.showwarning("Warning", "Please enter an English sentence to translate.")
            return
            
        normalized_input = normalize_sentence(english_text)
        normalized_english_texts = [normalize_sentence(text) for text in english_texts]
        
        if len(normalized_input) > max_encoder_seq_length:
            self.status_var.set(f"Error: Input exceeds maximum length of {max_encoder_seq_length} characters")
            return
            
        # Check for exact or close match
        if normalized_input in normalized_english_texts:
            seq_index = normalized_english_texts.index(normalized_input)
            input_seq = encoder_input_data[seq_index: seq_index + 1]
            decoded_sentence = decode_sequence(input_seq)
            self.output_text.insert(tk.END, f"Translation:\n{decoded_sentence}\n")
            self.status_var.set("Translation complete")
        else:
            closest_matches = difflib.get_close_matches(normalized_input, normalized_english_texts, n=3, cutoff=0.3)
            if closest_matches:
                self.output_text.insert(tk.END, "No exact match found. Similar sentences and their translations:\n\n")
                for match in closest_matches:
                    seq_index = normalized_english_texts.index(match)
                    input_seq = encoder_input_data[seq_index: seq_index + 1]
                    decoded_sentence = decode_sequence(input_seq)
                    self.output_text.insert(tk.END, f"Similar: {match}\n")
                    self.output_text.insert(tk.END, f"Translation: {decoded_sentence}\n\n")
                self.status_var.set("Found similar matches")
            else:
                self.output_text.insert(tk.END, "No matches found in the dataset.")
                self.status_var.set("No matches found")

In [8]:
#GUI calling point 
def main(): #Starts the application
    root = tk.Tk() #creates the main window
    app = TranslatorGUI(root) # creates the application
    root.mainloop() #Starts event loop

if __name__ == "__main__":
    main()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
