In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [2]:
# Number of leading sentence pairs used for training; raise it for better
# coverage, lower it for faster experiments.
N_PAIRS = 10000

data = pd.read_csv("eng-ger.csv")

# Select both columns together and drop incomplete rows *before* truncating,
# so every English sentence keeps its German counterpart at the same index.
# The str cast protects the tokenizers, which expect text, from cells that
# pandas parsed as numbers.
sentence_pairs = data[["ENGLISH", "GERMAN"]].dropna().head(N_PAIRS).astype(str)
english_sentences = sentence_pairs["ENGLISH"].tolist()
german_sentences = sentence_pairs["GERMAN"].tolist()

In [3]:
def _fit_tokenizer(sentences):
    """Fit a fresh Keras ``Tokenizer`` on ``sentences``.

    Returns the fitted tokenizer together with the vocabulary size, i.e. the
    number of distinct words plus one (word ids start at 1; id 0 is never
    assigned to a word and is used for padding further down).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(sentences)
    return fitted, len(fitted.word_index) + 1


# One independent vocabulary per language
tokenizer_eng, total_words_eng = _fit_tokenizer(english_sentences)
tokenizer_ger, total_words_ger = _fit_tokenizer(german_sentences)

# Map every sentence to its list of integer word ids
input_sequences = tokenizer_eng.texts_to_sequences(english_sentences)
output_sequences = tokenizer_ger.texts_to_sequences(german_sentences)

In [4]:


import json

# Persist both fitted tokenizers. What is written is the full tokenizer
# configuration returned by Tokenizer.to_json(), not just a reverse word index.
# NOTE: to_json() already returns a JSON string, so json.dump() stores it as a
# single JSON-encoded string; a reader has to json.load() the file first to get
# that configuration string back.
for json_path, fitted_tokenizer in (
    ('tokenizer_eng.json', tokenizer_eng),
    ('tokenizer_ger.json', tokenizer_ger),
):
    with open(json_path, 'w') as f:
        json.dump(fitted_tokenizer.to_json(), f)

In [5]:
# A single shared length: the longest sentence found in either language.
longest_eng = max(map(len, input_sequences))
longest_ger = max(map(len, output_sequences))
max_len = max(longest_eng, longest_ger)

# Pad at the end ('post') so both id matrices end up max_len columns wide
pad_options = {"maxlen": max_len, "padding": "post"}
input_sequences = pad_sequences(input_sequences, **pad_options)
output_sequences = pad_sequences(output_sequences, **pad_options)

In [None]:

# Model input: padded English ids; model target: padded German ids
X = np.array(input_sequences)
y = np.array(output_sequences)

# One-hot encode each German word id over the German vocabulary.
# NOTE(review): this adds a trailing axis of size total_words_ger to y, which
# can get large for big vocabularies -- confirm it fits in memory.
y_one_hot = to_categorical(y, num_classes=total_words_ger)

# Hold out 20% of the pairs for validation/testing; fixed seed for repeatability
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, **split_options)


In [5]:
# model = Sequential()
# model.add(Embedding(total_words_eng, 128, input_length=max_len))
# model.add(Bidirectional(LSTM(256, return_sequences=True)))
# model.add(Dropout(0.2))
# model.add(BatchNormalization())
# model.add(Dense(total_words_ger, activation='softmax'))

# # Compile the model with Adam optimizer and categorical_crossentropy loss
# optimizer = Adam(learning_rate=0.001)
# model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# # Learning rate scheduler
# def lr_schedule(epoch):
#     if epoch < 10:
#         return 0.001
#     else:
#         return 0.0001

# lr_scheduler = LearningRateScheduler(lr_schedule)

# # Early stopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)



In [6]:
# # Train the model with callbacks
# model.fit(X_train, y_train, epochs=8, validation_data=(X_test, y_test), callbacks=[lr_scheduler, early_stopping])



Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x201340afa00>

In [7]:
# Evaluate on the held-out test set (disabled; uncomment once `model` has been trained in this session)
# test_loss, test_accuracy = model.evaluate(X_test, y_test)



In [8]:
# Write the in-memory model to disk so it can be reloaded without retraining
model.save(filepath="language_translation_model.keras")

In [13]:
# Store the padding length next to the model; inference code needs the same
# value to pad its inputs.
with open("max_len.txt", "w") as max_len_file:
    max_len_file.write(f"{max_len}")

In [7]:
# Restore the model from the file written by model.save()
loaded_model = load_model(filepath="language_translation_model.keras")

In [25]:
# Ask the user for the sentence to translate
user_input = input("Enter an English sentence: ")

# Encode it the same way as the training data: word ids, padded at the end to max_len
input_seq = pad_sequences(
    tokenizer_eng.texts_to_sequences([user_input]),
    maxlen=max_len,
    padding='post',
)

# One score vector over the German vocabulary per position
predicted_seq = loaded_model.predict(input_seq)

# Keep the highest-scoring word id at each position of the single input
# sentence, skip id 0 (padding), and map the remaining ids back to German words.
best_word_ids = np.argmax(predicted_seq, axis=-1)[0]
predicted_text = [
    tokenizer_ger.index_word.get(token_id, '<OOV>')
    for token_id in best_word_ids
    if token_id != 0
]

# Display the result
german_translation = ' '.join(predicted_text)
print(f"German Translation: {german_translation}")

Enter an English sentence:  i am studying


German Translation: ich lerne sehr
