In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

english_file = 'english-corpus.txt'
urdu_file = 'urdu-corpus.txt'

with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()

with open(urdu_file, 'r', encoding='utf-8') as file:
    urdu_sentences = file.readlines()

assert len(english_sentences) == len(urdu_sentences), "Mismatch in number of sentences between English and Urdu files."
data = pd.DataFrame({'English': [sentence.strip() for sentence in english_sentences],
                     'Urdu': [sentence.strip() for sentence in urdu_sentences]})

english_tokenizer = Tokenizer()
urdu_tokenizer = Tokenizer()

english_tokenizer.fit_on_texts(data['English'])
urdu_tokenizer.fit_on_texts(data['Urdu'])

english_sequences = english_tokenizer.texts_to_sequences(data['English'])
urdu_sequences = urdu_tokenizer.texts_to_sequences(data['Urdu'])

max_length = 20
english_padded = pad_sequences(english_sequences, maxlen=max_length, padding='post', truncating='post')
urdu_padded = pad_sequences(urdu_sequences, maxlen=max_length, padding='post', truncating='post')

decoder_input_data = urdu_padded[:, :-1]
decoder_output_data = urdu_padded[:, 1:]

decoder_input_data = np.pad(decoder_input_data, ((0, 0), (0, 1)), mode='constant', constant_values=0)
decoder_output_data = np.pad(decoder_output_data, ((0, 0), (0, 1)), mode='constant', constant_values=0)

embedding_dim = 100
rnn_units = 256

vocab_size_eng = len(english_tokenizer.word_index) + 1
vocab_size_urdu = len(urdu_tokenizer.word_index) + 1

encoder_inputs = Input(shape=(max_length,))
encoder_embedding = Embedding(vocab_size_eng, embedding_dim, input_length=max_length)(encoder_inputs)
encoder_rnn = SimpleRNN(rnn_units, return_state=True)(encoder_embedding)
encoder_outputs, state_h = encoder_rnn

decoder_inputs = Input(shape=(max_length,))
decoder_embedding = Embedding(vocab_size_urdu, embedding_dim, input_length=max_length)(decoder_inputs)
decoder_rnn = SimpleRNN(rnn_units, return_sequences=True)(decoder_embedding, initial_state=state_h)
decoder_outputs = TimeDistributed(Dense(vocab_size_urdu, activation='softmax'))(decoder_rnn)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit([english_padded, decoder_input_data], decoder_output_data,
                    epochs=50,
                    batch_size=64,
                    validation_split=0.2,
                    callbacks=[early_stopping])

print("Training Complete")


Epoch 1/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 216ms/step - accuracy: 0.7792 - loss: 2.3474 - val_accuracy: 0.8138 - val_loss: 1.2028
Epoch 2/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 249ms/step - accuracy: 0.8165 - loss: 1.1521 - val_accuracy: 0.8240 - val_loss: 1.0940
Epoch 3/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 237ms/step - accuracy: 0.8286 - loss: 1.0323 - val_accuracy: 0.8332 - val_loss: 1.0154
Epoch 4/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 232ms/step - accuracy: 0.8389 - loss: 0.9359 - val_accuracy: 0.8400 - val_loss: 0.9707
Epoch 5/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 228ms/step - accuracy: 0.8457 - loss: 0.8621 - val_accuracy: 0.8427 - val_loss: 0.9453
Epoch 6/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 229ms/step - accuracy: 0.8496 - loss: 0.8104 - val_accuracy: 0.8466 - val_loss: 0.9296
Epoch 7/50

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense, Bidirectional, RepeatVector, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

with open('english-corpus.txt', 'r', encoding='utf-8') as f:
    english_sentences = [line.strip() for line in f.readlines()]

with open('urdu-corpus.txt', 'r', encoding='utf-8') as f:
    urdu_sentences = [line.strip() for line in f.readlines()]

assert len(english_sentences) == len(urdu_sentences), "Sentence count mismatch."

eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)

urdu_tokenizer = Tokenizer()
urdu_tokenizer.fit_on_texts(urdu_sentences)
urdu_sequences = urdu_tokenizer.texts_to_sequences(urdu_sentences)

max_len = 20
encoder_input_data = pad_sequences(eng_sequences, maxlen=max_len, padding='post')
decoder_output_data = pad_sequences(urdu_sequences, maxlen=max_len, padding='post')

decoder_input_data = np.zeros_like(decoder_output_data)
decoder_input_data[:, 1:] = decoder_output_data[:, :-1]

eng_vocab_size = len(eng_tokenizer.word_index) + 1
urdu_vocab_size = len(urdu_tokenizer.word_index) + 1

embedding_dim = 100
rnn_units = 256

encoder_inputs = Input(shape=(max_len,))
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim, input_length=max_len)(encoder_inputs)
encoder_outputs = Bidirectional(SimpleRNN(rnn_units, return_sequences=False))(encoder_embedding)

context_vector = RepeatVector(max_len)(encoder_outputs)
decoder_inputs = Input(shape=(max_len,))
decoder_embedding = Embedding(input_dim=urdu_vocab_size, output_dim=embedding_dim, input_length=max_len)(decoder_inputs)

decoder_combined = decoder_embedding
decoder_rnn = SimpleRNN(rnn_units * 2, return_sequences=True)(decoder_combined)
decoder_outputs = TimeDistributed(Dense(urdu_vocab_size, activation='softmax'))(decoder_rnn)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data[..., np.newaxis],
    epochs=50,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping]
)

print("Training Complete.")

Epoch 1/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 324ms/step - accuracy: 0.7435 - loss: 2.2680 - val_accuracy: 0.7735 - val_loss: 1.3972
Epoch 2/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 327ms/step - accuracy: 0.7775 - loss: 1.3362 - val_accuracy: 0.7854 - val_loss: 1.2676
Epoch 3/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 321ms/step - accuracy: 0.7898 - loss: 1.1929 - val_accuracy: 0.7945 - val_loss: 1.2010
Epoch 4/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 353ms/step - accuracy: 0.7991 - loss: 1.0940 - val_accuracy: 0.7988 - val_loss: 1.1695
Epoch 5/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 334ms/step - accuracy: 0.8048 - loss: 1.0276 - val_accuracy: 0.8022 - val_loss: 1.1497
Epoch 6/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 337ms/step - accuracy: 0.8098 - loss: 0.9716 - val_accuracy: 0.8037 - val_loss: 1.1411
Epoch

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

english_file = 'english-corpus.txt'
urdu_file = 'urdu-corpus.txt'

with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()

with open(urdu_file, 'r', encoding='utf-8') as file:
    urdu_sentences = file.readlines()

assert len(english_sentences) == len(urdu_sentences), "Mismatch in number of sentences between English and Urdu files."
data = pd.DataFrame({'English': [sentence.strip() for sentence in english_sentences],
                     'Urdu': [sentence.strip() for sentence in urdu_sentences]})

english_tokenizer = Tokenizer()
urdu_tokenizer = Tokenizer()

english_tokenizer.fit_on_texts(data['English'])
urdu_tokenizer.fit_on_texts(data['Urdu'])

english_sequences = english_tokenizer.texts_to_sequences(data['English'])
urdu_sequences = urdu_tokenizer.texts_to_sequences(data['Urdu'])

max_length = 20
english_padded = pad_sequences(english_sequences, maxlen=max_length, padding='post', truncating='post')
urdu_padded = pad_sequences(urdu_sequences, maxlen=max_length, padding='post', truncating='post')

decoder_input_data = urdu_padded[:, :-1]
decoder_output_data = urdu_padded[:, 1:]

decoder_input_data = np.pad(decoder_input_data, ((0, 0), (0, 1)), mode='constant', constant_values=0)
decoder_output_data = np.pad(decoder_output_data, ((0, 0), (0, 1)), mode='constant', constant_values=0)

embedding_dim = 100
lstm_units = 256

vocab_size_eng = len(english_tokenizer.word_index) + 1
vocab_size_urdu = len(urdu_tokenizer.word_index) + 1

encoder_inputs = Input(shape=(max_length,))
encoder_embedding = Embedding(vocab_size_eng, embedding_dim, input_length=max_length)(encoder_inputs)
encoder_lstm, forward_h, forward_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
encoder_state = [forward_h, forward_c]

decoder_inputs = Input(shape=(max_length,))
decoder_embedding = Embedding(vocab_size_urdu, embedding_dim, input_length=max_length)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True)(decoder_embedding, initial_state=encoder_state)
decoder_outputs = TimeDistributed(Dense(vocab_size_urdu, activation='softmax'))(decoder_lstm)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit([english_padded, decoder_input_data], decoder_output_data,
                    epochs=50,
                    batch_size=64,
                    validation_split=0.2,
                    callbacks=[early_stopping])

print("Training Complete")

Epoch 1/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 257ms/step - accuracy: 0.7853 - loss: 2.4296 - val_accuracy: 0.8106 - val_loss: 1.2333
Epoch 2/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 300ms/step - accuracy: 0.8118 - loss: 1.1986 - val_accuracy: 0.8177 - val_loss: 1.1567
Epoch 3/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 256ms/step - accuracy: 0.8206 - loss: 1.1138 - val_accuracy: 0.8242 - val_loss: 1.0924
Epoch 4/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 276ms/step - accuracy: 0.8281 - loss: 1.0373 - val_accuracy: 0.8305 - val_loss: 1.0403
Epoch 5/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 262ms/step - accuracy: 0.8353 - loss: 0.9689 - val_accuracy: 0.8375 - val_loss: 0.9933
Epoch 6/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 257ms/step - accuracy: 0.8428 - loss: 0.9014 - val_accuracy: 0.8444 - val_loss: 0.9460
Epoch 7/5

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LayerNormalization, Dropout, MultiHeadAttention, Add
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

english_file = 'english-corpus.txt'
urdu_file = 'urdu-corpus.txt'

with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()

with open(urdu_file, 'r', encoding='utf-8') as file:
    urdu_sentences = file.readlines()

assert len(english_sentences) == len(urdu_sentences), "Mismatch in number of sentences between English and Urdu files."
data = pd.DataFrame({'English': [sentence.strip() for sentence in english_sentences],
                     'Urdu': [sentence.strip() for sentence in urdu_sentences]})

english_tokenizer = Tokenizer()
urdu_tokenizer = Tokenizer()

english_tokenizer.fit_on_texts(data['English'])
urdu_tokenizer.fit_on_texts(data['Urdu'])

english_sequences = english_tokenizer.texts_to_sequences(data['English'])
urdu_sequences = urdu_tokenizer.texts_to_sequences(data['Urdu'])

max_length = 20
english_padded = pad_sequences(english_sequences, maxlen=max_length, padding='post', truncating='post')
urdu_padded = pad_sequences(urdu_sequences, maxlen=max_length, padding='post', truncating='post')

decoder_input_data = urdu_padded[:, :-1]
decoder_output_data = urdu_padded[:, 1:]
decoder_input_data = np.pad(decoder_input_data, ((0, 0), (0, 1)), mode='constant', constant_values=0)
decoder_output_data = np.pad(decoder_output_data, ((0, 0), (0, 1)), mode='constant', constant_values=0)

embedding_dim = 100
num_heads = 8
ff_dim = 512
num_layers = 4

vocab_size_eng = len(english_tokenizer.word_index) + 1
vocab_size_urdu = len(urdu_tokenizer.word_index) + 1

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(inputs, inputs)
    attention = Dropout(dropout)(attention)
    attention = Add()([inputs, attention])
    attention = LayerNormalization()(attention)
    
    ffn = Dense(ff_dim, activation='relu')(attention)
    ffn = Dense(inputs.shape[-1])(ffn)
    ffn = Dropout(dropout)(ffn)
    ffn = Add()([attention, ffn])
    ffn = LayerNormalization()(ffn)
    
    return ffn

encoder_inputs = Input(shape=(max_length,))
encoder_embedding = Embedding(vocab_size_eng, embedding_dim, input_length=max_length)(encoder_inputs)

x = encoder_embedding
for _ in range(num_layers):
    x = transformer_encoder(x, head_size=embedding_dim, num_heads=num_heads, ff_dim=ff_dim)

decoder_inputs = Input(shape=(max_length,))
decoder_embedding = Embedding(vocab_size_urdu, embedding_dim, input_length=max_length)(decoder_inputs)

y = decoder_embedding
for _ in range(num_layers):
    y = transformer_encoder(y, head_size=embedding_dim, num_heads=num_heads, ff_dim=ff_dim)
decoder_outputs = Dense(vocab_size_urdu, activation='softmax')(y)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit([english_padded, decoder_input_data], decoder_output_data,
                    epochs=50,
                    batch_size=64,
                    validation_split=0.2,
                    callbacks=[early_stopping])

print("Training Complete")

Epoch 1/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 322ms/step - accuracy: 0.7887 - loss: 2.9755 - val_accuracy: 0.8734 - val_loss: 0.8676
Epoch 2/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 338ms/step - accuracy: 0.8889 - loss: 0.7501 - val_accuracy: 0.9209 - val_loss: 0.5571
Epoch 3/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 306ms/step - accuracy: 0.9335 - loss: 0.4431 - val_accuracy: 0.9438 - val_loss: 0.4037
Epoch 4/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 330ms/step - accuracy: 0.9567 - loss: 0.2786 - val_accuracy: 0.9556 - val_loss: 0.3287
Epoch 5/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 344ms/step - accuracy: 0.9695 - loss: 0.1871 - val_accuracy: 0.9595 - val_loss: 0.2998
Epoch 6/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 315ms/step - accuracy: 0.9785 - loss: 0.1232 - val_accuracy: 0.9621 - val_loss: 0.2825
Epoch 