In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('data.csv')
df.head

<bound method NDFrame.head of                                                   english  \
0                                                     Go.   
1                                                     Go.   
2                                                     Go.   
3                                                     Go.   
4                                                     Hi.   
...                                                   ...   
118959  There are four main causes of alcohol-related ...   
118960  There are mothers and fathers who will lie awa...   
118961  A carbon footprint is the amount of carbon dio...   
118962  Since there are usually multiple websites on a...   
118963  If you want to sound like a native speaker, yo...   

                                                  spanish  
0                                                     Ve.  
1                                                   Vete.  
2                                                   Vaya.

In [4]:
df.columns

Index(['english', 'spanish'], dtype='object')

In [5]:
source_vocab = set(' '.join(df['english'][25:125]))
target_vocab = set(' '.join(df['spanish'][25:125]))
source_vocab_size = len(source_vocab)
target_vocab_size = len(target_vocab)

In [6]:
print(source_vocab)
print(target_vocab)

{'D', 'l', '!', 'i', 'T', 'R', ' ', '1', 'S', "'", 'N', 'O', 'x', 'm', 'f', 'L', 'W', 'H', 'w', 'C', '?', 'o', 't', 'h', 'k', 'c', 'r', 'e', 'q', 'I', 'G', 'g', '9', 'b', 'p', 'a', 's', 'n', 'd', 'B', 'y', 'u', '.', 'A'}
{'í', 'Ó', 'P', 'D', 'l', 'i', '!', 'á', 'T', 'R', 'ú', 'S', 'M', 'N', 'O', 'm', 'ó', 'f', 'L', 'é', 'H', 'C', '?', 'o', ',', 't', 'h', 'E', 'Á', 'c', 'r', 'e', 'j', 'q', 'g', 'V', 'I', 'G', 'b', '¡', 'p', '.', 'a', 'z', 's', 'Y', 'É', 'v', '¿', 'n', 'd', 'B', 'U', 'y', 'u', ' ', 'A'}


In [7]:
source_char_to_int = {char: idx for idx, char in enumerate(source_vocab)}
target_char_to_int = {char: idx for idx, char in enumerate(target_vocab)}
source_int_to_char = {idx: char for char, idx in source_char_to_int.items()}
target_int_to_char = {idx: char for char, idx in target_char_to_int.items()}

In [8]:
# Convert text sequences to integer sequences
source_sequences = [[source_char_to_int[char] for char in text] for text in df['english'][25:125]]
target_sequences = [[target_char_to_int[char] for char in text] for text in df['spanish'][25:125]]

In [9]:
# Pad sequences to the same length
max_sequence_length = max(len(seq) for seq in source_sequences)
source_sequences = tf.keras.preprocessing.sequence.pad_sequences(source_sequences, maxlen=max_sequence_length, padding='post')
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

In [10]:
# Build the model
input_shape = (max_sequence_length, source_vocab_size)
output_shape = (max_sequence_length, target_vocab_size)

model = tf.keras.models.Sequential([
    # Embedding layer with a larger dimension for richer word representations
    tf.keras.layers.Embedding(source_vocab_size, 512, input_length=max_sequence_length),

    # First Bidirectional RNN layer with dropout and L2 regularization
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(512, return_sequences=True, 
                                kernel_regularizer=tf.keras.regularizers.l2(0.001))),
    tf.keras.layers.Dropout(0.4),

    # Second RNN layer with higher units and dropout
    tf.keras.layers.SimpleRNN(512, return_sequences=True),
    tf.keras.layers.Dropout(0.4),

    # Third RNN layer for more complex patterns
    tf.keras.layers.SimpleRNN(256, return_sequences=True),
    tf.keras.layers.Dropout(0.3),

    # Dense layer for more feature learning
    tf.keras.layers.Dense(512, activation='relu'),

    # Output layer to map to the target vocabulary
    tf.keras.layers.Dense(target_vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [11]:
# One-hot encode the target sequences
target_sequences_one_hot = np.array([tf.keras.utils.to_categorical(seq, num_classes=target_vocab_size) for seq in target_sequences])

In [12]:
# Train the model
model.fit(source_sequences, target_sequences_one_hot,batch_size = 64, epochs=250)

Epoch 1/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 71ms/step - accuracy: 0.0469 - loss: 5.0094
Epoch 2/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.1789 - loss: 4.3173
Epoch 3/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.1964 - loss: 3.9905
Epoch 4/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.2272 - loss: 3.8061
Epoch 5/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.2491 - loss: 3.6146
Epoch 6/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.2903 - loss: 3.4823
Epoch 7/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.3142 - loss: 3.3077
Epoch 8/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.3746 - loss: 3.1610
Epoch 9/250
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x2506f420590>

In [32]:
# Translate a new input sequence
x = input("Enter String : ")
input_sequence = x
input_sequence = [source_char_to_int[char] for char in input_sequence]
input_sequence = tf.keras.preprocessing.sequence.pad_sequences([input_sequence], maxlen=max_sequence_length, padding='post')
output_sequence = model.predict(input_sequence)[0]
# Decode the output sequence
output_sequence = [target_int_to_char[np.argmax(char)] for char in output_sequence]
print("Input Sequence:",x)
print("Translated Sequence:", ''.join(output_sequence))

Enter String :  Thanks!.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Input Sequence: Thanks!.
Translated Sequence: Gracias!
