In [2]:
#Import necessary Libraries
import pandas as pd
import numpy as np

import string
import re
import os

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization
from tensorflow import keras
from tensorflow.keras import layers

from params import *





In [3]:
# Load the English/Spanish sentence pairs from the CSV file into a DataFrame.
my_df = pd.read_csv("Dataset/data.csv")
print("**********************")
# Last expression of the cell: shows the first five rows as the cell output.
my_df.head()



**********************


Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [4]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
my_df.info()
print("**********************")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118964 entries, 0 to 118963
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  118964 non-null  object
 1   spanish  118964 non-null  object
dtypes: object(2)
memory usage: 1.8+ MB
**********************


In [5]:
# Frame every Spanish sentence with the begin/end tokens from Get_Params.
# The assignment overwrites my_df["spanish"] in place, so the helper leaves
# sentences that already carry both tokens untouched: re-running this cell
# therefore no longer wraps them a second time.
def _wrap_with_tokens(sentence):
    """Return `sentence` surrounded by the begin and end tokens.

    Args:
        sentence: A single Spanish sentence (str).

    Returns:
        "<begin_token> <sentence> <end_token>", or `sentence` unchanged when it
        already starts with the begin token and ends with the end token.
    """
    prefix = Get_Params.begin_token + " "
    suffix = " " + Get_Params.end_token
    if sentence.startswith(prefix) and sentence.endswith(suffix):
        return sentence
    return prefix + sentence + suffix


my_df["spanish"] = my_df["spanish"].apply(_wrap_with_tokens)

In [6]:
# Count the null entries of each column, report them, then preview the frame.
missing_values = my_df.isna().sum()
print("Missing Values:", missing_values, sep="\n")

my_df.head()

Missing Values:
english    0
spanish    0
dtype: int64


Unnamed: 0,english,spanish
0,Go.,[start] Ve. [end]
1,Go.,[start] Vete. [end]
2,Go.,[start] Vaya. [end]
3,Go.,[start] Váyase. [end]
4,Hi.,[start] Hola. [end]


In [7]:
# Characters stripped from Spanish text: ASCII punctuation plus the inverted
# question mark, minus the square brackets (presumably so that bracketed
# markers such as "[start]" / "[end]" keep their brackets).
prepro_ = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")

# Show the resulting character set.
print(prepro_)

# Custom standardizer handed to the Spanish TextVectorization layer.
def perform_spanish_standardization(input_string):
    """Lower-case `input_string` and delete every character listed in `prepro_`.

    Args:
        input_string: String tensor to clean.

    Returns:
        The lower-cased string tensor with the `prepro_` characters removed.
    """
    # Character class built from the escaped punctuation set.
    strip_pattern = f"[{re.escape(prepro_)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")

# English vectorizer: integer token ids, vocabulary capped at vocab_size,
# every sequence padded/truncated to sequence_length. No `standardize` is
# passed, so the layer's default standardization applies.
vect_eng = TextVectorization(
    output_sequence_length=Get_Params.sequence_length,
    output_mode="int",
    max_tokens=Get_Params.vocab_size,
)

# Spanish vectorizer: same vocabulary cap, custom standardizer, and one extra
# position because `preprocess` later slices each sequence into an input
# ([:, :-1]) and a target ([:, 1:]).
vect_spa = TextVectorization(
    standardize=perform_spanish_standardization,
    output_sequence_length=Get_Params.sequence_length + 1,
    output_mode="int",
    max_tokens=Get_Params.vocab_size,
)

# Build each vocabulary from the matching DataFrame column.
vect_eng.adapt(my_df["english"].tolist())
vect_spa.adapt(my_df["spanish"].tolist())

!"#$%&'()*+,-./:;<=>?@\^_`{|}~¿




In [8]:
def preprocess(english, spanish):
    """Turn a batch of (english, spanish) sequences into model inputs and targets.

    Args:
        english: Batch of English sequences, passed through unchanged.
        spanish: 2-D batch of Spanish sequences (indexed as [batch, position]).

    Returns:
        ((english, spanish without its last position), spanish without its
        first position).
    """
    decoder_input = spanish[:, :-1]  # drop the final position
    target = spanish[:, 1:]  # drop the first position (shift left by one)
    return (english, decoder_input), target



In [9]:
def gen_data(df, batch_size, mode, shuffle_buffer=256):
    """Build a batched `tf.data.Dataset` of translation examples from `df`.

    Args:
        df: DataFrame with "english" and "spanish" text columns.
        batch_size: Number of sentence pairs per batch. A trailing partial
            batch is dropped, so the dataset yields len(df) // batch_size
            batches.
        mode: "train" shuffles the examples; any other value keeps their order.
        shuffle_buffer: Size of the shuffle buffer used when mode == "train".

    Returns:
        A dataset of ((english, spanish[:, :-1]), spanish[:, 1:]) batches, as
        produced by `preprocess`.
    """
    # Vectorize both columns up front; the results are in-memory tensors.
    english = vect_eng(list(df["english"]))
    spanish = vect_spa(list(df["spanish"]))
    dataset = tf.data.Dataset.from_tensor_slices((english, spanish))
    if mode == "train":
        # Shuffle individual examples. Nothing downstream caches the result,
        # so the order is re-drawn each time the dataset is iterated.
        dataset = dataset.shuffle(shuffle_buffer)
    # drop_remainder=True keeps only full batches.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    # Split every Spanish batch into decoder input and shifted target.
    dataset = dataset.map(preprocess)
    # No cache(): the data already lives in memory, and caching after
    # shuffle() would freeze the first epoch's order (and is discarded with a
    # warning when the dataset is only partially read, e.g. via take(1)).
    return dataset.prefetch(16)

In [10]:
# Hold out Get_Params.validation_split of the rows for validation.
# random_state is pinned so that Restart & Run All reproduces the same split.
train, valid = train_test_split(
    my_df,
    test_size=Get_Params.validation_split,
    random_state=42,
)
print("Training set")
train.shape

Training set


(101119, 2)

In [11]:
# `valid` is the hold-out created with Get_Params.validation_split, so it is
# labelled as validation data rather than test data.
print("Validation Set")
valid.shape

Testing Set


(17845, 2)

In [12]:
# Build the batched tf.data pipelines for both splits; gen_data only shuffles
# when mode == "train".
train_ds = gen_data(train, batch_size=Get_Params.batch_size, mode="train")
valid_ds = gen_data(valid, batch_size=Get_Params.batch_size, mode="valid")

In [13]:
# Pull a single batch to inspect its structure:
# ((english, spanish input), spanish target), as assembled by `preprocess`.
for batch in train_ds.take(1):
    print(batch)

((<tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[   6,  118,    3, ...,    0,    0,    0],
       [   8,   13,  289, ...,    0,    0,    0],
       [  24,   15,    5, ...,    0,    0,    0],
       ...,
       [ 105,   95,    4, ...,    0,    0,    0],
       [   2, 7824,    8, ...,    0,    0,    0],
       [   6,   50, 1662, ...,    0,    0,    0]], dtype=int64)>, <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[  2,   8,  91, ...,   0,   0,   0],
       [  2,  12, 110, ...,   0,   0,   0],
       [  2,  27, 425, ...,   0,   0,   0],
       ...,
       [  2,  89,  47, ...,   0,   0,   0],
       [  2,   9,   1, ...,   0,   0,   0],
       [  2,   8, 130, ...,   0,   0,   0]], dtype=int64)>), <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[   8,   91,    5, ...,    0,    0,    0],
       [  12,  110,    5, ...,    0,    0,    0],
       [  27,  425,   18, ...,    0,    0,    0],
       ...,
       [  89,   47,    5, ...,    0,    0,    0],
       [   9,    1,  

In [14]:
class FNetEncoder(layers.Layer):
    """Encoder block that mixes its input with a 2-D FFT, then a feed-forward net.

    The block adds the real part of the FFT to the input, normalizes, runs a
    two-layer dense projection, adds that back and normalizes again.

    Args:
        embed_dim: Output width of the second dense layer. It must match the
            last dimension of the inputs, because the projection is added
            back to them.
        dense_dim: Width of the hidden (ReLU) dense layer.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        # Position-wise feed-forward network: expand with ReLU, project back.
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs):
        """Apply the Fourier mixing and feed-forward sub-layers.

        Args:
            inputs: Real-valued tensor; the FFT runs over its two innermost
                axes (presumably sequence and embedding - confirm against the
                caller).

        Returns:
            A tensor with the same shape as `inputs`.
        """
        # fft2d needs a complex input; only the real part of the result is kept.
        inp_complex = tf.cast(inputs, tf.complex64)
        fft = tf.math.real(tf.signal.fft2d(inp_complex))
        # Residual connection around the Fourier mixing, then normalization.
        proj_input = self.layernorm_1(inputs + fft)
        proj_output = self.dense_proj(proj_input)
        # Residual connection around the feed-forward network.
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created
        when a model containing it is saved and loaded."""
        config = super().get_config()
        config.update({"embed_dim": self.embed_dim, "dense_dim": self.dense_dim})
        return config