In [3]:
import pandas as pd
import json
import pickle
import os
# Relative path of the pre-cleaned question/response dataset.
file_path = 'cleaned_data.csv'

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,ID,question,response
0,0,is the worst customer service,can you please send us a private message so th...
1,0,i did,please send us a private message so that we ca...
2,0,and how do you propose we do that i have sent ...,i understand i would like to assist you we wou...
3,1,yall lie about your great connection 5 bars lt...,h there we would definitely like to work with ...
4,1,since i signed up with yousince day 1,we understand your concerns and we would like ...


In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Converting the columns to strings so the tokenizers only ever receive text
# (non-string cells, e.g. NaN, would otherwise break tokenisation).
df["question"] = df["question"].astype(str)
df["response"] = df["response"].astype(str)

# Splitting the data into training (80%) and validation (20%) sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Defining hyperparameters (each one is defined exactly once, here)
batch_size = 64           # Adjust as needed
epochs = 10               # Number of training epochs
max_sequence_length = 50  # Every sequence is padded/truncated to this length
embedding_dim = 256       # Dimension of word embeddings
hidden_units = 512        # Number of LSTM units

# Initializing and fitting tokenizers for both questions and responses.
# They are fitted on the training split only, so validation words that were
# never seen during training map to the "<OOV>" token.
tokenizer_question = Tokenizer(oov_token="<OOV>")
tokenizer_response = Tokenizer(oov_token="<OOV>")

tokenizer_question.fit_on_texts(train_df["question"])
tokenizer_response.fit_on_texts(train_df["response"])


def encode_texts(tokenizer, texts):
    """Convert raw texts to a 2-D integer array of width ``max_sequence_length``.

    Args:
        tokenizer: A fitted Keras ``Tokenizer``.
        texts: Iterable of strings to encode.

    Returns:
        Array with one row per text; rows are zero-padded at the end
        (``padding="post"``) and cut at the end (``truncating="post"``).
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(
        sequences,
        maxlen=max_sequence_length,
        padding="post",
        truncating="post",
    )


# Converting text data to padded sequences (pad_sequences already returns
# NumPy arrays, so no extra np.array() conversion is needed).
train_question_sequences = encode_texts(tokenizer_question, train_df["question"])
train_response_sequences = encode_texts(tokenizer_response, train_df["response"])
val_question_sequences = encode_texts(tokenizer_question, val_df["question"])
val_response_sequences = encode_texts(tokenizer_response, val_df["response"])

# Teacher forcing: the decoder input is the target shifted right by one step,
# with index 0 in the first position acting as the start symbol. At step t the
# decoder therefore sees the tokens before t and has to predict token t.
# Feeding the unshifted responses as both input and target would only teach
# the network to copy its own input.
train_decoder_inputs = np.pad(train_response_sequences[:, :-1], ((0, 0), (1, 0)))
val_decoder_inputs = np.pad(val_response_sequences[:, :-1], ((0, 0), (1, 0)))

# Reduced sample of the training frame.
# NOTE: this 10% sample is not used by the training call below, which fits on
# the full training split; it is kept in case other cells rely on it.
train_df_sample = train_df.sample(frac=0.1, random_state=42)

# Vocabulary sizes: +1 because the tokenizers number words from 1 and index 0
# is left for padding.
input_vocab_size = len(tokenizer_question.word_index) + 1
target_vocab_size = len(tokenizer_response.word_index) + 1  # Vocabulary size for responses

# Defining the encoder: embeds the question and keeps only the final LSTM
# states, which summarise the question for the decoder.
encoder_inputs = tf.keras.layers.Input(shape=(max_sequence_length,))
encoder_embedding = tf.keras.layers.Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = tf.keras.layers.LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Defining the decoder: starts from the encoder states and emits a
# distribution over the response vocabulary at every timestep.
decoder_inputs = tf.keras.layers.Input(shape=(max_sequence_length,))
decoder_embedding = tf.keras.layers.Embedding(target_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Creating the model
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compiling the model. The sparse loss takes the integer token ids directly,
# so the targets do not need to be one-hot encoded.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training the model: inputs are (question, shifted response), target is the
# unshifted response.
history = model.fit(
    [train_question_sequences, train_decoder_inputs],
    train_response_sequences,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([val_question_sequences, val_decoder_inputs], val_response_sequences),
)

# Saving the model in HDF5 (.h5) format.
# TODO: also save a copy with the .keras extension.
model.save("seq2seq_lstm_model.h5")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:
from tensorflow.keras.callbacks import Callback

class EarlyStoppingAtMinLoss(Callback):
    """Stop training when ``val_loss`` stops improving and restore the best weights.

    After every epoch the reported ``val_loss`` is compared with the lowest
    value seen so far in the current run. An improvement stores a snapshot of
    the model weights; otherwise a counter is incremented. Once the counter
    reaches ``patience`` the run is stopped and the stored snapshot (if any)
    is loaded back into the model.

    Args:
        patience: Number of epochs without improvement after which training
            is stopped.
    """

    def __init__(self, patience=2):
        super().__init__()
        self.patience = patience
        self._reset_state()

    def _reset_state(self):
        """Forget everything recorded during a previous training run."""
        self.best_weights = None
        self.best_val_loss = float('inf')
        self.wait = 0

    def on_train_begin(self, logs=None):
        # Start every fit() call from a clean slate so that reusing the same
        # instance does not compare against (or restore) a previous run.
        self._reset_state()

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None, so fall back to an empty dict before .get().
        current_val_loss = (logs or {}).get("val_loss")
        if current_val_loss is None:
            # No validation data was supplied; nothing to monitor.
            return

        if current_val_loss < self.best_val_loss:
            self.best_val_loss = current_val_loss
            self.best_weights = self.model.get_weights()
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                print(f"Epoch {epoch + 1}: Early stopping due to no improvement in validation loss.")
                self.model.stop_training = True
                # best_weights is still None when no epoch ever improved
                # (e.g. val_loss was NaN from the start); keep the current
                # weights in that case instead of crashing in set_weights().
                if self.best_weights is not None:
                    self.model.set_weights(self.best_weights)


# Stop once validation loss has failed to improve for 2 epochs.
early_stopping_callback = EarlyStoppingAtMinLoss(patience=2)

# Teacher forcing: the decoder input is the target shifted right by one step,
# with index 0 in the first position acting as the start symbol. Feeding the
# unshifted responses as both decoder input and target would only teach the
# network to copy its input.
train_decoder_inputs = np.pad(train_response_sequences[:, :-1], ((0, 0), (1, 0)))
val_decoder_inputs = np.pad(val_response_sequences[:, :-1], ((0, 0), (1, 0)))

# NOTE: fit() continues from the model's current weights and rebinds
# `history`, so the history of any earlier run is no longer reachable.
history = model.fit(
    [train_question_sequences, train_decoder_inputs],
    train_response_sequences,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([val_question_sequences, val_decoder_inputs], val_response_sequences),
    callbacks=[early_stopping_callback],
)
