### This file consists of the code for training the neural network with the Questions and Answers to generate better search optimization.

This code uses the LSTM model
The code does the following:
- Converts questions and answers to sequences of integers using a tokenizer.
- We have the LSTM Architecture where the steps involving the encoder and decoder happen.
- We use sparse categorical crossentropy to optimize the prediction of the next word in th eanswer sequence.

In [17]:
# Import all the essential libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# Load the dataset
data = pd.read_csv("../data/final_dataset.csv")

# Use the question as input and the answer as output
questions = data["Question_Body"].astype(str).values
answers = data["Answer_Body"].astype(str).values

In [21]:
# Split the Dataset into training and testing.
train_questions, test_questions, train_answers, test_answers = train_test_split(questions, answers, test_size=0.2, random_state=42)

# Tokenize the text
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(questions + answers)

# Convert the text to sequences
train_questions_seq = tokenizer.texts_to_sequences(train_questions)
train_answers_seq = tokenizer.texts_to_sequences(train_answers)
test_questions_seq = tokenizer.texts_to_sequences(test_questions)
test_answers_seq = tokenizer.texts_to_sequences(test_answers)

# Pad sequences to ensure uniform input size
max_length = 100
train_questions_padded = pad_sequences(train_questions_seq, maxlen=max_length, padding='post')
train_answers_padded = pad_sequences(train_answers_seq, maxlen=max_length, padding='post')
test_questions_padded = pad_sequences(test_questions_seq, maxlen=max_length, padding='post')
test_answers_padded = pad_sequences(test_answers_seq, maxlen=max_length, padding='post')

# Vocabulary size for embedding layer
vocab_size = len(tokenizer.word_index) + 1

### LSTM Model

In this section I have built the encoder and the decoder in order to train the neural network.
This is a LSTM model for prediction of answers.

In [24]:
# Build the neural network model.
# Encoder
encoder_inputs = Input(shape=(max_length,))
encoder_embeddings = Embedding(vocab_size, 128, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(128, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embeddings)

# Decoder
decoder_inputs = Input(shape=(max_length, ))
decoder_embedding = Embedding(vocab_size, 128, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Summary of the mdoel
model.summary()

# Prepare the decoder output for training
train_answers_padded = np.expand_dims(train_answers_padded, axis=-1)
test_answers_padded = np.expand_dims(test_answers_padded, axis=-1)

# Train the model
model.fit(
    [train_questions_padded, train_answers_padded],
    train_answers_padded,
    validation_data=([test_questions_padded, test_answers_padded], test_answers_padded),
    batch_size=16,
    epochs=10
)

# Save the trained Model
model.save("../models/neural_network_model.h5")

Epoch 1/10




[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1700s[0m 803ms/step - accuracy: 0.0918 - loss: 5.9435 - val_accuracy: 0.5365 - val_loss: 2.2341
Epoch 2/10
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1710s[0m 809ms/step - accuracy: 0.6080 - loss: 1.6338 - val_accuracy: 0.6970 - val_loss: 0.6041
Epoch 3/10
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1652s[0m 782ms/step - accuracy: 0.7097 - loss: 0.4609 - val_accuracy: 0.7227 - val_loss: 0.2364
Epoch 4/10
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1642s[0m 777ms/step - accuracy: 0.7354 - loss: 0.1658 - val_accuracy: 0.7347 - val_loss: 0.1018
Epoch 5/10
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1641s[0m 777ms/step - accuracy: 0.7420 - loss: 0.0566 - val_accuracy: 0.7383 - val_loss: 0.0526
Epoch 6/10
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1648s[0m 780ms/step - accuracy: 0.7437 - loss: 0.0205 - val_accuracy: 0.7398 - val_loss: 0.03

