In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense


In [None]:
# Location of the raw corpus (Colab's /content working directory).
dataset_path = "/content/science_questions_answers.txt"

# Read the whole file into a list of lines, newline characters kept.
with open(dataset_path, mode="r", encoding="utf-8") as file:
    lines = list(file)

In [None]:
# Split the raw lines into two parallel lists: questions[i] is answered by answers[i].
# Only lines that START with "Q:" / "A:" are used; everything else is ignored.
questions = []
answers = []
for line in lines:
    if line.startswith("Q:"):
        # Slice off just the leading marker. str.replace would also delete any
        # later "Q:" inside the sentence and silently corrupt the text.
        questions.append(line[len("Q:"):].strip())
    elif line.startswith("A:"):
        # Same reasoning: "A: Vitamin A: retinol" must keep its inner "A:".
        answers.append(line[len("A:"):].strip())

# The two lists are paired by position downstream, so fail early with a clear
# message rather than train on misaligned question/answer pairs.
if len(questions) != len(answers):
    raise ValueError(
        f"Dataset is unbalanced: {len(questions)} questions vs {len(answers)} answers"
    )

In [None]:
# Fit a single tokenizer on questions and answers together so both share one word -> id map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([*questions, *answers])

# Word ids start at 1 (0 is never assigned to a word), so the id space needs one extra slot.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Encode each question and each answer as a list of integer word ids.
question_sequences, answer_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (questions, answers)
)

In [None]:
# Common sequence length: the longest encoded question or answer.
max_length = max(
    max(map(len, question_sequences)),
    max(map(len, answer_sequences)),
)

# Pad both sides at the END (padding='post') up to that common length.
question_padded, answer_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (question_sequences, answer_sequences)
)

In [None]:
# Training target for each question: the id of the LAST real word of its answer.
#
# The label must come from the unpadded `answer_sequences`. `answer_padded` is
# padded with padding='post', so its final column holds the pad id 0 for every
# answer shorter than max_length. Reading seq[-1] from it makes almost every
# label 0, which lets the model reach 100% accuracy by always predicting "padding".
# An answer with no known tokens keeps the label 0.
answers_output = np.array(
    [seq[-1] if seq else 0 for seq in answer_sequences],
    dtype=answer_padded.dtype,
)

In [None]:
# Define the RNN model: embedded question tokens -> two stacked SimpleRNN layers
# -> dense head that outputs a probability for every word id in the vocabulary.
model = Sequential([
    # mask_zero=True marks id 0 as padding so the RNN layers skip the trailing
    # pad steps added by padding='post'; otherwise the final state is computed
    # after a run of padding tokens.
    # The deprecated `input_length` argument is dropped; the sequence length is
    # inferred from the data on the first call.
    Embedding(vocab_size, 64, mask_zero=True),
    SimpleRNN(128, return_sequences=True),  # full sequence out, feeds the next RNN
    SimpleRNN(64),                          # last state only
    Dense(64, activation='relu'),
    Dense(vocab_size, activation='softmax')  # Output layer with softmax activation
])

# Compile the model. Targets are integer word ids, hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model. Keep the History object so loss/accuracy curves can be
# inspected later instead of leaking its repr as the cell output.
history = model.fit(question_padded, answers_output, epochs=5, batch_size=4, verbose=1)

Epoch 1/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.6199 - loss: 4.8967
Epoch 2/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.4565
Epoch 3/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0067
Epoch 4/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0028
Epoch 5/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.0022


<keras.src.callbacks.history.History at 0x7d0fb8292050>

In [None]:
def chat_response(text):
    """Return the model's one-word reply to ``text``.

    The text is encoded with the notebook-level ``tokenizer``, padded to
    ``max_length`` and passed to ``model``; the word whose id has the highest
    predicted probability is returned.

    Args:
        text: The user's message as a plain string.

    Returns:
        The predicted word, or the fallback sentence
        "Sorry, I don't understand." when the input contains no word known to
        the tokenizer or the predicted id does not map to a word (e.g. id 0).
    """
    fallback = "Sorry, I don't understand."

    seq = tokenizer.texts_to_sequences([text])
    # No in-vocabulary token: the model would only see padding, so any
    # prediction would be meaningless. Answer with the fallback directly.
    if not seq[0]:
        return fallback

    seq_padded = pad_sequences(seq, maxlen=max_length, padding='post')
    pred = model.predict(seq_padded, verbose=0)
    # Highest-probability word id for the single input row, as a plain int.
    predicted_index = int(np.argmax(pred, axis=-1)[0])
    return tokenizer.index_word.get(predicted_index, fallback)

# Quick smoke test: ask the bot for a reply to a greeting and show it.
print(chat_response(text="hello"))

Sorry, I don't understand.
