In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Fetch the NLTK corpora used below for stop-word filtering and lemmatization.
# The cell still ends on the result of the last download so it is displayed.
download_status = [nltk.download(corpus_name) for corpus_name in ('stopwords', 'wordnet')]
download_status[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
# Read the question/answer CSV into a DataFrame
DATASET_PATH = '/content/medquad.csv'
data = pd.read_csv(DATASET_PATH)

In [7]:
# Defining text cleaning functions
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a piece of text for tokenization.

    Steps, in order:
      1. every character that is not a letter, digit or whitespace is
         replaced by a space;
      2. the text is lower-cased;
      3. it is split on whitespace, English stop words are dropped and the
         remaining words are lemmatized.

    Args:
        text: The string to clean.

    Returns:
        The cleaned words joined by single spaces ('' if nothing survives).
    """
    # Replace punctuation with a space instead of deleting it. Deleting glues
    # neighbouring words together ("and/or" -> "andor") and turns contractions
    # such as "don't" into "dont", which is not in the stop-word set; with a
    # space they split into fragments ("don", "t") that the filter can drop.
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = text.lower()
    kept_words = (
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
    return ' '.join(kept_words)

In [8]:

# Applying cleaning functions to the dataset
#
# Rows with a missing question or answer are dropped first: `astype(str)`
# would otherwise turn the missing value into the literal text "nan", which
# would then be tokenized and trained on as if it were real content.
#
# Use a smaller subset of the data for initial testing. Sampling happens
# before cleaning so that only the rows that are kept get lemmatized; cleaning
# is row-wise, so the sampled rows end up with the same cleaned text.
#
# NOTE: this cell rebinds `data`; re-running it without reloading the CSV
# halves the dataset again.
data = (
    data
    .dropna(subset=['question', 'answer'])
    .sample(frac=0.5, random_state=42)
    .assign(
        question=lambda frame: frame['question'].astype(str).apply(clean_text),
        answer=lambda frame: frame['answer'].astype(str).apply(clean_text),
    )
)

In [9]:
# Defining hyperparameters
# Number of tokens every question/answer is padded or truncated to; also the
# length of the encoder and decoder input layers.
MAX_SEQUENCE_LENGTH = 50
# Passed to the Tokenizer as `num_words`; also the input size of both
# Embedding layers, the width of the softmax output and the one-hot depth.
VOCABULARY_SIZE = 2000
# Output dimension of the Embedding layers.
EMBEDDING_DIM = 64
# Number of units in the encoder and decoder LSTMs (and therefore the size of
# the state vectors passed from encoder to decoder).
LSTM_UNITS = 256

In [10]:
# Tokenization
# One tokenizer is fitted on questions and answers together so both share the
# same word -> index mapping. No OOV token is configured, so words outside the
# `num_words` limit are silently dropped by `texts_to_sequences`.
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
tokenizer.fit_on_texts(data['question'].tolist() + data['answer'].tolist())

# Encoding sequences
question_sequences = tokenizer.texts_to_sequences(data['question'].tolist())
answer_sequences = tokenizer.texts_to_sequences(data['answer'].tolist())

# Questions keep the same padding call that the chat loop applies to user input.
question_sequences = pad_sequences(question_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# `pad_sequences` truncates from the *front* by default, which would keep only
# the last MAX_SEQUENCE_LENGTH tokens of a long answer. Answers are generated
# from their first token onwards, so keep the beginning and cut the tail.
answer_sequences = pad_sequences(
    answer_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)

In [11]:
# One-hot encoding the target sequences
# `to_categorical` accepts the whole 2-D array of token ids and returns an
# array with a trailing axis of size VOCABULARY_SIZE. Encoding in one call
# avoids building a Python list of per-row arrays and then copying all of them
# again into a second array, which doubled peak memory for this large tensor.
# (Assumes MAX_SEQUENCE_LENGTH > 1: `to_categorical` squeezes a trailing axis
# of size 1.)
answer_sequences = tf.keras.utils.to_categorical(answer_sequences, num_classes=VOCABULARY_SIZE)

In [12]:
# Hold out 20% of the (question, answer) pairs for evaluation; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    question_sequences,
    answer_sequences,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Encoder: embed the question tokens, run them through an LSTM and keep only
# its final hidden and cell states.
encoder_inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
encoder_embedding_layer = Embedding(
    input_dim=VOCABULARY_SIZE,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(units=LSTM_UNITS, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: an LSTM initialised with the encoder states that emits one output
# per time step, followed by a softmax over the vocabulary.
decoder_inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
decoder_embedding_layer = Embedding(
    input_dim=VOCABULARY_SIZE,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(units=LSTM_UNITS, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(units=VOCABULARY_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

In [14]:
# Training model: takes the encoder input and the decoder input and produces
# the per-step vocabulary distribution (the decoder target).
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [15]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy reported as a metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Training callbacks, both driven by the validation loss:
#  - checkpoint: writes the model to disk whenever val_loss improves
#  - early_stopping: stops after 5 epochs without improvement and restores
#    the best weights seen
checkpoint = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [17]:
# Training the model
def build_decoder_inputs(first_tokens, one_hot_targets):
    """Build teacher-forcing decoder inputs from the one-hot targets.

    At step t the decoder must receive the target token of step t-1, so the
    target ids are shifted right by one position. Position 0 is filled with
    the first token of the corresponding question, which is the same token
    `decode_sequence` uses to seed generation.

    Args:
        first_tokens: 1-D array with the first question token id of each sample.
        one_hot_targets: Array of one-hot targets with the vocabulary on the
            last axis.

    Returns:
        Integer array of decoder input token ids, one row per sample.
    """
    target_ids = np.argmax(one_hot_targets, axis=-1)
    shifted_ids = np.zeros_like(target_ids)
    shifted_ids[:, 0] = first_tokens
    shifted_ids[:, 1:] = target_ids[:, :-1]
    return shifted_ids


# Feeding the question itself as decoder input (as before) never shows the
# decoder its own previous output token, so training would not match the
# autoregressive loop used at inference time.
decoder_train_inputs = build_decoder_inputs(X_train[:, 0], y_train)
decoder_val_inputs = build_decoder_inputs(X_test[:, 0], y_test)

model.fit(
    [X_train, decoder_train_inputs],
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=([X_test, decoder_val_inputs], y_test),
    callbacks=[checkpoint, early_stopping],
)

Epoch 1/20
Epoch 2/20


  saving_api.save_model(


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7fc0a1126980>

In [18]:
# Defining the encoder model
# Maps a padded question to the encoder LSTM's final [hidden, cell] states.
encoder_model = Model(encoder_inputs, encoder_states)

# Defining the decoder model
# The single-step decoder receives the previous token plus the previous LSTM
# states and returns the next-token distribution plus the updated states.
decoder_state_input_h = Input(shape=(LSTM_UNITS,))
decoder_state_input_c = Input(shape=(LSTM_UNITS,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Locate the embedding layer that was trained on the decoder input. A freshly
# constructed Embedding has random weights, so using one at inference time
# would feed the trained LSTM vectors it has never seen.
trained_decoder_embedding = next(
    (
        layer for layer in model.layers
        if isinstance(layer, Embedding) and layer.input is decoder_inputs
    ),
    None,
)
if trained_decoder_embedding is None:
    raise RuntimeError("Could not locate the trained decoder embedding layer in `model`.")

decoder_embedding_inference = Embedding(VOCABULARY_SIZE, EMBEDDING_DIM, input_length=1)
decoder_inputs_single = Input(shape=(1,))
decoder_embedding_single = decoder_embedding_inference(decoder_inputs_single)
# The layer is built by the call above, so the trained weights can now be
# copied in. This is a copy: re-run this cell if `model` is trained further.
decoder_embedding_inference.set_weights(trained_decoder_embedding.get_weights())

# Separate names are used here so the training-graph tensors
# (`decoder_outputs`, `state_h`, `state_c`) are not overwritten.
decoder_lstm_outputs_single, decoder_state_h, decoder_state_c = decoder_lstm(
    decoder_embedding_single, initial_state=decoder_states_inputs)
decoder_states = [decoder_state_h, decoder_state_c]
decoder_outputs_single = decoder_dense(decoder_lstm_outputs_single)

decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs_single] + decoder_states)

In [19]:
# Function to generate responses
def decode_sequence(input_seq):
    """Greedily decode a response for one encoded question.

    The question is encoded into LSTM states, the decoder is seeded with the
    first token of the question, and at each step the most probable token is
    emitted and fed back as the next decoder input.

    Args:
        input_seq: 2-D array holding a single padded question
            (the chat loop passes the output of `pad_sequences`).

    Returns:
        The generated words joined by single spaces. Generation stops once
        MAX_SEQUENCE_LENGTH words were produced or as soon as the sampled
        index has no entry in `tokenizer.index_word` (e.g. the padding
        index 0).
    """
    # Encoding the input as state vectors. verbose=0 keeps Keras from printing
    # a progress bar for every single prediction step.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1, seeded with the first token of the input.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = input_seq[0, 0]

    decoded_words = []
    # Checking the limit before each step caps the output at exactly
    # MAX_SEQUENCE_LENGTH words.
    while len(decoded_words) < MAX_SEQUENCE_LENGTH:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + list(states_value), verbose=0)

        # Greedy choice: most probable token of the (single) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        # An index without a word acts as the end-of-answer marker.
        if sampled_word == '':
            break
        decoded_words.append(sampled_word)

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)


In [None]:
# User prompt for input
# Reads questions until the user types "quit"/"exit" (case-insensitive,
# surrounding whitespace ignored) or the input stream ends / is interrupted.
while True:
    try:
        input_text = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat cleanly.
        break
    if input_text.strip().lower() in ['quit', 'exit']:
        break
    # No marker is prepended to the text: clean_text removes every
    # non-alphanumeric character, so a '/' prefix never reached the tokenizer.
    input_seq = tokenizer.texts_to_sequences([clean_text(input_text)])
    if not input_seq[0]:
        # Nothing left after cleaning/tokenizing (empty line, only stop words
        # or only unknown words): decoding an all-padding sequence is pointless.
        print("Bot:", "Sorry, I could not find any known words in that. Please rephrase.")
        continue
    input_seq = pad_sequences(input_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    response = decode_sequence(input_seq)
    print("Bot:", response)
