In [1]:
import json
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import Adam

In [2]:
# Load the intents dataset. The file is read as UTF-8 explicitly so the result
# does not depend on the platform's default text encoding.
with open("data.json", 'r', encoding="utf-8") as f:
    intents = json.load(f)["intents"]

# Flatten the intents into two parallel lists: one document per symptom
# string, labelled with the name of the intent it belongs to.
documents, labels = [], []

for intent in intents:
    for symptom in intent["symptoms"]:
        documents.append(symptom)
        labels.append(intent["name"])

# Fail early with a clear message instead of an opaque "empty vocabulary"
# error from the vectorizer further down.
if not documents:
    raise ValueError("data.json contains no symptoms to train on")

# Vectorize the text samples into a sparse float TF-IDF matrix with one row
# per document and one column per vocabulary term.
# NOTE(review): the vectorizer is fitted on every document before the
# train/test split below, so held-out rows also shape the vocabulary and IDF
# weights. Confirm this is intended before trusting the validation metrics.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Save vectorizer so the same text -> feature mapping can be reloaded later
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Convert intent names to integer class ids.
# `categories` is kept as the sorted list of distinct intent names.
categories = sorted(set(labels))
le = LabelEncoder()
y = le.fit_transform(labels)

# Save label encoder so predicted ids can be mapped back to intent names
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

# Split data into training and testing sets (80% / 20%, fixed seed)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Keras needs dense arrays; the TF-IDF matrices are sparse
train_x = train_x.toarray()
test_x = test_x.toarray()

train_y = np.array(train_y)
test_y = np.array(test_y)

# Reshape to (samples, timesteps, features) as required by LSTM layers.
# Every TF-IDF vector is presented as a sequence of length 1.
train_x = np.reshape(train_x, (train_x.shape[0], 1, train_x.shape[1]))
test_x = np.reshape(test_x, (test_x.shape[0], 1, test_x.shape[1]))

# Define LSTM model: two stacked LSTM layers with dropout, then a softmax
# over the classes. The output width is taken from the fitted label encoder
# so it always matches the ids present in `y`.
model = Sequential()
model.add(LSTM(128, input_shape=(1, train_x.shape[2]), return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(len(le.classes_), activation='softmax'))

# Compile model. The sparse loss is used because the targets are integer
# class ids rather than one-hot vectors.
optimizer = Adam(learning_rate=0.01)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train model, reporting metrics on the held-out split after every epoch.
# NOTE(review): in the run recorded with this notebook, val_loss rises while
# the training loss falls, i.e. the model does not generalise to the held-out
# symptoms. Revisit the data/feature setup before relying on this model.
model.fit(train_x, train_y, validation_data=(test_x, test_y), epochs=200, batch_size=16, verbose=2)

# Save model
model.save('medical_chatbot_model.h5')


Epoch 1/200
79/79 - 6s - loss: 5.6196 - accuracy: 0.0032 - val_loss: 5.6075 - val_accuracy: 0.0000e+00 - 6s/epoch - 71ms/step
Epoch 2/200
79/79 - 0s - loss: 5.4899 - accuracy: 0.0152 - val_loss: 5.6983 - val_accuracy: 0.0000e+00 - 409ms/epoch - 5ms/step
Epoch 3/200
79/79 - 0s - loss: 5.2283 - accuracy: 0.0383 - val_loss: 5.6128 - val_accuracy: 0.0159 - 382ms/epoch - 5ms/step
Epoch 4/200
79/79 - 0s - loss: 4.8117 - accuracy: 0.0679 - val_loss: 5.5453 - val_accuracy: 0.0064 - 390ms/epoch - 5ms/step
Epoch 5/200
79/79 - 0s - loss: 4.4146 - accuracy: 0.0998 - val_loss: 5.6778 - val_accuracy: 0.0223 - 383ms/epoch - 5ms/step
Epoch 6/200
79/79 - 1s - loss: 4.1017 - accuracy: 0.1302 - val_loss: 5.6296 - val_accuracy: 0.0127 - 549ms/epoch - 7ms/step
Epoch 7/200
79/79 - 0s - loss: 3.8918 - accuracy: 0.1645 - val_loss: 5.7747 - val_accuracy: 0.0191 - 394ms/epoch - 5ms/step
Epoch 8/200
79/79 - 0s - loss: 3.6256 - accuracy: 0.2021 - val_loss: 5.9270 - val_accuracy: 0.0191 - 396ms/epoch - 5ms/step
Ep

Epoch 67/200
79/79 - 1s - loss: 2.3810 - accuracy: 0.3962 - val_loss: 8.3415 - val_accuracy: 0.0127 - 548ms/epoch - 7ms/step
Epoch 68/200
79/79 - 1s - loss: 2.3327 - accuracy: 0.3874 - val_loss: 8.3742 - val_accuracy: 0.0127 - 680ms/epoch - 9ms/step
Epoch 69/200
79/79 - 0s - loss: 2.3634 - accuracy: 0.3978 - val_loss: 8.2809 - val_accuracy: 0.0191 - 479ms/epoch - 6ms/step
Epoch 70/200
79/79 - 1s - loss: 2.3889 - accuracy: 0.3866 - val_loss: 8.3807 - val_accuracy: 0.0191 - 578ms/epoch - 7ms/step
Epoch 71/200
79/79 - 1s - loss: 2.3624 - accuracy: 0.3802 - val_loss: 8.5133 - val_accuracy: 0.0159 - 710ms/epoch - 9ms/step
Epoch 72/200
79/79 - 0s - loss: 2.2930 - accuracy: 0.3834 - val_loss: 8.6303 - val_accuracy: 0.0159 - 405ms/epoch - 5ms/step
Epoch 73/200
79/79 - 0s - loss: 2.3160 - accuracy: 0.3874 - val_loss: 8.5361 - val_accuracy: 0.0127 - 394ms/epoch - 5ms/step
Epoch 74/200
79/79 - 0s - loss: 2.3222 - accuracy: 0.3858 - val_loss: 8.4637 - val_accuracy: 0.0127 - 396ms/epoch - 5ms/step


Epoch 133/200
79/79 - 0s - loss: 2.2022 - accuracy: 0.4113 - val_loss: 9.3694 - val_accuracy: 0.0191 - 413ms/epoch - 5ms/step
Epoch 134/200
79/79 - 0s - loss: 2.1655 - accuracy: 0.4137 - val_loss: 9.4206 - val_accuracy: 0.0191 - 395ms/epoch - 5ms/step
Epoch 135/200
79/79 - 0s - loss: 2.2131 - accuracy: 0.3930 - val_loss: 9.4314 - val_accuracy: 0.0127 - 383ms/epoch - 5ms/step
Epoch 136/200
79/79 - 0s - loss: 2.1759 - accuracy: 0.4105 - val_loss: 9.5521 - val_accuracy: 0.0159 - 390ms/epoch - 5ms/step
Epoch 137/200
79/79 - 0s - loss: 2.1406 - accuracy: 0.4289 - val_loss: 9.6055 - val_accuracy: 0.0191 - 391ms/epoch - 5ms/step
Epoch 138/200
79/79 - 0s - loss: 2.1725 - accuracy: 0.4081 - val_loss: 9.6082 - val_accuracy: 0.0223 - 397ms/epoch - 5ms/step
Epoch 139/200
79/79 - 1s - loss: 2.1328 - accuracy: 0.4217 - val_loss: 9.6890 - val_accuracy: 0.0191 - 585ms/epoch - 7ms/step
Epoch 140/200
79/79 - 1s - loss: 2.1684 - accuracy: 0.4137 - val_loss: 9.6348 - val_accuracy: 0.0191 - 506ms/epoch - 6

Epoch 198/200
79/79 - 1s - loss: 2.0868 - accuracy: 0.4161 - val_loss: 10.4022 - val_accuracy: 0.0191 - 559ms/epoch - 7ms/step
Epoch 199/200
79/79 - 0s - loss: 2.0942 - accuracy: 0.4217 - val_loss: 10.4138 - val_accuracy: 0.0191 - 407ms/epoch - 5ms/step
Epoch 200/200
79/79 - 0s - loss: 2.1276 - accuracy: 0.4097 - val_loss: 10.2969 - val_accuracy: 0.0191 - 423ms/epoch - 5ms/step


In [6]:
from sklearn.metrics import classification_report, accuracy_score

# Calculate predictions for the testing set: one row of per-class scores
# for every test sample
test_pred = model.predict(test_x)

# Collapse each row of scores to the index of the highest-scoring class
test_pred_classes = np.argmax(test_pred, axis=1)

# The true labels are used as-is; no argmax is applied to them
test_true = test_y

# Compute the overall accuracy (fraction of correctly classified samples)
accuracy = accuracy_score(test_true, test_pred_classes)
print("Accuracy: ", accuracy)

# Build the classification report as a dict.
# zero_division=0 scores classes that have no predicted or no true samples
# as 0 explicitly. That is the value the default already substitutes, so the
# reported numbers are unchanged, but the repeated undefined-metric warnings
# are no longer emitted.
report = classification_report(
    test_true,
    test_pred_classes,
    output_dict=True,
    zero_division=0,
)

# Extract the support-weighted average F1-score across classes
f1_score = report['weighted avg']['f1-score']
print("F1 Score: ", f1_score)


Accuracy:  0.01910828025477707
F1 Score:  0.01739343459088682


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
