# Symptom2Risk — Predicting Diseases from Symptoms using TensorFlow on AWS SageMaker

"""
Description:
This project builds a deep learning model that predicts the most probable disease
from a list of patient symptoms. It uses a publicly available symptom-disease dataset,
encodes each case's symptoms as a multi-hot (0/1) vector with one position per known
symptom, and trains a multi-class classification model using TensorFlow.

Key Features:
- Dataset: https://www.kaggle.com/datasets/choongqianzheng/disease-and-symptoms-dataset/data
- Input: Up to 17 symptoms per case, encoded into 0/1 vectors.
- Output: Predicted disease name from among 40+ possibilities.
- Uses: TensorFlow/Keras for model training and AWS SageMaker for cloud scalability.
- Real-world Use Case: Can serve as a backend for triage tools or digital symptom checkers.

This notebook supports model training, evaluation, and saving for deployment to a real-time endpoint.
"""

In [138]:
# =====================
# 1. Setup & Imports
# =====================
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
from tensorflow.keras.metrics import Precision, Recall, F1Score, TopKCategoricalAccuracy, SparseTopKCategoricalAccuracy

In [69]:
# Make Google Drive visible under /content/drive so the CSV can be read from it
drive.mount('/content/drive')

# =====================
# 2. Load Dataset
# =====================
dataset_path = '/content/drive/My Drive/DiseaseAndSymptoms.csv'
df = pd.read_csv(dataset_path)
df.shape

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


(4920, 18)

In [70]:
# =====================
# 3. Data Cleaning & Preprocessing
# =====================
# Extract all symptom columns
symptom_cols = df.columns[df.columns.str.startswith('Symptom_')].tolist()

# Function to clean symptoms per row (remove NaNs, strip whitespace)
def clean_symptoms(row):
    """Return the normalised symptom strings found in one dataset row.

    Looks up every column named in the module-level ``symptom_cols`` on
    *row*, keeps only the values that are ``str`` instances (so non-string
    cells such as NaN are skipped), and returns them stripped of
    surrounding whitespace and lower-cased, in column order.
    """
    cell_values = (row[name] for name in symptom_cols)
    return [value.strip().lower() for value in cell_values if isinstance(value, str)]

# Run the per-row cleaner and keep each row's symptom list in its own column
df['all_symptoms'] = df.apply(clean_symptoms, axis=1)

# Vocabulary: every distinct symptom seen in any row, in sorted order
all_symptoms = sorted(set().union(*df['all_symptoms']))


In [None]:
# One-hot encode symptoms into binary columns.
# The full 0/1 indicator matrix is built first and attached with a single
# concat: inserting one column per symptom in a loop (`df[symptom] = ...`)
# fragments the frame and can make pandas emit a PerformanceWarning once
# many columns have been added.
symptom_position = {symptom: pos for pos, symptom in enumerate(all_symptoms)}
indicator = np.zeros((len(df), len(all_symptoms)), dtype=np.int64)
for row_pos, row_symptoms in enumerate(df['all_symptoms']):
    for symptom in row_symptoms:
        pos = symptom_position.get(symptom)
        # Symptoms outside the vocabulary get no column, as before.
        if pos is not None:
            indicator[row_pos, pos] = 1
symptom_frame = pd.DataFrame(indicator, index=df.index, columns=all_symptoms)

# Drop indicator columns left over from an earlier run of this cell so that
# re-running it overwrites them instead of duplicating them.
df = pd.concat(
    [df.drop(columns=all_symptoms, errors='ignore'), symptom_frame],
    axis=1,
)

# Encode disease labels.
# Surrounding whitespace is stripped first so that the class names exported
# for inference are clean (the recorded prediction output shows a label
# with a trailing space) and so that names differing only by padding map to
# the same class.
le = LabelEncoder()
df['disease_encoded'] = le.fit_transform(df['Disease'].str.strip())
len(le.classes_)


In [72]:
# Feature matrix: the 0/1 indicator column of every symptom in the vocabulary.
# Target: the integer class id produced by the label encoder.
df_X = df.loc[:, all_symptoms]
df_y = df.loc[:, 'disease_encoded']

In [73]:
# =====================
# 4. Train-Test Split
# =====================

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split reproducible.
# NOTE(review): the recorded training log reaches val_accuracy 1.0 after the
# first epoch — worth checking whether duplicate rows end up on both sides
# of this split.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    random_state=42,
    stratify=df_y,
    test_size=0.2,
)
X_train.shape, X_test.shape

((3936, 131), (984, 131))

In [93]:
# =====================
# 5. Build TensorFlow Model
# =====================

# Input width = number of symptom indicator columns;
# output width = number of encoded disease classes.
n_features = X_train.shape[1]
n_classes = len(le.classes_)

layer_stack = [
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    # Softmax head: one probability per disease class
    tf.keras.layers.Dense(n_classes, activation='softmax'),
]
model = tf.keras.models.Sequential(layer_stack)

# Labels are integer class ids, hence the sparse loss and sparse top-k metric.
top3_metric = tf.keras.metrics.SparseTopKCategoricalAccuracy(
    k=3, name='sparse_top3_accuracy'
)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', top3_metric],
)

model.summary()

In [94]:
# =====================
# 6. Train the Model
# =====================

# Fit for 20 epochs in mini-batches of 32, with 10% of the training data
# set aside for validation; the returned object is kept as `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
)

Epoch 1/20
111/111 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - accuracy: 0.3363 - loss: 3.1773 - sparse_top3_accuracy: 0.4835 - val_accuracy: 1.0000 - val_loss: 0.5399 - val_sparse_top3_accuracy: 1.0000
Epoch 2/20
111/111 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.9967 - loss: 0.3879 - sparse_top3_accuracy: 1.0000 - val_accuracy: 1.0000 - val_loss: 0.0369 - val_sparse_top3_accuracy: 1.0000
Epoch 3/20
111/111 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.9990 - loss: 0.0541 - sparse_top3_accuracy: 1.0000 - val_accuracy: 1.0000 - val_loss: 0.0118 - val_sparse_top3_accuracy: 1.0000
Epoch 4/20
111/111 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.9999 - loss: 0.0246 - sparse_top3_accuracy: 1.0000 - val_accuracy: 1.0000 - val_loss: 0.0063 - val_sparse_top3_accuracy: 1.0000
Epoch 5/20
111/111 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - 

In [97]:

# =====================
# 7. Evaluate the Model
# =====================

# Score the held-out test set; the result is unpacked as the loss followed
# by the two metrics the model was compiled with.
test_results = model.evaluate(X_test, y_test)
loss, accuracy, sparse_top3_accuracy = test_results
print(f"Test Accuracy: {accuracy:.4f}")

31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 1.0000 - loss: 9.2884e-05 - sparse_top3_accuracy: 1.0000
Test Accuracy: 1.0000


In [102]:
# =====================
# 8. Save the Model for Deployment
# =====================

# All deployment artifacts are written under this directory.
model_dir = 'symptom2risk'
os.makedirs(model_dir, exist_ok=True)

# Build the file path with os.path.join (as the label-map export does)
# instead of concatenating strings; the resulting file is
# <model_dir>/<model_dir>_model.keras.
model_path = os.path.join(model_dir, model_dir + '_model.keras')
model.save(model_path)
print(f"Model saved to {model_dir}/")

Model saved to symptom2risk/


In [103]:
# =====================
# 9. Export Label Mapping (for inference)
# =====================

# Save label encoder classes
label_map_path = os.path.join(model_dir, 'label_map.csv')

# One row per class: the index (written as the Class_ID column) is the
# encoded class id, and Disease is the original label.
label_frame = pd.DataFrame({'Disease': le.classes_})
label_frame.index.name = 'Class_ID'
label_frame.to_csv(label_map_path)
print(f"Label map saved to {label_map_path}")

Label map saved to symptom2risk/label_map.csv


In [109]:
# Load saved model from directory.
# The file name has to match what the save step writes, which is
# '<model_dir>/<model_dir>_model.keras' — note the underscore before "model".
model = tf.keras.models.load_model('symptom2risk/symptom2risk_model.keras')

# Load label map to convert class index to disease name
label_map = pd.read_csv('symptom2risk/label_map.csv')
class_id_to_disease = dict(zip(label_map['Class_ID'], label_map['Disease']))

In [136]:
#Prepare data for prediction

#Input symptoms
input_symptoms = ['headache', 'rash', 'fatigue']

# The model's input columns, in training order
all_symptoms = list(df_X.columns)

# Normalise the request the same way the training data was cleaned
# (strip + lower-case) so that e.g. ' Headache' still matches its column.
requested_symptoms = {symptom.strip().lower() for symptom in input_symptoms}

# A symptom that is not one of the model's input columns cannot influence
# the prediction; say so instead of dropping it silently.
unknown_symptoms = sorted(requested_symptoms.difference(all_symptoms))
if unknown_symptoms:
    print(f"Warning: ignoring symptoms not in the training vocabulary: {unknown_symptoms}")

#One-hot encode the input symptoms and create an input vector
# (a batch containing a single row, one 0/1 entry per known symptom)
input_vector = np.array(
    [[1 if symptom in requested_symptoms else 0 for symptom in all_symptoms]]
)

In [137]:
# Predict class probabilities
prediction = model.predict(input_vector)

# Get predicted class index (highest probability) for the single input row.
# It is computed once and converted to a plain int for the dictionary lookup.
predicted_class_id = int(tf.argmax(prediction[0]).numpy())

# Map index to actual disease name
predicted_disease = class_id_to_disease[predicted_class_id]

print(f"Predicted Disease: {predicted_disease}")


1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 38ms/step
Predicted Disease: Hypertension 
