# Disease Prediction & Model Evaluation

# 1. Importing the Required Libraries

In [33]:
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score
)


# 2. Loading and Displaying the Dataset

In [36]:
# Paths
DATA_PATH = 'data/dataset.csv'   # CSV read by the dataset-loading cell
MODELS_DIR = 'models/'           # output directory for saved artefacts

# Ensure models directory exists.
# exist_ok=True makes this a no-op when the directory is already present,
# so the cell can be re-run safely without a separate existence check.
os.makedirs(MODELS_DIR, exist_ok=True)

In [44]:
# Display first 5 rows
# NOTE(review): `df` is created by the "Load Dataset" cell that appears after
# this one (In [45]); on a fresh kernel that cell has to be executed first,
# otherwise this line raises NameError.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,itching,skin_rash,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
# Load Dataset
# header=None gives the columns integer labels; skiprows=1 skips the first
# line of the file so it is not read as a data row.
try:
    df = pd.read_csv(DATA_PATH, header=None, skiprows=1)
except FileNotFoundError as exc:
    # Fail loudly here: every later cell needs `df`, so continuing after a
    # printed message would only surface later as an unrelated NameError.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Please ensure the file exists."
    ) from exc

# Drop rows where all elements are NaN, then fill the remaining NaNs with 0.
# The preprocessing step only keeps string cells, so 0 acts as "no symptom".
df = df.dropna(how='all').fillna(0)
print(f"Dataset successfully loaded. Shape: {df.shape}")

Dataset successfully loaded. Shape: (4920, 18)


# 3. Data Cleaning and Preprocessing

In [47]:
# Replace any remaining missing values with 0
df = df.fillna(0)

X_data = []               # per row: list of symptom names present
y_data = []               # per row: disease label
unique_symptoms = set()   # every distinct symptom name seen

for _, record in df.iterrows():
    # Column 0 holds the disease name
    y_data.append(record[0])

    # Remaining columns hold symptoms; only string cells count (the 0 filler
    # is skipped). Names are trimmed at both ends; inner whitespace is kept.
    present = [cell.strip() for cell in record[1:] if isinstance(cell, str)]
    unique_symptoms.update(present)
    X_data.append(present)


In [48]:
# Encode symptoms as a 0/1 matrix: one row per sample, one column per symptom
sorted_symptoms = sorted(unique_symptoms)
symptom_index = {name: col for col, name in enumerate(sorted_symptoms)}

n_samples = len(X_data)
n_features = len(sorted_symptoms)
X = np.zeros((n_samples, n_features))

for row_no, present in enumerate(X_data):
    # Set every column belonging to a symptom listed for this sample
    cols = [symptom_index[name] for name in present]
    X[row_no, cols] = 1

# Labels as a NumPy array, aligned row-by-row with X
y = np.array(y_data)

X.shape, y.shape


((4920, 131), (4920,))

# 4. Train-Test Split

In [50]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [51]:
# Fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
nb_model = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred = nb_model.predict(X_test)


In [57]:
# Fraction of test samples whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 100.00%


In [58]:
# Support-weighted averages of the per-class metrics on the test split
precision, recall, f1 = (
    metric(y_test, y_pred, average="weighted")
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")


Precision: 1.00
Recall: 1.00
F1-Score: 1.00


In [59]:
# Per-class precision / recall / F1 / support on the test split
report_text = classification_report(y_test, y_pred)

print("Classification Report:\n")
print(report_text)


Classification Report:

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       1.00      1.00      1.00        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       1.00      1.00      1.00        21
                    Chronic cholestasis       1.00      1.00      1.00        15
                            Common Cold       1.00      1.00      1.00        23
   

In [62]:
# 10-fold cross-validation of the classifier over the full feature matrix.
# cross_val_score is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# NOTE(review): every fold scoring exactly 1.0 is worth a sanity check, e.g.
# for identical rows shared between training and validation folds. Not
# verified in this notebook.
cv_scores = cross_val_score(nb_model, X, y, cv=10)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross-Validation Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0


In [66]:
# Persist the trained classifier and the symptom vocabulary to MODELS_DIR.

# Ensure models directory exists (no-op if it is already there)
os.makedirs(MODELS_DIR, exist_ok=True)

# Save the model.
# `nb_model` is the classifier fitted and evaluated earlier in this notebook
# and the one the prediction helpers use. The name `final_model` used here
# previously is not defined anywhere in the notebook and fails on a fresh run.
joblib.dump(nb_model, os.path.join(MODELS_DIR, "naive_bayes.pkl"))

# Save the symptoms list; its order is the column order of the feature matrix,
# so it is needed to rebuild input vectors for the saved model.
with open(os.path.join(MODELS_DIR, "symptoms.json"), "w") as f:
    json.dump(sorted_symptoms, f)

print("Final model and symptom list saved successfully to models/ directory")

Final model and symptom list saved successfully to models/ directory


In [67]:
def predict_disease(symptom_list, strict=False):
    """Predict a disease label from a list of symptom names.

    Args:
        symptom_list: Iterable of symptom names. String entries are stripped
            of surrounding whitespace, the same normalisation applied to the
            symptom names when the vocabulary was built.
        strict: If True, raise ``ValueError`` when any entry is not in the
            symptom vocabulary. If False (default), unknown entries are
            ignored, as before.

    Returns:
        The label returned by ``nb_model.predict`` for the encoded input.

    Raises:
        ValueError: Only when ``strict`` is True and unknown symptoms are given.
    """
    # Normalise like the training data; non-string entries are left untouched.
    cleaned = [s.strip() if isinstance(s, str) else s for s in symptom_list]

    unknown = [s for s in cleaned if s not in symptom_index]
    if strict and unknown:
        raise ValueError(f"Unknown symptoms: {unknown}")

    # 0/1 vector over the vocabulary, in the same column order as training
    input_vector = np.zeros(len(sorted_symptoms))
    for symptom in cleaned:
        if symptom in symptom_index:
            input_vector[symptom_index[symptom]] = 1

    # predict() expects a 2-D input, so wrap the single sample in a list
    prediction = nb_model.predict([input_vector])[0]
    return prediction


In [68]:
# Single prediction from three symptom names
test_symptoms = ["itching", "skin_rash", "nodal_skin_eruptions"]

predicted_label = predict_disease(test_symptoms)
print("Predicted Disease:", predicted_label)


Predicted Disease: Fungal infection


In [69]:
# Several symptom combinations, each printed next to its predicted label
test_cases = [
    ["high_fever", "vomiting", "headache"],
    ["chest_pain", "shortness_of_breath"],
    ["joint_pain", "fatigue", "headache"],
]

for symptoms_given in test_cases:
    outcome = predict_disease(symptoms_given)
    print(symptoms_given, "→", outcome)


['high_fever', 'vomiting', 'headache'] → Paralysis (brain hemorrhage)
['chest_pain', 'shortness_of_breath'] → Heart attack
['joint_pain', 'fatigue', 'headache'] → Hepatitis D


In [70]:
def predict_with_confidence(symptom_list, strict=False):
    """Predict a disease label and the model's probability for that label.

    Args:
        symptom_list: Iterable of symptom names. String entries are stripped
            of surrounding whitespace, the same normalisation applied to the
            symptom names when the vocabulary was built.
        strict: If True, raise ``ValueError`` when any entry is not in the
            symptom vocabulary. If False (default), unknown entries are
            ignored, as before.

    Returns:
        Tuple ``(disease, max_prob)``: the class with the highest
        ``predict_proba`` value and that value.

    Raises:
        ValueError: Only when ``strict`` is True and unknown symptoms are given.
    """
    # Normalise like the training data; non-string entries are left untouched.
    cleaned = [s.strip() if isinstance(s, str) else s for s in symptom_list]

    unknown = [s for s in cleaned if s not in symptom_index]
    if strict and unknown:
        raise ValueError(f"Unknown symptoms: {unknown}")

    # 0/1 vector over the vocabulary, in the same column order as training
    input_vector = np.zeros(len(sorted_symptoms))
    for symptom in cleaned:
        if symptom in symptom_index:
            input_vector[symptom_index[symptom]] = 1

    # One row of class probabilities, aligned with nb_model.classes_
    probs = nb_model.predict_proba([input_vector])[0]

    # Locate the winning class once and read label and probability from it
    best = np.argmax(probs)
    disease = nb_model.classes_[best]
    max_prob = probs[best]

    return disease, max_prob


In [71]:
# Prediction together with the model's probability for the chosen class
query_symptoms = ["fever", "cough", "fatigue"]
disease, confidence = predict_with_confidence(query_symptoms)

print(f"Predicted Disease: {disease}")
print(f"Confidence: {confidence*100:.2f}%")


Predicted Disease: Bronchial Asthma
Confidence: 100.00%
