In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diseases-and-symptoms-dataset/Final_Augmented_dataset_Diseases_and_Symptoms.csv
/kaggle/input/disease-symptom-description-dataset/symptom_Description.csv
/kaggle/input/disease-symptom-description-dataset/Symptom-severity.csv
/kaggle/input/disease-symptom-description-dataset/symptom_precaution.csv
/kaggle/input/disease-symptom-description-dataset/dataset.csv


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/dataset.csv')


In [4]:
symptom_columns = [f'Symptom_{i}' for i in range(1, 18)]
symptoms = set()
for col in symptom_columns:
    symptoms.update(df[col].dropna().unique())

symptoms = sorted(list(symptoms))
symptom_to_index = {s: i for i, s in enumerate(symptoms)}

In [5]:
def row_to_feature_vector(row):
    vec = np.zeros(len(symptom_to_index))
    for col in symptom_columns:
        s = row[col]
        if pd.notna(s) and s in symptom_to_index:
            vec[symptom_to_index[s]] = 1
    return vec

In [6]:
X = np.array([row_to_feature_vector(row) for _, row in df.iterrows()])
y = df['Disease'].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [9]:
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))


Accuracy: 1.0


In [13]:
def dynamic_diagnosis():
    current_symptoms = []
    asked_symptoms = set()

    print("\nWelcome to Disease Diagnosis Assistant.")
    print("Enter initial symptoms (comma-separated):")
    initial_input = input("Initial symptoms: ")
    current_symptoms = [s.strip() for s in initial_input.split(",") if s.strip() in symptom_to_index]

    while True:
        feature_vec = np.zeros(len(symptom_to_index))
        for s in current_symptoms:
            if s in symptom_to_index:
                feature_vec[symptom_to_index[s]] = 1

        proba = model.predict_proba([feature_vec])[0]
        top_3 = proba.argsort()[-3:][::-1]
        top_diseases = [(model.classes_[idx], proba[idx]) for idx in top_3]

        print("\nTop possible diseases:")
        for d, p in top_diseases:
            print(f"- {d} ({p * 100:.2f}% confidence)")

        if top_diseases[0][1] > 0.85:
            print(f"\nHighly confident diagnosis: {top_diseases[0][0]}")
            break

        top_disease = top_diseases[0][0]

        # Smart next symptom selection: find symptoms most correlated with top disease
        disease_rows = df[df['Disease'] == top_disease]
        symptom_counts = {}

        for _, row in disease_rows.iterrows():
            for col in symptom_columns:
                symp = row[col]
                if pd.notna(symp):
                    symptom_counts[symp] = symptom_counts.get(symp, 0) + 1

        # Sort by frequency and ask if not asked yet and not already present
        next_symptom = None
        sorted_symptom_candidates = sorted(symptom_counts.items(), key=lambda x: x[1], reverse=True)

        for symp, freq in sorted_symptom_candidates:
            if symp not in current_symptoms and symp not in asked_symptoms:
                next_symptom = symp
                break

        # Fallback to feature importance if not found
        if next_symptom is None:
            feature_importances = model.feature_importances_
            sorted_symptoms = np.argsort(feature_importances)[::-1]
            for idx in sorted_symptoms:
                symptom_name = symptoms[idx]
                if symptom_name not in current_symptoms and symptom_name not in asked_symptoms:
                    next_symptom = symptom_name
                    break

        if next_symptom is None:
            print("\nNo more symptoms to ask.")
            break

        ans = input(f"Do you have '{next_symptom}'? (yes/no): ").strip().lower()
        asked_symptoms.add(next_symptom)
        if ans == "yes":
            current_symptoms.append(next_symptom)

    print("\nDiagnosis complete.")

In [17]:
dynamic_diagnosis()


Welcome to Disease Diagnosis Assistant.
Enter initial symptoms (comma-separated):


Initial symptoms:  shivering,watering_from_eyes



Top possible diseases:
- Arthritis (13.50% confidence)
- Urinary tract infection (10.50% confidence)
- Dimorphic hemmorhoids(piles) (9.50% confidence)


Do you have ' muscle_weakness'? (yes/no):  no



Top possible diseases:
- Arthritis (13.50% confidence)
- Urinary tract infection (10.50% confidence)
- Dimorphic hemmorhoids(piles) (9.50% confidence)


Do you have ' stiff_neck'? (yes/no):  yes



Top possible diseases:
- Arthritis (35.00% confidence)
- Migraine (12.00% confidence)
- Dimorphic hemmorhoids(piles) (6.00% confidence)


Do you have ' swelling_joints'? (yes/no):  yes



Top possible diseases:
- Arthritis (56.50% confidence)
- Osteoarthristis (9.50% confidence)
- Migraine (8.50% confidence)


Do you have ' movement_stiffness'? (yes/no):  yes



Top possible diseases:
- Arthritis (90.00% confidence)
- Migraine (2.00% confidence)
- Paralysis (brain hemorrhage) (1.50% confidence)

Highly confident diagnosis: Arthritis

Diagnosis complete.
