# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv("symbipredict_2022.csv")  # Change to your actual file name

# Split features and target variable
X = df.iloc[:, :-1]  # All columns except the last one
y = df.iloc[:, -1]   # Last column (prognosis)

# Split into training and testing sets (20% held out; fixed seed so the split
# is reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(
    n_estimators=200,  # Number of trees in the forest
    class_weight='balanced',  # Weight classes inversely to their frequency
    max_depth=5,  # Limit tree depth to prevent overfitting
    random_state=42,
)

model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix
# Pin the row/column order to model.classes_. Without `labels`, the matrix only
# covers classes that occur in y_test or y_pred, so a class missing from the
# random test split would shrink the matrix and misalign it with the tick
# labels passed to the heatmap below.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
plt.figure(figsize=(10, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=model.classes_, yticklabels=model.classes_)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

def predict_disease(user_symptoms):
    """Print and return the three most probable diseases for the given symptoms.

    Builds a one-row feature vector holding 1 for every column of ``X`` whose
    name appears in ``user_symptoms`` and 0 everywhere else, asks the fitted
    ``model`` for class probabilities, and ranks the classes by probability.

    Args:
        user_symptoms: Iterable of symptom names that should match column
            names of ``X`` exactly. A single string is treated as one symptom.
            Names that match no column do not influence the prediction; they
            are listed in a printed note instead of being dropped silently.

    Returns:
        A list of at most three ``(disease, probability)`` tuples ordered from
        most to least probable.
    """
    # A bare string would otherwise be iterated character by character and
    # match nothing.
    if isinstance(user_symptoms, str):
        user_symptoms = [user_symptoms]

    feature_names = X.columns
    input_data = np.zeros(len(feature_names))  # Initialize with zeros

    unknown_symptoms = []
    for symptom in user_symptoms:
        if symptom in feature_names:
            input_data[feature_names.get_loc(symptom)] = 1  # Set corresponding symptom to 1
        else:
            unknown_symptoms.append(symptom)

    if unknown_symptoms:
        print("Ignoring unrecognized symptoms:", unknown_symptoms)

    print("Input Data:", input_data)  # Debugging step

    # The model was fitted on a DataFrame, so predict on a DataFrame carrying
    # the same column names; a bare array makes scikit-learn warn that the
    # input has no valid feature names.
    input_frame = pd.DataFrame([input_data], columns=feature_names)
    probabilities = model.predict_proba(input_frame)[0]  # Get probabilities for each disease
    print("Raw probabilities:", probabilities)  # Debugging step

    # Rank (disease, probability) pairs, highest probability first.
    sorted_diseases = sorted(
        zip(model.classes_, probabilities),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_diseases = sorted_diseases[:3]

    print("Top 3 probable diseases:")
    for disease, probability in top_diseases:
        print(f"{disease}: {probability * 100:.2f}%")

    return top_diseases


# Example run: rank the most probable diseases for a sample set of symptoms.
predict_disease(
    [
        "chills",
        "depression",
        "fatigue",
        "anxiety",
        "weight_gain",
        "blurred_and_distorted_vision",
    ]
)

