In [77]:
# Disease Prediction System using ML (Diabetes, Heart Disease, Breast Cancer)

# --- 1. Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# --- 2. Helper Function ---
def train_models(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "SVM": SVC(),
        "Random Forest": RandomForestClassifier(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[name] = acc
        print(f"\n{name} Accuracy: {acc:.2f}")
        print(classification_report(y_test, y_pred))
    return results

# --- 3. Diabetes Dataset ---
print("\n--- Diabetes Prediction ---")
diabetes = pd.read_csv("diabetes.csv")
X_diabetes = diabetes.drop("Outcome", axis=1)
y_diabetes = diabetes["Outcome"]
diabetes_results = train_models(X_diabetes, y_diabetes)

# --- 4. Heart Disease Dataset ---
print("\n--- Heart Disease Prediction ---")
heart = pd.read_csv("heart.csv")
# Encode categorical columns
heart = heart.copy()
categorical_cols = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]
heart = pd.get_dummies(heart, columns=categorical_cols)
X_heart = heart.drop("HeartDisease", axis=1)
y_heart = heart["HeartDisease"]
heart_results = train_models(X_heart, y_heart)

# --- 5. Breast Cancer Dataset ---
print("\n--- Breast Cancer Prediction ---")
cancer = pd.read_csv("data.csv")

# Drop unnamed ID column if exists
if 'Unnamed: 32' in cancer.columns:
    cancer = cancer.drop(['Unnamed: 32'], axis=1)
if 'id' in cancer.columns:
    cancer = cancer.drop(['id'], axis=1)

# Encode Diagnosis column (M = 1, B = 0)
cancer["diagnosis"] = cancer["diagnosis"].map({"M": 1, "B": 0})

# Handle NaN if present
imputer = SimpleImputer(strategy='mean')
cancer_imputed = pd.DataFrame(imputer.fit_transform(cancer), columns=cancer.columns)

X_cancer = cancer_imputed.drop("diagnosis", axis=1)
y_cancer = cancer_imputed["diagnosis"]
cancer_results = train_models(X_cancer, y_cancer)

# --- 6. Summary ---
print("\n--- Summary ---")
print("\nDiabetes:", diabetes_results)
print("Heart Disease:", heart_results)
print("Breast Cancer:", cancer_results)

# --- 7. Placeholder cells for Colab Upload ---
# Uncomment these in Google Colab to upload files
# from google.colab import files



--- Diabetes Prediction ---

Logistic Regression Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.81      0.79      0.80        99
           1       0.64      0.67      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154


SVM Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.78      0.88      0.83        99
           1       0.72      0.56      0.63        55

    accuracy                           0.77       154
   macro avg       0.75      0.72      0.73       154
weighted avg       0.76      0.77      0.76       154


Random Forest Accuracy: 0.73
              precision    recall  f1-score   support

           0       0.80      0.79      0.79        99
           1       0.62      0.64      0.63        55

    accuracy                           0.73       154
   macro avg 