In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report, roc_curve, auc)

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Load the dataset; the target column is expected to be named 'disease'.
df = pd.read_csv('Data_file.csv')

# Per-column overview: dtype, number of missing cells, number of distinct values.
print("\nDtypes and missing counts:")
display(pd.DataFrame({
    "dtype": df.dtypes,
    "missing": df.isnull().sum(),
    "unique_values": df.nunique(),
}))

# Target distribution (relative class frequencies)
print("\nTarget distribution:")
display(df['disease'].value_counts(normalize=True))

# Numeric feature columns (int64/float64 only), never including the target
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
if 'disease' in num_cols:
    num_cols.remove('disease')

print("\nNumeric summary (describe):")
if num_cols:
    display(df[num_cols].describe().T)
else:
    # describe() raises ValueError on a frame without columns, so an
    # all-categorical dataset would otherwise stop the script here.
    print("No numeric feature columns found.")

# Histograms: only drawn when there is something numeric to plot (hist()
# raises ValueError otherwise) and at most 12 numeric columns.
if 0 < len(num_cols) <= 12:
    df[num_cols].hist(bins=20, figsize=(12, 8))
    plt.tight_layout()
    plt.show()

# --- Preprocessing ---
# Columns that are categorical by dtype; the target is never a feature.
cat_cols = [
    col
    for col in df.select_dtypes(include=['object', 'category']).columns.tolist()
    if col != 'disease'
]
# Any other non-target column with at most 6 distinct values that is in
# neither list yet is treated as categorical too.
# NOTE(review): columns already in num_cols are skipped here, so
# low-cardinality int64/float64 columns stay numeric - confirm this is intended.
for col in df.columns:
    if col == 'disease' or col in cat_cols or col in num_cols:
        continue
    if df[col].nunique() <= 6:
        cat_cols.append(col)

print("\nNumeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# Target vector and feature matrix
y = df['disease']
X = df.drop(columns=['disease'])

# Numeric features: median imputation followed by standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical features: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns that appear in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder='drop',
)

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("\nTrain/test counts:", X_train.shape, X_test.shape)

def evaluate_model(pipe, X_train, y_train, X_test, y_test, model_name="model"):
    """Fit ``pipe`` on the training split and report metrics on the test split.

    Prints accuracy, precision, recall, F1 and ROC AUC, followed by the
    confusion matrix and the classification report. Precision, recall and F1
    are computed with scikit-learn's default averaging and positive label.

    Args:
        pipe: Estimator or pipeline exposing ``fit`` and ``predict``. When it
            also exposes ``predict_proba`` or ``decision_function``, those
            scores are used for ROC AUC.
        X_train: Training features passed to ``pipe.fit``.
        y_train: Training labels passed to ``pipe.fit``.
        X_test: Held-out features used for prediction.
        y_test: Held-out labels every metric is computed against.
        model_name: Label printed in the results header.

    Returns:
        tuple: ``(metrics, pipe)``. ``metrics`` is an ``OrderedDict`` with the
        keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc``
        (``roc_auc`` is ``None`` when it could not be computed); ``pipe`` is
        the same object, now fitted.
    """
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Continuous scores for ROC AUC: prefer the probability in column 1,
    # otherwise fall back to raw decision scores (e.g. an SVM built without
    # probability=True), otherwise give up on ROC AUC.
    if hasattr(pipe, "predict_proba"):
        y_proba = pipe.predict_proba(X_test)[:, 1]
    elif hasattr(pipe, "decision_function"):
        y_proba = pipe.decision_function(X_test)
    else:
        y_proba = None

    metrics = OrderedDict()
    metrics['accuracy'] = accuracy_score(y_test, y_pred)
    metrics['precision'] = precision_score(y_test, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_test, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_test, y_pred, zero_division=0)
    if y_proba is not None:
        try:
            metrics['roc_auc'] = roc_auc_score(y_test, y_proba)
        except (ValueError, TypeError):
            # ROC AUC stays best-effort: inputs the metric rejects yield None
            # instead of aborting the run. Interrupts and unrelated errors are
            # no longer swallowed.
            metrics['roc_auc'] = None
    else:
        metrics['roc_auc'] = None

    print(f"\n=== Results for {model_name} ===")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}" if v is not None else f"{k}: None")
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    return metrics, pipe

# Candidate classifiers, all with balanced class weights. Every estimator that
# accepts a seed is pinned to 42 so repeated runs give the same comparison:
# SVC(probability=True) and the liblinear solver both shuffle data internally
# and are otherwise not reproducible.
results = {}
models = {
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced', solver='liblinear',
                                             random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced'),
    "SVM": SVC(kernel='rbf', probability=True, class_weight='balanced', max_iter=5000, random_state=42),
}

# Fit and score each model behind the shared preprocessing step. The same
# preprocessor object is passed to every pipeline.
for name, clf in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('clf', clf)])
    metrics, fitted = evaluate_model(pipe, X_train, y_train, X_test, y_test, model_name=name)
    results[name] = metrics

# Summary table: one row per model, one column per metric
summary = pd.DataFrame(results).T
display(summary)

