In [1]:
pip install pandas numpy scikit-learn xgboost joblib

Note: you may need to restart the kernel to use updated packages.


In [3]:
# ===============================
# ML Assignment 2 - FINAL PIPELINE
# Handles categorical + NaN values
# ===============================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# -------------------------------
# 1. Load Dataset
# -------------------------------
df = pd.read_csv("../data/heart_disease_uci.csv")

# -------------------------------
# 2. Target Variable
# -------------------------------
y = df["num"].apply(lambda x: 1 if x > 0 else 0)

# -------------------------------
# 3. Feature Selection
# -------------------------------
X = df.drop("num", axis=1)

# Drop non-predictive columns
X = X.drop(["id", "dataset"], axis=1)

# -------------------------------
# 4. Encode Categorical Columns
# -------------------------------
cat_cols = X.select_dtypes(include=["object"]).columns

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

print("Encoded categorical columns:", list(cat_cols))

# -------------------------------
# 5. Handle Missing Values (NaN)
# -------------------------------
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(X)

# -------------------------------
# 6. Train-Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# -------------------------------
# 7. Feature Scaling
# -------------------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -------------------------------
# 8. Initialize Models
# -------------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42
    )
}

# -------------------------------
# 9. Train & Evaluate Models
# -------------------------------
results = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    results.append([
        model_name,
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, y_proba),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred)
    ])

# -------------------------------
# 10. Results Table
# -------------------------------
results_df = pd.DataFrame(
    results,
    columns=[
        "Model",
        "Accuracy",
        "AUC",
        "Precision",
        "Recall",
        "F1 Score",
        "MCC"
    ]
)

print("\nModel Performance Comparison:")
print(results_df)


Encoded categorical columns: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

Model Performance Comparison:
                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.809783  0.901961   0.819048  0.843137  0.830918   
1        Decision Tree  0.777174  0.770325   0.779817  0.833333  0.805687   
2                  KNN  0.826087  0.885521   0.872340  0.803922  0.836735   
3          Naive Bayes  0.809783  0.878049   0.825243  0.833333  0.829268   
4        Random Forest  0.836957  0.911765   0.846154  0.862745  0.854369   
5              XGBoost  0.836957  0.885103   0.833333  0.882353  0.857143   

        MCC  
0  0.613968  
1  0.546864  
2  0.653839  
3  0.614593  
4  0.669386  
5  0.669110  


Parameters: { "use_label_encoder" } are not used.

