# Heart Dataset Analysis and ML Models

This notebook performs exploratory data analysis (EDA), preprocessing, and trains multiple classification models using a single reusable function. It includes evaluation metrics, confusion matrices, and ROC curves. The dataset is loaded from `/mnt/data/heart.csv`.

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, roc_curve, auc, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pickle
import warnings
warnings.filterwarnings('ignore')
print('Libraries imported')

In [None]:
# Read the heart dataset CSV into `df`, report its shape and preview the first rows
csv_path = r'/mnt/data/heart.csv'
df = pd.read_csv(csv_path)
print('Dataset loaded:', df.shape)
df.head()

In [None]:
# Basic info and missing values
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print('\nMissing values per column:\n', df.isnull().sum())
print('\nSummary statistics:\n', df.describe())

In [None]:
# Histogram of every numeric column, one small figure per column
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
print('Numeric columns:', numeric_cols)
for column_name in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 2.2))
    # Missing values are dropped so they do not reach the binning step
    ax.hist(df[column_name].dropna(), bins=20)
    ax.set_title(f'Distribution of {column_name}')
    fig.tight_layout()
    plt.show()

In [None]:
# Correlation matrix heatmap (matplotlib)
# Only numeric columns are correlated: in pandas >= 2.0 DataFrame.corr() raises
# when the frame still contains string/categorical columns.
corr = df.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(corr, interpolation='nearest')
fig.colorbar(image, ax=ax)
# One tick per column/row, labelled with the column names
ax.set_xticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(range(len(corr.index)))
ax.set_yticklabels(corr.index)
ax.set_title('Correlation matrix (heatmap)')
fig.tight_layout()
plt.show()
corr

In [None]:
# Preprocessing: simple steps
# Target column: the first of a few common names that is present in the data,
# otherwise the last column is used as a fallback.
possible_targets = ['target','Target','heart_disease','HeartDisease','output']
target_col = next((name for name in possible_targets if name in df.columns), df.columns[-1])

print('Using target column:', target_col)

X = df.drop(columns=[target_col])
y = df[target_col]

# One-hot encode categorical columns, then fill remaining gaps with column medians
X = pd.get_dummies(X, drop_first=True)
X = X.fillna(X.median())

# Standard scaling for numeric features.
# The result keeps X's index so X_scaled stays row-aligned with y even when df
# does not carry a default 0..n-1 index.
# NOTE(review): the medians and scaling statistics here are computed from all
# rows, including rows that are later held out for testing.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

print('Preprocessed X shape:', X_scaled.shape)

In [None]:
# Train/test split
# Split the unscaled features first and fit the scaler on the training rows only,
# so test-set statistics never leak into the features the models are trained on.
# The split itself (size, seed, stratification) is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# `scaler` is rebound to the train-fitted instance, so the scaler that is saved
# later is the one that produced the models' training features.
scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(scaler.transform(X_train_raw), columns=X_train_raw.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X_test_raw.columns, index=X_test_raw.index)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

In [None]:
# Single function that trains multiple models, returns a summary and plots results
def run_models(models, X_train, y_train, X_test, y_test, cv=5):
    """Cross-validate, fit and score every model, and draw their ROC curves.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Name/estimator pairs. Each estimator is fitted in place on the
        training data, so it can be reused after the call.
    X_train, y_train
        Training features and labels; also the data used for cross-validation.
    X_test, y_test
        Held-out features and labels used for the test metrics and ROC curves.
    cv : int, default 5
        Number of folds of the shuffled StratifiedKFold (seeded with 42).

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``cv_accuracy``,
        ``cv_roc_auc``, ``test_accuracy``, ``precision``, ``recall``, ``f1``
        and ``roc_auc``. The ROC columns are NaN when they cannot be computed.

    Notes
    -----
    Displays one matplotlib figure containing the ROC curve of every model
    that exposes ``predict_proba`` or ``decision_function``; curves are only
    drawn when ``y_test`` contains exactly two classes.
    """
    results = []
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Loop-invariant: ROC AUC / ROC curves are only produced for two-class test labels.
    is_binary = len(np.unique(y_test)) == 2
    # precision/recall/F1 default to average='binary', which raises on multiclass
    # targets. Up to two classes keep that default; more classes use a weighted average.
    n_classes = len(np.unique(np.concatenate([np.asarray(y_train), np.asarray(y_test)])))
    average = 'binary' if n_classes <= 2 else 'weighted'

    fig, ax = plt.subplots(figsize=(8, 6))
    for name, model in models:
        # cross-validated accuracy and roc_auc (if probability available)
        acc_cv = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy').mean()
        try:
            roc_cv = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc').mean()
        except Exception:
            # Best effort: a model/target that cannot be ROC-scored is reported as NaN
            roc_cv = np.nan

        # fit on train (cross_val_score works on clones, so the model is still unfitted here)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Continuous scores for ROC: positive-class probability if available,
        # otherwise the raw decision function.
        probs = None
        if hasattr(model, 'predict_proba'):
            probs = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, 'decision_function'):
            try:
                probs = model.decision_function(X_test)
            except Exception:
                probs = None
        can_roc = probs is not None and is_binary

        # metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average=average, zero_division=0)
        rec = recall_score(y_test, y_pred, average=average, zero_division=0)
        f1 = f1_score(y_test, y_pred, average=average, zero_division=0)
        roc = roc_auc_score(y_test, probs) if can_roc else np.nan
        results.append({
            'model': name,
            'cv_accuracy': acc_cv,
            'cv_roc_auc': roc_cv,
            'test_accuracy': acc,
            'precision': prec,
            'recall': rec,
            'f1': f1,
            'roc_auc': roc
        })

        # Plot ROC curve if possible
        if can_roc:
            fpr, tpr, _ = roc_curve(y_test, probs)
            ax.plot(fpr, tpr, label=f'{name} (AUC={roc:.3f})' if not np.isnan(roc) else name)

    # finalize ROC plot; the diagonal is the chance-level reference
    ax.plot([0, 1], [0, 1], '--', label='random')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC curves (for models with probability output)')
    ax.legend(loc='lower right', fontsize='small')
    fig.tight_layout()
    plt.show()
    return pd.DataFrame(results)

In [None]:
# Define models to evaluate
# Every estimator that takes a seed gets random_state=42 so a fresh
# Restart & Run All reproduces the same fitted models.
models = [
    ('LogisticRegression', LogisticRegression(max_iter=1000)),
    ('RandomForest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('GradientBoosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('DecisionTree', DecisionTreeClassifier(random_state=42)),
    ('KNeighbors', KNeighborsClassifier()),
    # probability=True fits its probability estimates on internally shuffled data,
    # so SVC needs a seed as well for reproducible predict_proba output.
    ('SVC', SVC(probability=True, random_state=42))
]

results_df = run_models(models, X_train, y_train, X_test, y_test, cv=5)
results_df.sort_values(by='test_accuracy', ascending=False)

In [None]:
# Detailed classification report and confusion matrix for the best model (by test accuracy)
# NOTE(review): picking the winner on test accuracy makes its reported test
# metrics optimistic; results_df also carries cv_accuracy as an alternative criterion.
best_name = results_df.sort_values('test_accuracy', ascending=False).iloc[0]['model']
print('Best model:', best_name)
# First estimator registered under the winning name
best_model = next(model for name, model in models if name == best_name)
# Refit best model on full training set and evaluate
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print('\nClassification report:\n', classification_report(y_test, y_pred))

# Fix the label order explicitly so the tick labels match the matrix rows/columns
# for any label encoding or number of classes (not only 0/1).
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
cm = confusion_matrix(y_test, y_pred, labels=labels)

fig, ax = plt.subplots(figsize=(4, 3))
image = ax.imshow(cm, interpolation='nearest')
fig.colorbar(image, ax=ax)
ax.set_title(f'Confusion matrix - {best_name}')
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# Write the count into every cell; text colour flips at half the maximum so it
# stays readable on both the dark and the bright end of the colormap.
half_max = cm.max() / 2
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        ax.text(col, row, cm[row, col], ha='center', va='center',
                color='white' if cm[row, col] < half_max else 'black')
fig.tight_layout()
plt.show()

In [None]:
# Pickle the selected model and the fitted scaler so both can be reloaded later
for target_path, artifact in (('/mnt/data/best_model.pkl', best_model),
                              ('/mnt/data/scaler.pkl', scaler)):
    with open(target_path, 'wb') as handle:
        pickle.dump(artifact, handle)
print('Saved best model to /mnt/data/best_model.pkl and scaler to /mnt/data/scaler.pkl')

## Notes

- This notebook is generated automatically. You can open and run it in Jupyter or VS Code.
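- The target column is looked up under the names `target`, `Target`, `heart_disease`, `HeartDisease` and `output` (in that order); if none of them is present, the notebook falls back to using the last column as the target.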
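- The `run_models` function cross-validates each model, fits it on the training set, evaluates it on the test set, plots ROC curves for models that expose `predict_proba` or `decision_function` (binary targets only), and returns a performance summary as a DataFrame.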