
# HW3 — CDC Diabetes Analysis (Model Development & Experimentation)

This notebook scaffolds **Week 3 / Homework Set 3**:
- Clean train/test split with **stratification**
- **Preprocessing** (scaling numeric, one-hot for nominal as needed)
- **SMOTE** on training only (to address class imbalance)
- Baseline models (LogReg, DecisionTree, RandomForest, GradientBoosting, Naive Bayes, KNN)
- Evaluation: Accuracy, Precision, Recall, F1, **ROC-AUC**, **PR-AUC**
- **Confusion matrix**, **ROC** and **PR** curves
- **Stratified K-Fold** cross-validation
- **MLflow** experiment tracking (local) — optional
- **Hyperparameter tuning** templates (RandomizedSearchCV)
- Optional **threshold tuning** for recall targets


In [None]:
# Import all necessary libraries
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, roc_curve, precision_recall_curve, average_precision_score)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# MLflow is optional: remember whether it imported so later cells can skip logging.
MLFLOW_AVAILABLE = False
try:
    import mlflow
    import mlflow.sklearn
except Exception:
    print("MLflow not available. To enable, install with: pip install mlflow")
else:
    MLFLOW_AVAILABLE = True

# One seed reused for the split, SMOTE, the seeded estimators and the CV folds.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)



## Load Data

Choose one approach below and comment out the others.


In [None]:

# This cell only offers loading templates. Options A and B are commented out;
# Option C leaves `df` as an empty placeholder so the cell still runs.

# Option A: Load from local file you uploaded/mounted in Colab
# Replace with your path or use the files UI to upload.
# from google.colab import files
# uploaded = files.upload()  # then: df = pd.read_csv('yourfile.csv')
# df = pd.read_csv('your_local_file.csv')

# Option B: Load from GitHub raw (if your CSV is public)
# import pandas as pd
# url = "https://raw.githubusercontent.com/BartGoodell/SOME_REPO/SOME_BRANCH/path/to/file.csv"
# df = pd.read_csv(url)

# Option C: Already-loaded DataFrame placeholder (replace this with real load)
# For safety, we initialize an empty frame; replace with actual loading code.
# NOTE: while this placeholder is in effect, the shape printed below is (0, 0).
df = pd.DataFrame()  # TODO: replace with your actual dataset load

print("Shape:", df.shape)
# display(df.head())  # uncomment after loading


In [None]:
!pip3 install -U ucimlrepo

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset 891 through ucimlrepo.
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# data (as pandas dataframes): features and targets arrive separately
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

# metadata
print(cdc_diabetes_health_indicators.metadata)

# variable information
print(cdc_diabetes_health_indicators.variables)

# Downstream cells read everything (including the TARGET column) from `df`,
# so combine features and targets column-wise into a single frame.
df = pd.concat([X, y], axis=1)
# A column-wise concat aligns on the index; a row-count change would mean
# features and targets did not line up one-to-one.
assert len(df) == len(X), "features and targets must align row-for-row"

print("Shape:", df.shape)
display(df.head())


## Select Target and Features


In [None]:

# Name of the label column; edit this if your dataset uses a different one.
TARGET = "Diabetes_binary"  # change if different

assert TARGET in df.columns, f"TARGET '{TARGET}' not found in columns: {df.columns.tolist()}"

# Label vector and feature matrix (every column except the label), both as copies
# so later edits never write back into `df`.
y = df[TARGET].copy()
X = df.drop(columns=[TARGET]).copy()

class_shares = y.value_counts(normalize=True).round(4).to_dict()
print("Target distribution:", class_shares)



## Optional Feature Engineering
- `BMI_category` from `BMI` (CDC cutoffs)
- `TotalHealthDays` = `PhysHlth` + `MentHlth`

Both features are optional; the code skips any feature whose source columns are missing.


In [None]:

# BMI_category (if BMI exists)
if 'BMI' in X.columns:
    def bmi_bucket(x):
        """Map a BMI value to a weight-status label; missing values stay missing.

        Cutoffs: < 18.5 "Underweight", < 25 "Normal", < 30 "Overweight",
        otherwise "Obese". Returns NaN when `x` is missing.
        """
        # NaN fails every `<` comparison, so without this guard a missing BMI
        # would fall through to the final return and be labelled "Obese".
        if pd.isna(x):
            return np.nan
        if x < 18.5:
            return "Underweight"
        if x < 25:
            return "Normal"
        if x < 30:
            return "Overweight"
        return "Obese"

    X['BMI_category'] = X['BMI'].apply(bmi_bucket)

# TotalHealthDays (if both exist)
if {'PhysHlth', 'MentHlth'}.issubset(X.columns):
    X['TotalHealthDays'] = X['PhysHlth'] + X['MentHlth']

print("Columns after feature engineering:", X.columns.tolist()[:10], "... (total:", len(X.columns), ")")



## Train / Test Split (Stratified)


In [None]:

# Hold out 20% of the rows for testing, stratifying on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
print("Train:", X_train.shape, "Test:", X_test.shape)

train_balance = y_train.value_counts(normalize=True).round(4).to_dict()
print("Train class balance:", train_balance)



## Preprocessing & Modeling Pipelines

We detect numeric vs. non-numeric columns.  
- Numeric: impute median, scale with StandardScaler.  
- Categorical: impute most_frequent, one-hot encode.

**SMOTE** is applied **after** preprocessing and **before** the classifier (on training data only) via an `imblearn` pipeline.


In [None]:

# Split the training columns by dtype: numeric ones are imputed and scaled,
# every remaining column is treated as categorical.
numeric_features = list(X_train.select_dtypes(include=[np.number]).columns)
categorical_features = [col for col in X_train.columns if col not in numeric_features]

# Numeric branch: median imputation followed by standardisation.
numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch; any column in neither list is dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_tf, numeric_features),
        ("cat", categorical_tf, categorical_features),
    ],
    remainder="drop",
)

def make_pipeline(clf):
    """Build a preprocess -> SMOTE -> classifier pipeline around `clf`.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn compatible classifier used as the final step.

    Returns
    -------
    ImbPipeline
        Unfitted pipeline with steps "preprocess", "smote" and "clf".

    Notes
    -----
    The module-level `preprocess` transformer and the supplied `clf` are cloned,
    so every pipeline owns independent, unfitted copies. Without cloning, all
    pipelines would share the same objects and fitting one pipeline would
    overwrite the fitted state seen by the others.
    """
    # SMOTE applies only during fit on training data inside this pipeline
    return ImbPipeline(steps=[
        ("preprocess", clone(preprocess)),
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", clone(clf)),
    ])

# Baseline classifiers keyed by display name; seeded wherever a random_state is passed.
models = dict(
    LogisticRegression=LogisticRegression(
        max_iter=1000, n_jobs=None, class_weight=None, random_state=RANDOM_STATE
    ),
    DecisionTree=DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForest=RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    GradientBoosting=GradientBoostingClassifier(random_state=RANDOM_STATE),
    NaiveBayes=GaussianNB(),
    KNN=KNeighborsClassifier(n_neighbors=15),
)
pprint(models)



## Fit Baselines & Evaluate on Test Set


In [None]:

def _positive_class_scores(fitted_pipe, X_eval):
    """Return one positive-class score per row, or None if no scoring API exists.

    Prefers `predict_proba` (second column). Otherwise falls back to
    `decision_function`, min-max scaled into [0, 1]. Returns None when the
    final classifier offers neither method.
    """
    final_clf = fitted_pipe.named_steps["clf"]
    if hasattr(final_clf, "predict_proba"):
        return fitted_pipe.predict_proba(X_eval)[:, 1]
    if hasattr(final_clf, "decision_function"):
        raw = fitted_pipe.decision_function(X_eval)
        lo, hi = raw.min(), raw.max()
        # the small epsilon keeps the division defined when all scores are equal
        return (raw - lo) / (hi - lo + 1e-12)
    return None


results = []   # one metrics record per model
curves = {}    # model name -> ROC / PR curve arrays for the plotting cell

for name, clf in models.items():
    pipe = make_pipeline(clf)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_proba = _positive_class_scores(pipe, X_test)
    has_scores = y_proba is not None

    # Threshold metrics use hard predictions; the AUCs need scores and are NaN without them.
    results.append({
        "model": name,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba) if has_scores else np.nan,
        "pr_auc": average_precision_score(y_test, y_proba) if has_scores else np.nan,
    })

    # Keep the curve points so the ROC / PR cell can plot them later.
    if has_scores:
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        prec_c, rec_c, _ = precision_recall_curve(y_test, y_proba)
        curves[name] = dict(fpr=fpr, tpr=tpr, prec=prec_c, rec=rec_c)

# Rank models by F1, breaking ties on ROC-AUC, best first.
pd_results = (
    pd.DataFrame(results)
    .sort_values(by=["f1", "roc_auc"], ascending=False)
    .reset_index(drop=True)
)
pd_results



## Confusion Matrix (Pick a Model)


In [None]:

# The first row of pd_results is the top-ranked model (sorted by F1, then ROC-AUC).
best_name = pd_results["model"].iloc[0]
print("Best by F1/ROC-AUC (heuristic):", best_name)

# Refit chosen model for CM plot
pipe_best = make_pipeline(models[best_name])
pipe_best.fit(X_train, y_train)
y_pred_best = pipe_best.predict(X_test)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred_best)
(tn, fp), (fn, tp) = cm
print("TN, FP, FN, TP:", tn, fp, fn, tp)

class_labels = ['No', 'Yes']
tick_marks = np.arange(2)

plt.figure()
plt.imshow(cm, interpolation='nearest')
plt.title(f'Confusion Matrix - {best_name}')
plt.colorbar()
plt.xticks(tick_marks, class_labels)
plt.yticks(tick_marks, class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
# Write each cell's count at its centre (x = column, y = row).
for row, col in np.ndindex(2, 2):
    plt.text(col, row, cm[row, col], ha="center", va="center")
plt.tight_layout()
plt.show()



## ROC & Precision–Recall Curves


In [None]:

# Plot ROC
plt.figure()
for name, d in curves.items():
    plt.plot(d["fpr"], d["tpr"], label=name)
plt.plot([0,1],[0,1], linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend()
plt.show()

# Plot PR
plt.figure()
for name, d in curves.items():
    plt.plot(d["rec"], d["prec"], label=name)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curves")
plt.legend()
plt.show()



## Stratified K-Fold Cross-Validation (metrics on held-out folds of the training split)


In [None]:

# Five shuffled, stratified folds drawn from the training split only.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

scoring = {"accuracy":"accuracy", "precision":"precision", "recall":"recall", "f1":"f1"}
cv_summary = []

for name, clf in models.items():
    pipe = make_pipeline(clf)
    scores = cross_validate(
        pipe, X_train, y_train,
        cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False,
    )
    # Collapse the per-fold scores of each metric into a mean and a standard deviation.
    row = {"model": name}
    for metric in scoring:
        fold_scores = scores[f"test_{metric}"]
        row.update({
            f"cv_{metric}_mean": fold_scores.mean(),
            f"cv_{metric}_std": fold_scores.std(),
        })
    cv_summary.append(row)

# Best mean F1 first; mean recall breaks ties.
pd_cv = pd.DataFrame(cv_summary).sort_values(by=["cv_f1_mean","cv_recall_mean"], ascending=False)
pd_cv



## (Optional) MLflow Logging
Run this cell to log models/metrics to a local MLflow experiment.


In [None]:

# Log one MLflow run per baseline model: test-set metrics, classifier params and
# the fitted pipeline. Skipped entirely when MLflow could not be imported.
if MLFLOW_AVAILABLE:
    mlflow.set_experiment("HW3_CDC_Diabetes")
    for name, clf in models.items():
        with mlflow.start_run(run_name=name):
            pipe = make_pipeline(clf)
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            # AUC metrics need scores; only log them when the classifier provides probabilities.
            if hasattr(pipe.named_steps["clf"], "predict_proba"):
                y_proba = pipe.predict_proba(X_test)[:, 1]
            else:
                y_proba = None
            mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
            mlflow.log_metric("precision", precision_score(y_test, y_pred, zero_division=0))
            mlflow.log_metric("recall", recall_score(y_test, y_pred, zero_division=0))
            mlflow.log_metric("f1", f1_score(y_test, y_pred, zero_division=0))
            if y_proba is not None:
                mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_proba))
                mlflow.log_metric("pr_auc", average_precision_score(y_test, y_proba))
            # Parameter logging stays best-effort, but a failure is reported
            # instead of being silently discarded, so an incomplete run is visible.
            try:
                mlflow.log_params(pipe.named_steps["clf"].get_params())
            except Exception as exc:
                print(f"Warning: could not log params for {name}: {exc}")
            mlflow.sklearn.log_model(pipe, artifact_path="model")
    print("Logged runs. In Colab, you can view the MLflow UI locally if you forward ports, or just rely on the runs list.")
else:
    print("MLflow not installed; skipping logging.")



## Hyperparameter Tuning (Templates)
Edit param grids as needed. RandomizedSearchCV shown for speed.


In [None]:

from sklearn.model_selection import RandomizedSearchCV

# Settings shared by every search below: F1 scoring, 3-fold CV, all cores, fixed seed.
_search_common = dict(scoring="f1", cv=3, n_jobs=-1, random_state=RANDOM_STATE)

# Example: Gradient Boosting
gb_pipe = make_pipeline(GradientBoostingClassifier(random_state=RANDOM_STATE))
gb_params = dict(
    clf__n_estimators=[100, 200, 300],
    clf__learning_rate=[0.01, 0.05, 0.1, 0.2],
    clf__max_depth=[2, 3, 4],
    clf__subsample=[0.7, 0.85, 1.0],
)
gb_search = RandomizedSearchCV(gb_pipe, gb_params, n_iter=12, **_search_common)
# gb_search.fit(X_train, y_train)
# print("Best GB params:", gb_search.best_params_)
# print("Best GB F1:", gb_search.best_score_)

# Example: Logistic Regression
lr_pipe = make_pipeline(LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
lr_params = dict(
    clf__C=np.logspace(-3, 2, 10),
    clf__penalty=["l2"],
    clf__class_weight=[None, "balanced"],
)
lr_search = RandomizedSearchCV(lr_pipe, lr_params, n_iter=10, **_search_common)

# Example: Random Forest
rf_pipe = make_pipeline(RandomForestClassifier(random_state=RANDOM_STATE))
rf_params = dict(
    clf__n_estimators=[200, 400, 600],
    clf__max_depth=[None, 6, 10, 14],
    clf__max_features=["sqrt", "log2", 0.5, 0.8],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
)
rf_search = RandomizedSearchCV(rf_pipe, rf_params, n_iter=12, **_search_common)

print("Search objects prepared. Uncomment .fit(...) lines to run.")



## Threshold Tuning (Recall Target)
Find a probability threshold that achieves a target recall.


In [None]:

target_recall = 0.85  # adjust as needed

pipe_lr = make_pipeline(LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
pipe_lr.fit(X_train, y_train)

y_proba_lr = pipe_lr.predict_proba(X_test)[:, 1]
prec, rec, thr = precision_recall_curve(y_test, y_proba_lr)

# precision_recall_curve returns thresholds in increasing order; rec[i] is the
# recall obtained by predicting positive when score >= thr[i], so recall falls
# as i grows. `rec` carries one extra trailing entry (recall 0) with no
# matching threshold, hence the [:-1] to keep indices aligned with `thr`.
idx = np.where(rec[:-1] >= target_recall)[0]
if len(idx):
    # The last qualifying index is the highest threshold that still meets the
    # recall target. Index it directly: thr[best_idx - 1] would pick a lower
    # threshold than necessary, and falling back to 0.5 when best_idx == 0
    # could miss the target altogether.
    best_idx = idx[-1]
    chosen_thr = thr[best_idx]
else:
    chosen_thr = 0.5  # fallback: no threshold reaches the requested recall

y_pred_thr = (y_proba_lr >= chosen_thr).astype(int)

# NOTE(review): the threshold is chosen and evaluated on the same test split, so
# the metrics below are optimistic; consider selecting it on a validation split.
print(f"Chosen threshold: {chosen_thr:.3f}")
print("Precision:", precision_score(y_test, y_pred_thr, zero_division=0))
print("Recall:", recall_score(y_test, y_pred_thr, zero_division=0))
print("F1:", f1_score(y_test, y_pred_thr, zero_division=0))



## Save Results (CSV)


In [None]:

# Write the baseline comparison table to CSV, leaving out the row index.
pd_results.to_csv(path_or_buf="hw3_model_results.csv", index=False)
print("Saved hw3_model_results.csv in current working directory.")
