In [3]:
# =========================================================================
# 5-Fold Nested CV Pipeline for ADASYN + Logistic Regression Experiment
#
# - Outer 5-fold: Evaluates generalization performance (Main metrics: Recall/F1/AUC)
# - Inner 5-fold: Selects ADASYN (r, k) hyperparameters (using *only* the train set)
# =========================================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, f1_score, precision_score, roc_auc_score
from imblearn.over_sampling import ADASYN

RANDOM_STATE = 42

In [4]:
# ------------------------------------------------
# 0. Load Data & Preprocessing (up to One-Hot Encoding)
# ------------------------------------------------

df = pd.read_csv("/Users/hwangsia/final_data.csv")

print("Original shape:", df.shape)
print(df.head())

y_all = df["COPD"]
X_raw = df.drop(columns=["COPD", "ID"])  # Remove unnecessary identifiers

# Store original feature names for later categorical decoding
orig_feature_names = list(X_raw.columns)

# Treat all features as categorical and convert to string (for one-hot consistency)
X_raw = X_raw.astype(str)

# One-hot encoding (drop_first=False keeps all levels -> allows decoding later)
X_encoded = pd.get_dummies(X_raw, drop_first=False)

print("Shape after one-hot encoding:", X_encoded.shape)
print(X_encoded.head())

# Map original variables -> corresponding dummy columns (for decoding)
dummy_map = {
    var: [c for c in X_encoded.columns if c.startswith(var + "_")]
    for var in orig_feature_names
}

# ------------------------------------------------
# 1. Define ADASYN Hyperparameter Grid
#    (sampling_strategy=r, n_neighbors=k)
# ------------------------------------------------

r_values = [round(v, 2) for v in np.arange(0.20, 1.01, 0.10)]  # 0.20, 0.30, ..., 1.00
k_values = [3, 5, 7, 9, 11, 13, 15]                        # Candidate k-neighbor values

# ------------------------------------------------
# 2. Outer StratifiedKFold (5-fold)
#    These are the folds for final reporting.
# ------------------------------------------------

outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# For storing results
fold_results = []  # Record performance for each fold
adasyn_train_decoded = {}  # Store decoded oversampled train set per fold
adasyn_test_decoded = {}   # Store decoded original test set per fold

# ------------------------------------------------
# Helper Function: Decode one-hot (or scaled) data back to categorical labels
# - row: A single pd.Series (one sample)
# - dummy_map: {orig_var: [var_level1, var_level2, ...]}
# ------------------------------------------------
def decode_row(row_series, dummy_map):
    rec = {}
    for var, cols_for_var in dummy_map.items():
        vals = row_series[cols_for_var].values.astype(float)
        winner_idx = int(np.argmax(vals))
        winner_col = cols_for_var[winner_idx]
        # e.g., "Smoking_Current" -> "Current"
        level_name = winner_col.split(var + "_", 1)[1]
        rec[var] = level_name
    return rec



Original shape: (2776, 11)
                ID  COPD  age     sex OccupationRisk Income         smoking  \
0  CODA24L27862226     0  50s    Male    Non–at-risk   High   Former Smoker   
1  CODA24L31534447     0  50s    Male        At-risk   High  Current Smoker   
2  CODA24L47263638     0  50s    Male    Non–at-risk    Low      Non-smoker   
3  CODA24L14080640     0  50s  Female    Non–at-risk   High      Non-smoker   
4  CODA24L76507177     1  50s    Male    Non–at-risk   High   Former Smoker   

  Packyears Asthma BMI_Group PM25_Group  
0       low     No     Obese       high  
1       low     No    Normal        low  
2       NaN     No     Obese       high  
3       NaN     No    Normal       high  
4       low     No    Normal        low  
Shape after one-hot encoding: (2776, 24)
   age_50s  age_60s  age_70+  sex_Female  sex_Male  OccupationRisk_At-risk  \
0     True    False    False       False      True                   False   
1     True    False    False       False      Tru

In [13]:
# ------------------------------------------------
# Start Outer CV Loop
# ------------------------------------------------

for fold_id, (train_idx, test_idx) in enumerate(outer_cv.split(X_encoded, y_all), start=1):
    print(f"\n========== [FOLD {fold_id}] ==========")

    X_tr_full = X_encoded.iloc[train_idx].copy()
    y_tr_full = y_all.iloc[train_idx].copy()
    X_te_full = X_encoded.iloc[test_idx].copy()
    y_te_full = y_all.iloc[test_idx].copy()

    # =========================================
    # (A) Inner 5-fold CV to select ADASYN (r, k)
    #     -> This uses *only* the outer-train set (X_tr_full, y_tr_full)
    # =========================================
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    grid_records = []  # Summary of (r,k) performance within this outer fold

    for r in r_values:
        for k in k_values:

            cv_recalls = []
            cv_f1s = []

            # Inner 5-fold: (outer-train -> inner-train / inner-val)
            for inner_tr_idx, inner_val_idx in inner_cv.split(X_tr_full, y_tr_full):
                X_inner_tr = X_tr_full.iloc[inner_tr_idx].copy()
                y_inner_tr = y_tr_full.iloc[inner_tr_idx].copy()
                X_inner_val = X_tr_full.iloc[inner_val_idx].copy()
                y_inner_val = y_tr_full.iloc[inner_val_idx].copy()

                # Scaler is fit *only* on inner-train
                scaler_inner = StandardScaler()
                X_inner_tr_scaled = scaler_inner.fit_transform(X_inner_tr)
                X_inner_val_scaled = scaler_inner.transform(X_inner_val)

                # Oversample *only* the inner-train set with ADASYN
                ada = ADASYN(
                    sampling_strategy=r,
                    n_neighbors=k,
                    random_state=RANDOM_STATE
                )

                try:
                    X_inner_tr_aug, y_inner_tr_aug = ada.fit_resample(
                        X_inner_tr_scaled, y_inner_tr
                    )
                except Exception as e:
                    # ADASYN might fail with extreme r/k settings (e.g., k > minority samples)
                    # If it fails, just skip this combination
                    continue

                # Train Logistic Regression
                clf_inner = LogisticRegression(
                    random_state=RANDOM_STATE,
                    max_iter=1000
                )
                clf_inner.fit(X_inner_tr_aug, y_inner_tr_aug)

                # Predict on inner-val (using default 0.5 threshold)
                y_inner_val_pred = clf_inner.predict(X_inner_val_scaled)

                cv_recalls.append(recall_score(y_inner_val, y_inner_val_pred))
                cv_f1s.append(f1_score(y_inner_val, y_inner_val_pred))

            if len(cv_recalls) == 0:
                # If this (r,k) combo failed all inner folds, skip it
                continue

            mean_rec = float(np.mean(cv_recalls))
            std_rec = float(np.std(cv_recalls, ddof=1))
            mean_f1 = float(np.mean(cv_f1s))
            std_f1  = float(np.std(cv_f1s, ddof=1))

            grid_records.append({
                "r": r,
                "k": k,
                "mean_recall": mean_rec,
                "std_recall": std_rec,
                "mean_f1": mean_f1,
                "std_f1": std_f1
            })

    grid_df = pd.DataFrame(grid_records)

    # If grid_df is empty (all ADASYN combos failed), we must use a fallback.
    if grid_df.empty:
        print("WARNING: ADASYN grid search failed for this fold. Using fallback (r=1.0, k=3).")
        best_r, best_k = 1.0, 3
    else:
        # Select best: 1st by highest mean_recall, 2nd by lowest std_recall (tie-breaker)
        grid_df_sorted = grid_df.sort_values(
            by=["mean_recall", "std_recall"],
            ascending=[False, True]
        ).reset_index(drop=True)

        best_r = float(grid_df_sorted.loc[0, "r"])
        best_k = int(grid_df_sorted.loc[0, "k"])

    print(f"[FOLD {fold_id}] Selected ADASYN (r, k): ({best_r}, {best_k})")

    # =========================================
    # (B) Final Scaling & ADASYN Application (on the *entire* outer-train set)
    # =========================================

    scaler_full = StandardScaler()
    X_tr_scaled_full = scaler_full.fit_transform(X_tr_full)
    X_te_scaled_full = scaler_full.transform(X_te_full)

    # Model 1: Trained on the original, imbalanced train set (Baseline)
    clf_orig = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
    clf_orig.fit(X_tr_scaled_full, y_tr_full)

    # Model 2: Trained on the ADASYN-oversampled train set
    ada_best = ADASYN(
        sampling_strategy=best_r,
        n_neighbors=best_k,
        random_state=RANDOM_STATE
    )

    try:
        X_tr_aug, y_tr_aug = ada_best.fit_resample(X_tr_scaled_full, y_tr_full)
    except Exception as e:
        # Fallback in case the chosen params fail on the full outer-train
        print(f"[FOLD {fold_id}] WARNING: ADASYN failed with chosen params, using fallback r=1.0,k=3")
        ada_best = ADASYN(
            sampling_strategy=1.0,
            n_neighbors=3,
            random_state=RANDOM_STATE
        )
        X_tr_aug, y_tr_aug = ada_best.fit_resample(X_tr_scaled_full, y_tr_full)

    clf_aug = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
    clf_aug.fit(X_tr_aug, y_tr_aug)

    # =========================================
    # (C) Evaluate performance on this fold's test set (outer-test)
    #     (The test set is *never* oversampled)
    # =========================================

    # (1) Model trained on original imbalanced data (Baseline)
    y_pred_orig = clf_orig.predict(X_te_scaled_full)
    y_prob_orig = clf_orig.predict_proba(X_te_scaled_full)[:, 1]

    orig_recall = recall_score(y_te_full, y_pred_orig)
    orig_f1     = f1_score(y_te_full, y_pred_orig)
    orig_prec   = precision_score(y_te_full, y_pred_orig, zero_division=0)
    orig_auc    = roc_auc_score(y_te_full, y_prob_orig)

    # (2) Model trained on ADASYN-augmented data
    y_pred_aug = clf_aug.predict(X_te_scaled_full)
    y_prob_aug = clf_aug.predict_proba(X_te_scaled_full)[:, 1]

    aug_recall = recall_score(y_te_full, y_pred_aug)
    aug_f1     = f1_score(y_te_full, y_pred_aug)
    aug_prec   = precision_score(y_te_full, y_pred_aug, zero_division=0)
    aug_auc    = roc_auc_score(y_te_full, y_prob_aug)

    # Store per-fold summary
    fold_results.append({
        "fold": fold_id,
        "adasyn_r": best_r,
        "adasyn_k": best_k,

        "orig_recall": orig_recall,
        "orig_precision": orig_prec,
        "orig_f1": orig_f1,
        "orig_auc": orig_auc,

        "aug_recall": aug_recall,
        "aug_precision": aug_prec,
        "aug_f1": aug_f1,
        "aug_auc": aug_auc
    })

    print(f"[FOLD {fold_id}] orig_recall={orig_recall:.3f}, aug_recall={aug_recall:.3f}")
    print(f"[FOLD {fold_id}] orig_f1={orig_f1:.3f}, aug_f1={aug_f1:.3f}")
    print(f"[FOLD {fold_id}] orig_auc={orig_auc:.3f}, aug_auc={aug_auc:.3f}")

    # ----------------------------------------------------------------------------------
    # (D) Decode the oversampled train / original test data back into categorical format
    # ----------------------------------------------------------------------------------

    # 1) Oversampled train (X_tr_aug) is in the scaled continuous space.
    #    -> Use inverse_transform to revert it back to the "original one-hot space" (approx).
    #    This step is crucial for accurate categorical decoding (quantization).
    X_tr_aug_unscaled = scaler_full.inverse_transform(X_tr_aug)

    X_tr_aug_df = pd.DataFrame(
        X_tr_aug_unscaled,
        columns=X_tr_full.columns
    )
    y_tr_aug_series = pd.Series(y_tr_aug, name="COPD")

    # 2) Decode the oversampled train set (select category using argmax of the dummy group)
    decoded_rows_train = []
    for i in range(len(X_tr_aug_df)):
        # The decode_row function finds the max value (closest to 1) in the unscaled array
        decoded_record = decode_row(X_tr_aug_df.iloc[i], dummy_map)
        decoded_record["COPD"] = int(y_tr_aug_series.iloc[i])
        decoded_rows_train.append(decoded_record)
    decoded_train_df = pd.DataFrame(decoded_rows_train)

    # 3) Decode the test set (Test was never oversampled, so use the original one-hot X_te_full)
    decoded_rows_test = []
    for i in range(len(X_te_full)):
        decoded_record = decode_row(X_te_full.iloc[i], dummy_map)
        decoded_record["COPD"] = int(y_te_full.iloc[i])
        decoded_rows_test.append(decoded_record)
    decoded_test_df = pd.DataFrame(decoded_rows_test)

    # 4) Store data in dicts and save to files
    adasyn_train_decoded[fold_id] = decoded_train_df
    adasyn_test_decoded[fold_id]  = decoded_test_df

    output_dir = "/Users/hwangsia/"
    # Save files using na_rep='NA' for consistent representation of missing values
    decoded_train_df.to_csv(f"{output_dir}adasyn_train_fold{fold_id}.csv",
                            index=False, na_rep='NA')
    decoded_test_df.to_csv(f"{output_dir}adasyn_test_fold{fold_id}.csv",
                        index=False, na_rep='NA')

    # Save grid search results if available
    if not grid_df.empty:
        grid_df_sorted.to_csv(f"{output_dir}adasyn_grid_fold{fold_id}.csv", index=False)


[FOLD 1] Selected ADASYN (r, k): (1.0, 13)
[FOLD 1] orig_recall=0.014, aug_recall=0.739
[FOLD 1] orig_f1=0.028, aug_f1=0.336
[FOLD 1] orig_auc=0.750, aug_auc=0.729

[FOLD 2] Selected ADASYN (r, k): (1.0, 7)
[FOLD 2] orig_recall=0.015, aug_recall=0.721
[FOLD 2] orig_f1=0.029, aug_f1=0.339
[FOLD 2] orig_auc=0.734, aug_auc=0.732

[FOLD 3] Selected ADASYN (r, k): (1.0, 3)
[FOLD 3] orig_recall=0.059, aug_recall=0.618
[FOLD 3] orig_f1=0.104, aug_f1=0.311
[FOLD 3] orig_auc=0.708, aug_auc=0.706

[FOLD 4] Selected ADASYN (r, k): (1.0, 3)
[FOLD 4] orig_recall=0.015, aug_recall=0.603
[FOLD 4] orig_f1=0.028, aug_f1=0.293
[FOLD 4] orig_auc=0.676, aug_auc=0.669

[FOLD 5] Selected ADASYN (r, k): (1.0, 7)
[FOLD 5] orig_recall=0.043, aug_recall=0.725
[FOLD 5] orig_f1=0.083, aug_f1=0.355
[FOLD 5] orig_auc=0.752, aug_auc=0.757


In [14]:
# ------------------------------------------------
# 3. Final Results Summary (Mean/Std across all folds)
# ------------------------------------------------

results_df = pd.DataFrame(fold_results)
print("\n===== per-fold raw results =====")
print(results_df)

# Calculate mean and std (ddof=1 for sample std)
summary = {
    "orig_recall_mean":   results_df["orig_recall"].mean(),
    "orig_recall_std":    results_df["orig_recall"].std(ddof=1),
    "orig_f1_mean":       results_df["orig_f1"].mean(),
    "orig_f1_std":        results_df["orig_f1"].std(ddof=1),
    "orig_auc_mean":      results_df["orig_auc"].mean(),
    "orig_auc_std":       results_df["orig_auc"].std(ddof=1),

    "aug_recall_mean":    results_df["aug_recall"].mean(),
    "aug_recall_std":     results_df["aug_recall"].std(ddof=1),
    "aug_f1_mean":        results_df["aug_f1"].mean(),
    "aug_f1_std":         results_df["aug_f1"].std(ddof=1),
    "aug_auc_mean":       results_df["aug_auc"].mean(),
    "aug_auc_std":        results_df["aug_auc"].std(ddof=1),
}

summary_df = pd.DataFrame([summary])
print("\n===== summary across all 5 folds =====")
print(summary_df)

# Save final summaries
results_df.to_csv(f"{output_dir}fold_results_5fold.csv", index=False)
summary_df.to_csv(f"{output_dir}fold_summary_5fold.csv", index=False)

print(f"\nDone: All results saved to {output_dir}")


===== per-fold raw results =====
   fold  adasyn_r  adasyn_k  orig_recall  orig_precision   orig_f1  orig_auc  \
0     1       1.0        13     0.014493        0.500000  0.028169  0.750484   
1     2       1.0         7     0.014706        0.500000  0.028571  0.734388   
2     3       1.0         3     0.058824        0.444444  0.103896  0.708011   
3     4       1.0         3     0.014706        0.333333  0.028169  0.676395   
4     5       1.0         7     0.043478        1.000000  0.083333  0.751700   
5     1       1.0        13     0.014493        0.500000  0.028169  0.750484   
6     2       1.0         7     0.014706        0.500000  0.028571  0.734388   
7     3       1.0         3     0.058824        0.444444  0.103896  0.708011   
8     4       1.0         3     0.014706        0.333333  0.028169  0.676395   
9     5       1.0         7     0.043478        1.000000  0.083333  0.751700   

   aug_recall  aug_precision    aug_f1   aug_auc  
0    0.739130       0.217021  0.33