In [None]:
# # ─────────────────────────────────────────────────────────────
# # 0. 라이브러리
# # ─────────────────────────────────────────────────────────────
# import warnings, numpy as np, pandas as pd
# from pathlib import Path
# from sklearn.preprocessing import LabelEncoder, StandardScaler

# warnings.filterwarnings('ignore')

# # ─────────────────────────────────────────────────────────────
# # 1. 경로 및 데이터 로드
# # ─────────────────────────────────────────────────────────────
# ROOT_DIR = Path(r"C:\Users\shaun\Desktop\project\Daycon\cancer")
# train_fp = ROOT_DIR / "train.csv"
# test_fp  = ROOT_DIR / "test.csv"

# train = pd.read_csv(train_fp)
# test  = pd.read_csv(test_fp)

# TARGET = 'Cancer'     # 이진 타깃
# ID_COL = 'ID'         # 식별자

# y      = train[TARGET].astype('int8')
# train  = train.drop(columns=[TARGET])

# n_train = len(train)
# df_all  = pd.concat([train, test], axis=0, ignore_index=True)

# # ─────────────────────────────────────────────────────────────
# # 2. 변수 타입 정의
# # ─────────────────────────────────────────────────────────────
# int_cols   = ['Age']   # (정수 → 실수 캐스팅 후 스케일)
# float_cols = ['Nodule_Size', 'TSH_Result', 'T4_Result', 'T3_Result']
# bin_cols   = ['Gender', 'Family_Background', 'Radiation_History',
#               'Iodine_Deficiency', 'Smoke', 'Weight_Risk', 'Diabetes']
# cat_cols   = ['Country', 'Race']

# # ─────────────────────────────────────────────────────────────
# # 3. 이진 변수 인코딩 → 정확 매핑
# # ─────────────────────────────────────────────────────────────
# binary_maps = {
#     'Gender'            : {'m': 1, 'f': 0},
#     'Family_Background' : {'positive': 1,  'negative': 0},
#     'Radiation_History' : {'exposed': 1,   'unexposed': 0},
#     'Iodine_Deficiency' : {'deficient': 1, 'sufficient': 0},
#     'Smoke'             : {'smoker': 1, 'non-smoker': 0},
#     'Weight_Risk'       : {'obese': 1, 'not obese': 0},
#     'Diabetes'          : {'yes': 1, 'no': 0}
# }

# for col, mapper in binary_maps.items():
#     df_all[col] = (
#         df_all[col]
#         .astype(str)          # NaN → 'nan', 숫자 → 문자
#         .str.strip()          # 앞뒤 공백 제거
#         .str.lower()          # 소문자 통일
#         .replace({'non obese': 'not obese'})   # (예) 하이픈·공백 변형 정규화
#         .map(mapper)          # 매핑
#         .fillna(-1)           # 매핑 실패 → -1
#         .astype('int8')       # 안전하게 정수형 변환
#     )
# # ─────────────────────────────────────────────────────────────
# # 4. 다중 범주(Label Encoding)
# # ─────────────────────────────────────────────────────────────
# for col in cat_cols:
#     le = LabelEncoder()
#     df_all[col] = le.fit_transform(df_all[col])

# # ─────────────────────────────────────────────────────────────
# # 5. 수치형 스케일링 (Age 포함)
# # ─────────────────────────────────────────────────────────────
# #   - 스케일러는 훈련 데이터 기준으로 fit 후 전체 transform
# # ------------------------------------------------------------
# scaler = StandardScaler().fit(df_all.iloc[:n_train][float_cols + int_cols])

# df_all[float_cols + int_cols] = scaler.transform(df_all[float_cols + int_cols])

# # ─────────────────────────────────────────────────────────────
# # 6. 다시 train / test 분리 및 저장
# # ─────────────────────────────────────────────────────────────
# processed_train = pd.concat(
#     [df_all.iloc[:n_train].reset_index(drop=True), y.reset_index(drop=True)],
#     axis=1
# )
# processed_test  = df_all.iloc[n_train:].reset_index(drop=True)

# # 파일 저장
# processed_train.to_csv(ROOT_DIR / "processed_train.csv", index=False)
# processed_test.to_csv(ROOT_DIR / "processed_test.csv",  index=False)

# print("✅ processed_train.csv / processed_test.csv saved.")


✅ processed_train.csv / processed_test.csv saved.


In [5]:
# ─────────────────────────────────────────────────────────────
# 0. 라이브러리
# ─────────────────────────────────────────────────────────────
import warnings, sys, importlib
warnings.filterwarnings("ignore")

import pandas as pd, numpy as np
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline      import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics       import make_scorer, roc_auc_score, f1_score, accuracy_score

# 기본 모델들
from sklearn.linear_model  import LogisticRegression
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier, GradientBoostingClassifier

# ─────────────────────────────────────────────────────────────
# 1. 데이터 로드
# ─────────────────────────────────────────────────────────────
DATA_FP = Path(r"C:\Users\shaun\Desktop\project\Daycon\cancer\processed_train.csv")
df = pd.read_csv(DATA_FP)

TARGET  = "Cancer"
ID_COL  = "ID"

X = df.drop(columns=[TARGET, ID_COL])
y = df[TARGET].astype(int)

# ─────────────────────────────────────────────────────────────
# 2. 평가 지표 & CV 설정
# ─────────────────────────────────────────────────────────────
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    "roc_auc": "roc_auc",
    "f1"     : "f1",
    "acc"    : "accuracy"
}

# ─────────────────────────────────────────────────────────────
# 3. 모델 사전 (필요시 라이브러리 있는 것만)
# ─────────────────────────────────────────────────────────────
models = {
    "LogisticR" : Pipeline([("scaler", StandardScaler()),
                            ("clf", LogisticRegression(max_iter=1000, n_jobs=-1))]),

    "DecisionTree" : DecisionTreeClassifier(random_state=42),

    "RandomForest" : RandomForestClassifier(
                        n_estimators=500, n_jobs=-1, random_state=42),

    "GBDT"         : GradientBoostingClassifier(random_state=42)
}

# 선택 설치 모델 (xgboost, lightgbm, catboost) ────────────────
optional_pkgs = {
    "xgboost"  : ("XGBClassifier", dict(
                    n_estimators=400, learning_rate=0.1, max_depth=6,
                    subsample=0.8, colsample_bytree=0.8,
                    eval_metric="logloss", n_jobs=-1, random_state=42)),
    "lightgbm" : ("LGBMClassifier", dict(
                    n_estimators=400, learning_rate=0.1, max_depth=-1,
                    subsample=0.8, colsample_bytree=0.8,
                    n_jobs=-1, random_state=42)),
    "catboost" : ("CatBoostClassifier", dict(
                    iterations=400, learning_rate=0.1, depth=6,
                    verbose=False, random_state=42))
}

for pkg_name, (cls_name, kwargs) in optional_pkgs.items():
    if importlib.util.find_spec(pkg_name):
        cls_ = getattr(importlib.import_module(pkg_name), cls_name)
        models[cls_name] = cls_(**kwargs)
    else:
        print(f"💡 {pkg_name} 미설치 → 해당 모델 건너뜀")

# ─────────────────────────────────────────────────────────────
# 4. 5-Fold 교차검증 실행
# ─────────────────────────────────────────────────────────────
results = []
for name, model in models.items():
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    results.append({
        "Model"      : name,
        "ROC_AUC"    : f"{scores['test_roc_auc'].mean():.4f} ± {scores['test_roc_auc'].std():.4f}",
        "F1"         : f"{scores['test_f1'].mean():.4f}",
        "Accuracy"   : f"{scores['test_acc'].mean():.4f}"
    })

# 보기 좋게 정렬·출력
df_res = (pd.DataFrame(results)
            .sort_values("ROC_AUC", ascending=False)
            .reset_index(drop=True))
print("\n★ 5-Fold CV 기본 성능 비교")
print(df_res.to_string(index=False))



★ 5-Fold CV 기본 성능 비교
             Model         ROC_AUC     F1 Accuracy
CatBoostClassifier       nan ± nan    nan      nan
    LGBMClassifier 0.7035 ± 0.0025 0.2822   0.8812
      RandomForest 0.7011 ± 0.0041 0.2365   0.8811
     XGBClassifier 0.7006 ± 0.0048 0.2854   0.8806
              GBDT 0.7006 ± 0.0043 0.1446   0.8806
         LogisticR 0.6453 ± 0.0071 0.0000   0.8800
      DecisionTree 0.5863 ± 0.0053 0.2699   0.8134


In [9]:
# ─────────────────────────────────────────────────────────────
# 0. 준비 – 이미 불러온 X, y, models 사전을 그대로 사용
#    (CatBoost는 제거)
# ─────────────────────────────────────────────────────────────
from sklearn.metrics import precision_recall_curve, roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold
import numpy as np, pandas as pd, copy, importlib

# models 딕셔너리 재구성 (CatBoost 제외)
models_opt = copy.deepcopy(models)
models_opt.pop("CatBoost", None)           # 혹시 있으면 삭제
models_opt.pop("CatBoostClassifier", None) # 이름이 이렇게 저장돼 있다면

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def cv_with_opt_threshold(model, X, y, cv):
    """각 fold마다 F1이 최대가 되는 cut-off를 찾아 AUC·F1·cut 저장"""
    aucs, f1s, thrs = [], [], []
    for tr, va in cv.split(X, y):
        model.fit(X.iloc[tr], y.iloc[tr])

        # ── 예측 확률 & AUC ──
        proba = model.predict_proba(X.iloc[va])[:, 1]
        aucs.append(roc_auc_score(y.iloc[va], proba))

        # ── cut-off 탐색 ──
        pr, rc, thr = precision_recall_curve(y.iloc[va], proba)
        f1 = 2 * pr * rc / (pr + rc + 1e-8)
        best_idx = f1.argmax()
        best_thr, best_f1 = (thr[best_idx] if best_idx < len(thr) else 0.5), f1[best_idx]
        thrs.append(best_thr); f1s.append(best_f1)
    return np.mean(aucs), np.std(aucs), np.mean(f1s), np.mean(thrs)

# ─────────────────────────────────────────────────────────────
# 1. 실행
# ─────────────────────────────────────────────────────────────
rows = []
for name, mdl in models_opt.items():
    try:
        auc_mean, auc_std, f1_opt, thr_opt = cv_with_opt_threshold(mdl, X, y, cv)
        rows.append([name, f"{auc_mean:.4f} ± {auc_std:.4f}",
                     f"{f1_opt:.4f}", f"{thr_opt:.3f}"])
    except Exception as e:
        rows.append([name, "ERROR", "ERROR", str(e)])

df_opt = (pd.DataFrame(rows,
                       columns=["Model", "AUC", "F1@OptThr", "Opt Thr"])
            .sort_values("F1@OptThr", ascending=False)
            .reset_index(drop=True))
print("★ 5-Fold CV — 최적 임계값 적용 성능")
print(df_opt.to_string(index=False))


[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1124
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463
[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1124
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [b

우선 순위

catboost
GBDT
Random Forest
LightGBM

In [11]:
# ─────────────────────────────────────────────────────────────
# 0. 라이브러리
# ─────────────────────────────────────────────────────────────
import warnings, numpy as np, pandas as pd, optuna, gc, time
from pathlib import Path
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, f1_score, roc_auc_score

warnings.filterwarnings("ignore")

# ─────────────────────────────────────────────────────────────
# 1. 데이터 로드
# ─────────────────────────────────────────────────────────────
DATA_FP = Path(r"C:\Users\shaun\Desktop\project\Daycon\cancer\processed_train.csv")
df = pd.read_csv(DATA_FP)

TARGET, ID_COL = "Cancer", "ID"
X = df.drop(columns=[TARGET, ID_COL])
y = df[TARGET].astype(int)

neg_pos_ratio = (y == 0).sum() / max((y == 1).sum(), 1)

# Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ─────────────────────────────────────────────────────────────
# 2. Optuna Objective
# ─────────────────────────────────────────────────────────────
def objective(trial):
    # ─ 하이퍼 샘플링 ─
    params = dict(
        depth            = trial.suggest_int("depth", 4, 12),
        learning_rate    = trial.suggest_float("lr", 0.01, 0.2, log=True),
        l2_leaf_reg      = trial.suggest_float("l2", 1e-2, 30, log=True),
        iterations       = trial.suggest_int("iters", 800, 2000),
        scale_pos_weight = trial.suggest_float("spw", 1.0, neg_pos_ratio),
        border_count     = trial.suggest_int("border", 32, 255),
        loss_function    = "Logloss",
        verbose          = False,
        random_state     = 42,
        task_type = "GPU",   # ★ GPU 모드
        devices   = "0"      # 여러 GPU 중 첫 번째 카드 사용
    )
    # ─ cut-off (F1 최적값용) ─
    cutoff = trial.suggest_float("thr", 0.05, 0.45)

    f1_list = []
    for tr_idx, va_idx in cv.split(X, y):
        model = CatBoostClassifier(**params)
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx])

        proba = model.predict_proba(X.iloc[va_idx])[:, 1]
        pred  = (proba >= cutoff).astype(int)

        f1 = f1_score(y.iloc[va_idx], pred)
        f1_list.append(f1)

    # Optuna는 기본이 minimize이므로 음수 부호를 붙여 반환
    return -np.mean(f1_list)

# ─────────────────────────────────────────────────────────────
# 3. 스터디 실행
# ─────────────────────────────────────────────────────────────
study = optuna.create_study(direction="minimize",
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=300, timeout=None)   # ✨ n_trials 늘리면 성능↑

# ─────────────────────────────────────────────────────────────
# 4. 결과 출력
# ─────────────────────────────────────────────────────────────
best_params = study.best_trial.params
best_cutoff = best_params.pop("thr")      # cut-off 분리
best_f1     = -study.best_value

print("🎉 Best F1 (CV) :", f"{best_f1:.4f}")
print("📌 Best cut-off :", f"{best_cutoff:.3f}")
print("🔧 Best params  :", best_params)

# ─────────────────────────────────────────────────────────────
# 5. 전체 Train + 5-Fold 검증 점수 재확인 (선택)
# ─────────────────────────────────────────────────────────────
def cv_score(model_params, cutoff, cv=cv):
    f1s, aucs = [], []
    for tr, va in cv.split(X, y):
        m = CatBoostClassifier(**model_params, verbose=False, random_state=42)
        m.fit(X.iloc[tr], y.iloc[tr])
        proba = m.predict_proba(X.iloc[va])[:, 1]
        pred  = (proba >= cutoff).astype(int)
        f1s.append(f1_score(y.iloc[va], pred))
        aucs.append(roc_auc_score(y.iloc[va], proba))
    return np.mean(aucs), np.mean(f1s)

auc_cv, f1_cv = cv_score(best_params, best_cutoff)
print(f"\n✅ Re-check 5-Fold  AUC {auc_cv:.4f} / F1 {f1_cv:.4f}")


[I 2025-06-05 01:41:59,501] A new study created in memory with name: no-name-b7498aba-3b51-4076-9620-8f14d4c4d1c5
[I 2025-06-05 01:42:40,769] Trial 0 finished with value: -0.26443615899534856 and parameters: {'depth': 7, 'lr': 0.17254716573280354, 'l2': 3.5093904781414595, 'iters': 1518, 'spw': 1.9881280009128441, 'border': 66, 'thr': 0.07323344486727978}. Best is trial 0 with value: -0.26443615899534856.
[I 2025-06-05 01:44:11,192] Trial 1 finished with value: -0.2957292154572363 and parameters: {'depth': 11, 'lr': 0.06054365855469246, 'l2': 2.897705266717829, 'iters': 824, 'spw': 7.1428242200079035, 'border': 218, 'thr': 0.13493564427131047}. Best is trial 1 with value: -0.2957292154572363.
[I 2025-06-05 01:44:39,417] Trial 2 finished with value: -0.4559178460977531 and parameters: {'depth': 5, 'lr': 0.017322667470546258, 'l2': 0.11425814516827706, 'iters': 1430, 'spw': 3.7356793173221523, 'border': 97, 'thr': 0.2947411578889518}. Best is trial 2 with value: -0.4559178460977531.
[I 2

KeyboardInterrupt: 

In [17]:
# ─────────────────────────────────────────────────────────────
# 0. 라이브러리
# ─────────────────────────────────────────────────────────────
import warnings, numpy as np, pandas as pd, optuna
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
warnings.filterwarnings("ignore")

# ─────────────────────────────────────────────────────────────
# 1. 데이터 로드
# ─────────────────────────────────────────────────────────────
DATA_FP = Path(r"C:\Users\shaun\Desktop\project\Daycon\cancer\processed_train.csv")
df = pd.read_csv(DATA_FP)

TARGET, ID_COL = "Cancer", "ID"
X = df.drop(columns=[TARGET, ID_COL])
y = df[TARGET].astype(int)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ─────────────────────────────────────────────────────────────
# 2. Optuna 스터디 설정
# ─────────────────────────────────────────────────────────────
STUDY_NAME = "rf_optuna_study3"
STORAGE    = f"sqlite:///{STUDY_NAME}.db"   # 👉 DB 파일 이름
N_TRIALS   = 300                            # 👉 원하는 탐색 횟수

study = optuna.create_study(
    study_name = STUDY_NAME,
    storage    = STORAGE,
    direction  = "maximize",
    sampler    = optuna.samplers.TPESampler(seed=42),
)

# ─────────────────────────────────────────────────────────────
# 3. class_weight 문자열↔값 매핑
# ─────────────────────────────────────────────────────────────
CW_MAP = {
    "none"    : None,
    "balanced": "balanced",
    "b_sub"   : "balanced_subsample",
    "w3"      : {0: 1, 1: 3},
    "w5"      : {0: 1, 1: 5},
}

# ─────────────────────────────────────────────────────────────
# 4. Objective
# ─────────────────────────────────────────────────────────────
def objective(trial):
    weight_key = trial.suggest_categorical("cw_key", list(CW_MAP.keys()))
    params = dict(
        n_estimators      = trial.suggest_int("n_estimators", 500, 2000, step=250),
        max_depth         = trial.suggest_int("max_depth", 15, 35),
        max_features      = (
            trial.suggest_float("max_features_f", 0.3, 0.9)
            if trial.suggest_categorical("mf_type", ["cont", "sqrt"]) == "cont"
            else "sqrt"
        ),
        min_samples_leaf  = trial.suggest_int("min_samples_leaf", 10, 40),
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20),
        class_weight      = CW_MAP[weight_key],
        n_jobs            = -1,
        random_state      = 42,
    )
    cutoff = trial.suggest_float("thr", 0.20, 0.35, step=0.01)

    f1s = []
    for tr_idx, va_idx in cv.split(X, y):
        clf = RandomForestClassifier(**params)
        clf.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        proba = clf.predict_proba(X.iloc[va_idx])[:, 1]
        pred  = (proba >= cutoff).astype(int)
        f1s.append(f1_score(y.iloc[va_idx], pred))

    mean_f1 = np.mean(f1s)
    trial.set_user_attr("mean_f1", mean_f1)   # 대시보드 표시용
    return mean_f1                            # maximize

# ─────────────────────────────────────────────────────────────
# 5. 탐색 실행
# ─────────────────────────────────────────────────────────────
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

best = study.best_trial
best_cutoff   = best.params["thr"]
best_rf_params = {k: v for k, v in best.params.items() if k != "thr"}

print(f"\n🎯  Best Mean F1  : {best.value:.4f}")
print(f"📌  Best Cut-off : {best_cutoff:.3f}")
print(f"🔧  Best Params  : {best_rf_params}")

# ─────────────────────────────────────────────────────────────
# 6. 대시보드 실행 안내
# ─────────────────────────────────────────────────────────────
print("\n💻  대시보드 실행 (터미널에서):")
print(f"    optuna-dashboard {STORAGE}")


[I 2025-06-05 10:50:49,818] A new study created in RDB with name: rf_optuna_study3


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-06-05 10:51:32,627] Trial 0 finished with value: 0.38373094091901117 and parameters: {'cw_key': 'balanced', 'n_estimators': 750, 'max_depth': 16, 'mf_type': 'cont', 'max_features_f': 0.7248435466776273, 'min_samples_leaf': 10, 'min_samples_split': 20, 'thr': 0.33}. Best is trial 0 with value: 0.38373094091901117.
[I 2025-06-05 10:52:19,367] Trial 1 finished with value: 0.4566704846433621 and parameters: {'cw_key': 'w5', 'n_estimators': 1250, 'max_depth': 21, 'mf_type': 'cont', 'max_features_f': 0.47528678912113087, 'min_samples_leaf': 21, 'min_samples_split': 10, 'thr': 0.32}. Best is trial 1 with value: 0.4566704846433621.
[I 2025-06-05 10:52:35,890] Trial 2 finished with value: 0.21547113825601527 and parameters: {'cw_key': 'w5', 'n_estimators': 750, 'max_depth': 16, 'mf_type': 'sqrt', 'min_samples_leaf': 35, 'min_samples_split': 7, 'thr': 0.21000000000000002}. Best is trial 1 with value: 0.4566704846433621.
[I 2025-06-05 10:54:06,113] Trial 3 finished with value: 0.485378162

KeyboardInterrupt: 

In [4]:
# ─────────────────────────────────────────────────────────────
# 0. 라이브러리
# ─────────────────────────────────────────────────────────────
import warnings, numpy as np, pandas as pd
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score

# imbalanced-learn
from imblearn.over_sampling  import (
    SMOTE, ADASYN, RandomOverSampler, BorderlineSMOTE
)
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine        import SMOTEENN
from imblearn.pipeline       import Pipeline as ImbPipeline

warnings.filterwarnings("ignore")

# ─────────────────────────────────────────────────────────────
# 1. 데이터
# ─────────────────────────────────────────────────────────────
DATA_FP = Path(r"C:\Users\shaun\Desktop\project\Daycon\cancer\processed_train.csv")
df = pd.read_csv(DATA_FP)

TARGET, ID_COL = "Cancer", "ID"
X = df.drop(columns=[TARGET, ID_COL])
y = df[TARGET].astype("int8")

# ─────────────────────────────────────────────────────────────
# 2. 공통 설정
# ─────────────────────────────────────────────────────────────
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ★ binary-F1 (양성 클래스 1) ★
f1_pos = make_scorer(f1_score, average="binary", pos_label=1)
scoring = {"f1": f1_pos}

clf_base = RandomForestClassifier(
    n_estimators=800,
    max_depth=20,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42
)

# ─────────────────────────────────────────────────────────────
# 3. 리샘플링 사전
# ─────────────────────────────────────────────────────────────
resamplers = {
    "None"            : None,
    "ROS"             : RandomOverSampler(random_state=42),
    "RUS"             : RandomUnderSampler(random_state=42),
    "SMOTE"           : SMOTE(random_state=42),
    "BorderlineSMOTE" : BorderlineSMOTE(random_state=42),
    "ADASYN"          : ADASYN(random_state=42),
    "SMOTEENN"        : SMOTEENN(random_state=42),
}

# ─────────────────────────────────────────────────────────────
# 4. 5-Fold CV 평가
# ─────────────────────────────────────────────────────────────
rows = []
for name, sampler in resamplers.items():
    if sampler is None:
        model = clf_base                     # 원본 데이터
    else:
        model = ImbPipeline([("rs", sampler), ("rf", clf_base)])

    cv_scores = cross_validate(
        model, X, y, cv=cv, scoring=scoring,
        n_jobs=-1, return_train_score=False
    )
    rows.append({"Method": f"{name} + RF" if sampler else "RF",
                 "F1_pos": cv_scores["test_f1"].mean()})

# 정렬 & 출력
df_res = (pd.DataFrame(rows)
            .sort_values("F1_pos", ascending=False)
            .reset_index(drop=True))
print("★ 5-Fold CV | Binary-F1(positive class) 비교")
print(df_res.to_string(index=False, float_format="%.4f"))


★ 5-Fold CV | Binary-F1(positive class) 비교
              Method  F1_pos
            ROS + RF  0.4859
            RUS + RF  0.4330
BorderlineSMOTE + RF  0.4008
          SMOTE + RF  0.3439
         ADASYN + RF  0.3131
       SMOTEENN + RF  0.2983
                  RF  0.0994


In [6]:
# -*- coding: utf-8 -*-
"""
End‑to‑end pipeline
──────────────────
• Nested CV (outer 5 × inner 3) for 4 models
    CatBoost, GradientBoosting (sk‑learn), RandomForest, LightGBM
• Each inner search uses Optuna to maximise Binary‑F1 and includes
  a threshold (probability cut‑off) in the search space.
• After outer loops we retrain each model on the full training set
  with the single best hyper‑params found, save individual thresholds,
  soft‑vote the probabilities (simple mean), then optimise a final
  ensemble cut‑off on the whole training data.
• The ensemble predictions are written to submission.csv in the format:
    ID,Cancer
    TEST_00000,0
    …

Requires
────────
conda  activate  vaex_env
pip install lightgbm catboost optuna==3.5.0

Files expected in same folder:
    processed_train.csv   (train set, without leakage)
    processed_test.csv    (test  set)

Author : <your‑name>
Date   : 2025‑06‑05
"""

import warnings, json, joblib, os, sys, time, optuna, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")

# ─────────────────────────────────────────────────────────────
# 1. Paths & data
# ─────────────────────────────────────────────────────────────
ROOT = Path.cwd()                          # ← 대체

TRAIN_FP = ROOT / "processed_train.csv"
TEST_FP  = ROOT / "processed_test.csv"
SUB_FP   = ROOT / "submission.csv"

train = pd.read_csv(TRAIN_FP)
X = train.drop(columns=["ID", "Cancer"])
y = train["Cancer"].astype(int)

X_test = pd.read_csv(TEST_FP).set_index("ID")

# make outer folds once for reproducibility
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=101)

# ─────────────────────────────────────────────────────────────
# 2. Search spaces per model
# ─────────────────────────────────────────────────────────────

def search_space(trial, model_name):
    """Return (model, params) tuple for a given Optuna trial."""
    if model_name == "rf":
        params = dict(
            n_estimators=trial.suggest_int("n_estimators", 600, 1600, 200),
            max_depth=trial.suggest_int("max_depth", 10, 35),
            min_samples_leaf=trial.suggest_int("min_leaf", 1, 30),
            max_features=trial.suggest_float("max_feat", 0.3, 1.0),
            n_jobs=-1,
            random_state=42,
        )
        model = RandomForestClassifier(**params)
    elif model_name == "gbdt":
        params = dict(
            n_estimators=trial.suggest_int("n_estimators", 200, 800, 100),
            learning_rate=trial.suggest_float("lr", 0.01, 0.2, log=True),
            max_depth=trial.suggest_int("max_depth", 3, 6),
            subsample=trial.suggest_float("subsample", 0.6, 1.0),
            random_state=42,
        )
        model = GradientBoostingClassifier(**params)
    elif model_name == "lgbm":
        params = dict(
            n_estimators=trial.suggest_int("n_estimators", 400, 1500, 100),
            learning_rate=trial.suggest_float("lr", 0.01, 0.2, log=True),
            num_leaves=trial.suggest_int("num_leaves", 31, 255),
            max_depth=trial.suggest_int("max_depth", -1, 16),
            min_child_samples=trial.suggest_int("min_child", 10, 100),
            subsample=trial.suggest_float("subsample", 0.6, 1.0),
            colsample_bytree=trial.suggest_float("colsample", 0.6, 1.0),
            random_state=42,
            n_jobs=-1,
        )
        model = LGBMClassifier(**params, objective="binary")
    elif model_name == "cat":
        params = dict(
            iterations=trial.suggest_int("iters", 400, 1500, 100),
            depth=trial.suggest_int("depth", 4, 10),
            learning_rate=trial.suggest_float("lr", 0.01, 0.2, log=True),
            l2_leaf_reg=trial.suggest_float("l2", 1e-2, 10, log=True),
            eval_metric="F1",
            verbose=False,
            random_state=42,
            task_type="GPU",
        )
        model = CatBoostClassifier(**params)
    else:
        raise ValueError(model_name)
    return model

# ─────────────────────────────────────────────────────────────
# 3. Nested CV optimisation wrapper
# ─────────────────────────────────────────────────────────────

def nested_optuna(model_name, n_trials=60):
    outer_scores, best_params, best_cutoffs = [], [], []

    for outer_idx, (train_idx, valid_idx) in enumerate(outer_cv.split(X, y)):
        X_tr, X_va = X.iloc[train_idx], X.iloc[valid_idx]
        y_tr, y_va = y.iloc[train_idx], y.iloc[valid_idx]

        def objective(trial):
            model = search_space(trial, model_name)
            cutoff = trial.suggest_float("thr", 0.1, 0.45)
            # inner CV score
            inner_f1 = []
            for tr_i, va_i in inner_cv.split(X_tr, y_tr):
                model.fit(X_tr.iloc[tr_i], y_tr.iloc[tr_i])
                prob = model.predict_proba(X_tr.iloc[va_i])[:, 1]
                pred = (prob >= cutoff).astype(int)
                inner_f1.append(f1_score(y_tr.iloc[va_i], pred))
            return np.mean(inner_f1)

        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=outer_idx+123))
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

        # best params & cutoff
        best = study.best_trial.params.copy()
        cutoff = best.pop("thr")
        best_params.append(best)
        best_cutoffs.append(cutoff)

        # retrain on outer-train with best params and evaluate on outer-valid
        model_best = search_space(optuna.trial.FixedTrial(best), model_name)
        model_best.fit(X_tr, y_tr)
        prob_va = model_best.predict_proba(X_va)[:, 1]
        pred_va = (prob_va >= cutoff).astype(int)
        outer_scores.append(f1_score(y_va, pred_va))

        print(f"[{model_name.upper()} | outer {outer_idx+1}] F1={outer_scores[-1]:.4f}")

    # choose params of best outer fold
    idx_best = int(np.argmax(outer_scores))
    final_params = best_params[idx_best]
    final_cutoff = best_cutoffs[idx_best]

    # train final model on full data
    final_model = search_space(optuna.trial.FixedTrial(final_params), model_name)
    final_model.fit(X, y)

    return final_model, final_cutoff, np.mean(outer_scores)

# ─────────────────────────────────────────────────────────────
# 4. Run optimisation for all four models
# ─────────────────────────────────────────────────────────────
models_info = {}
for key in ["rf", "gbdt", "lgbm", "cat"]:
    print("\nOptimising", key.upper())
    mdl, thr, f1cv = nested_optuna(key, n_trials=40)  # ↓ trials adjust as needed
    models_info[key] = {"model": mdl, "thr": thr, "cv_f1": f1cv}
    print(f"{key.upper()} done.  mean outer‑F1={f1cv:.4f}, cutoff={thr:.2f}\n")

# ─────────────────────────────────────────────────────────────
# 5. Soft voting ensemble + cutoff re‑optimisation
# ─────────────────────────────────────────────────────────────
probs_train = np.column_stack([info["model"].predict_proba(X)[:, 1] for info in models_info.values()])
prob_ens    = probs_train.mean(axis=1)
prec, rec, thr_grid = precision_recall_curve(y, prob_ens)
f1s = 2*prec*rec/(prec+rec+1e-8)
ens_cut = thr_grid[np.argmax(f1s)]
ens_f1  = f1s.max()
print(f"Ensemble best F1={ens_f1:.4f} at cutoff={ens_cut:.3f}")

# ─────────────────────────────────────────────────────────────
# 6. Predict test & build submission
# ─────────────────────────────────────────────────────────────
probs_test = np.column_stack([info["model"].predict_proba(X_test)[:,1] for info in models_info.values()])
prob_test_ens = probs_test.mean(axis=1)
final_pred    = (prob_test_ens >= ens_cut).astype(int)

submission = pd.DataFrame({"ID": X_test.index, "Cancer": final_pred})
submission.to_csv(SUB_FP, index=False)
print("submission.csv saved →", SUB_FP)


[I 2025-06-05 16:02:41,862] A new study created in memory with name: no-name-13308c4d-bcbd-4b83-a2fa-5b3b60f65434



Optimising RF


[I 2025-06-05 16:03:10,515] Trial 0 finished with value: 0.4804223849927098 and parameters: {'n_estimators': 1400, 'max_depth': 17, 'min_leaf': 7, 'max_feat': 0.6859203383580239, 'thr': 0.3518141394249471}. Best is trial 0 with value: 0.4804223849927098.
[I 2025-06-05 16:03:28,444] Trial 1 finished with value: 0.4871259660853681 and parameters: {'n_estimators': 1000, 'max_depth': 35, 'min_leaf': 21, 'max_feat': 0.6366523310390526, 'thr': 0.2372411313679527}. Best is trial 1 with value: 0.4871259660853681.
[I 2025-06-05 16:03:39,720] Trial 2 finished with value: 0.48700933864841617 and parameters: {'n_estimators': 1000, 'max_depth': 28, 'min_leaf': 14, 'max_feat': 0.3417745276266978, 'thr': 0.239315489365651}. Best is trial 1 with value: 0.4871259660853681.
[I 2025-06-05 16:04:05,790] Trial 3 finished with value: 0.4857104918972863 and parameters: {'n_estimators': 1400, 'max_depth': 14, 'min_leaf': 6, 'max_feat': 0.6720859616892868, 'thr': 0.2861396554839031}. Best is trial 1 with value

[RF | outer 1] F1=0.4852


[I 2025-06-05 16:16:14,369] Trial 0 finished with value: 0.4864345717168512 and parameters: {'n_estimators': 600, 'max_depth': 29, 'min_leaf': 18, 'max_feat': 0.620768828632617, 'thr': 0.23464706606544766}. Best is trial 0 with value: 0.4864345717168512.
[I 2025-06-05 16:16:27,083] Trial 1 finished with value: 0.4744028458913796 and parameters: {'n_estimators': 800, 'max_depth': 27, 'min_leaf': 12, 'max_feat': 0.4971010258028996, 'thr': 0.37297189687269605}. Best is trial 0 with value: 0.4864345717168512.
[I 2025-06-05 16:16:56,414] Trial 2 finished with value: 0.48479306414704504 and parameters: {'n_estimators': 1200, 'max_depth': 13, 'min_leaf': 12, 'max_feat': 0.9824918507763238, 'thr': 0.2636596152531965}. Best is trial 0 with value: 0.4864345717168512.
[I 2025-06-05 16:17:07,175] Trial 3 finished with value: 0.4670310249238058 and parameters: {'n_estimators': 600, 'max_depth': 11, 'min_leaf': 22, 'max_feat': 0.7309238564966251, 'thr': 0.4159562336938277}. Best is trial 0 with valu

[RF | outer 2] F1=0.4886


[I 2025-06-05 16:28:16,737] Trial 0 finished with value: 0.48107949760239355 and parameters: {'n_estimators': 1200, 'max_depth': 11, 'min_leaf': 19, 'max_feat': 0.3927867690401119, 'thr': 0.14587980239874604}. Best is trial 0 with value: 0.48107949760239355.
[I 2025-06-05 16:28:47,499] Trial 1 finished with value: 0.4880665912771709 and parameters: {'n_estimators': 1400, 'max_depth': 23, 'min_leaf': 13, 'max_feat': 0.7046060878502642, 'thr': 0.27086104336605227}. Best is trial 1 with value: 0.4880665912771709.
[I 2025-06-05 16:28:58,974] Trial 2 finished with value: 0.46366582276772283 and parameters: {'n_estimators': 1000, 'max_depth': 13, 'min_leaf': 27, 'max_feat': 0.3604181626753626, 'thr': 0.39662927881212573}. Best is trial 1 with value: 0.4880665912771709.
[I 2025-06-05 16:29:14,562] Trial 3 finished with value: 0.48820687130117363 and parameters: {'n_estimators': 1200, 'max_depth': 24, 'min_leaf': 26, 'max_feat': 0.4112984811577775, 'thr': 0.20429865071472492}. Best is trial 3 

[RF | outer 3] F1=0.4810


[I 2025-06-05 16:39:39,969] Trial 0 finished with value: 0.4758121483274021 and parameters: {'n_estimators': 600, 'max_depth': 13, 'min_leaf': 3, 'max_feat': 0.3808349938697293, 'thr': 0.36170466691286973}. Best is trial 0 with value: 0.4758121483274021.
[I 2025-06-05 16:40:02,988] Trial 1 finished with value: 0.480864810431779 and parameters: {'n_estimators': 1000, 'max_depth': 25, 'min_leaf': 2, 'max_feat': 0.6024199389299774, 'thr': 0.2756774418421123}. Best is trial 1 with value: 0.480864810431779.
[I 2025-06-05 16:40:46,751] Trial 2 finished with value: 0.39422836450619264 and parameters: {'n_estimators': 1400, 'max_depth': 31, 'min_leaf': 10, 'max_feat': 0.9754284961435042, 'thr': 0.11954122303990164}. Best is trial 1 with value: 0.480864810431779.
[I 2025-06-05 16:40:59,541] Trial 3 finished with value: 0.477379295701224 and parameters: {'n_estimators': 600, 'max_depth': 20, 'min_leaf': 18, 'max_feat': 0.6465595039460978, 'thr': 0.37379499633293145}. Best is trial 1 with value: 

KeyboardInterrupt: 