In [3]:
# 04_hyperparam_tuning.ipynb
import json
import os
from time import time

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# Optional boosters: each HAS_* flag records whether the package could be
# imported, so later cells can skip libraries that are not available.
# Any exception raised by the import (not only ImportError) turns the flag off.
try:
    from xgboost import XGBClassifier
except Exception:
    HAS_XGB = False
else:
    HAS_XGB = True

try:
    from lightgbm import LGBMClassifier
except Exception:
    HAS_LGBM = False
else:
    HAS_LGBM = True

try:
    from catboost import CatBoostClassifier
except Exception:
    HAS_CAT = False
else:
    HAS_CAT = True


In [4]:
# ---------- Config ----------
# Input data location and output folders for models / metrics
# (all relative to the notebook's working directory).
DATA_DIR = "../preprocessed_tabular_data"
OUT_MODELS_DIR = "../models"
OUT_METRICS_DIR = "../metrics"
for _out_dir in (OUT_MODELS_DIR, OUT_METRICS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Single seed shared by the CV splitter, the searches and the estimators.
RANDOM_STATE = 42
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=RANDOM_STATE)

# Metrics computed for every candidate during the search.
# REFIT_METRIC selects which of them picks (and refits) the best model.
# If missing true candidates is most costly, 'recall' is the alternative to
# consider (presumably it must then also be added to `scoring` -- verify).
scoring = dict(f1='f1', roc_auc='roc_auc')
REFIT_METRIC = 'f1'

# Number of sampled candidates per randomized search (smaller = faster).
N_ITER = 30

# ---------- Load Data ----------
train = pd.read_csv(os.path.join(DATA_DIR, "train_prepared.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test_prepared.csv"))


In [5]:
# Map label strings to binary if needed
label_col = "label"
label_map = {"planet candidate": 1, "false positive": 0}


def _encode_labels(labels, split_name):
    """Return `labels` encoded through `label_map`, or unchanged if already numeric.

    Parameters
    ----------
    labels : pandas.Series
        Raw label column of one split.
    split_name : str
        Name used in the error message ("train" / "test").

    Returns
    -------
    pandas.Series
        The input series itself when its dtype is numeric, otherwise an
        integer series produced by `label_map`.

    Raises
    ------
    ValueError
        If a non-numeric label (including a missing value) has no entry in
        `label_map`. Without this check such values silently become NaN and
        only fail later in `.astype(int)` with an unrelated-looking message.
    """
    # A "not numeric" test also covers pandas string dtypes, which an
    # `== 'object'` comparison does not recognise.
    if pd.api.types.is_numeric_dtype(labels):
        return labels
    mapped = labels.map(label_map)
    unmapped = mapped.isna()
    if unmapped.any():
        bad_values = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(
            f"Unrecognised {split_name} labels {bad_values}; "
            f"expected one of {sorted(label_map)}"
        )
    return mapped.astype(int)


# Each split is checked on its own dtype, so a numeric train column does not
# cause a string-typed test column to be skipped (or vice versa).
train[label_col] = _encode_labels(train[label_col], "train")
test[label_col] = _encode_labels(test[label_col], "test")

# Drop identifier / textual columns (adjust names as needed)
drop_cols = ["tic_id", "obj_id", "object_name", "star_name"]
X_train = train.drop(columns=[label_col] + drop_cols, errors='ignore')
y_train = train[label_col].astype(int)

X_test = test.drop(columns=[label_col] + drop_cols, errors='ignore')
y_test = test[label_col].astype(int)

# Keep numeric features only (extend later with pipelines for categoricals)
X_train = X_train.select_dtypes(include=[np.number])
X_test = X_test.select_dtypes(include=[np.number])

# The models are fitted on X_train's columns, so the test matrix must expose
# exactly the same features in the same order.
missing_in_test = [col for col in X_train.columns if col not in X_test.columns]
if missing_in_test:
    raise ValueError(
        f"Test set is missing numeric feature columns present in train: {missing_in_test}"
    )
X_test = X_test[X_train.columns]

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (12445, 6) Test shape: (3112, 6)


In [6]:
# ---------- Helper: tuned search runner ----------
def _json_default(value):
    """Fallback encoder for `json.dump`, used for values it cannot serialise.

    NumPy scalars become plain Python scalars and arrays become lists; anything
    else (e.g. an estimator object used as a parameter value) is stored as its
    `repr` so the best-params file can still be written.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return repr(value)


def run_random_search(name, estimator, param_distributions, X, y, cv, scoring, refit, n_iter=30):
    """Run a RandomizedSearchCV for one estimator and persist its results.

    Parameters
    ----------
    name : str
        Model name; used in log messages and in the output file names.
    estimator : estimator object
        Unfitted estimator passed to RandomizedSearchCV.
    param_distributions : dict
        Search space passed to RandomizedSearchCV.
    X, y : training features and labels passed to `fit`.
    cv : cross-validation splitter passed to RandomizedSearchCV.
    scoring : scoring specification passed to RandomizedSearchCV.
    refit : name of the metric used to select and refit the best candidate.
    n_iter : int, default 30
        Number of sampled parameter settings.

    Returns
    -------
    (rs, cv_df)
        The fitted search object and `rs.cv_results_` as a DataFrame, or
        `(None, None)` if fitting raised (the error is printed, not re-raised).

    Side effects
    ------------
    Writes `hyperparam_search_{name}.csv` to OUT_METRICS_DIR, and
    `hyperparams_{name}.json` plus `{name}_best.joblib` to OUT_MODELS_DIR.
    """
    print(f"\n=== Tuning {name} ===")
    rs = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring=scoring,
        refit=refit,
        cv=cv,
        verbose=2,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        return_train_score=True
    )
    t0 = time()
    try:
        rs.fit(X, y)
    except Exception as e:
        # Deliberate best-effort: one failing model must not stop the others.
        print(f"ERROR while tuning {name}: {e}")
        return None, None
    elapsed = time() - t0
    print(f"Tuning {name} done in {elapsed:.1f}s. Best {refit}: {rs.best_score_:.4f}")
    # Save cv_results, best_params, and best_estimator
    cv_df = pd.DataFrame(rs.cv_results_)
    cv_df.to_csv(os.path.join(OUT_METRICS_DIR, f"hyperparam_search_{name}.csv"), index=False)
    with open(os.path.join(OUT_MODELS_DIR, f"hyperparams_{name}.json"), "w", encoding="utf-8") as f:
        # `default=` keeps the dump working when sampled parameter values are
        # NumPy scalars (as produced by scipy distributions) rather than
        # plain Python numbers.
        json.dump(rs.best_params_, f, indent=2, default=_json_default)
    joblib.dump(rs.best_estimator_, os.path.join(OUT_MODELS_DIR, f"{name}_best.joblib"))
    return rs, cv_df

In [7]:
# ---------- Parameter spaces ----------
# One discrete search space per model name; the keys must match the keys of
# the `estimators` dictionary so the search loop can pair them up.
param_spaces = {}

param_spaces['RandomForest'] = {
    'n_estimators': [100, 300, 600, 1000],
    'max_depth': [None, 10, 20, 40, 80],
    'max_features': ['sqrt', 'log2', 0.3, 0.5, 0.8],
    'min_samples_split': [2, 5, 10],
    'class_weight': [None, 'balanced']
}

param_spaces['ExtraTrees'] = {
    'n_estimators': [100, 300, 600],
    'max_depth': [None, 10, 20, 40],
    'max_features': ['sqrt', 'log2', 0.3, 0.6],
    'min_samples_split': [2, 5],
    'class_weight': [None, 'balanced']
}

param_spaces['AdaBoost'] = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    'estimator__max_depth': [1, 2, 3, 5]  # use this for modern sklearn
}

param_spaces['GradientBoosting'] = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 8],
    'subsample': [0.6, 0.8, 1.0]
}

if HAS_XGB:
    param_spaces['XGBoost'] = {
        'n_estimators': [100, 300, 600],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 1.0],
        'min_child_weight': [1, 3, 5]
    }

if HAS_LGBM:
    param_spaces['LightGBM'] = {
        'n_estimators': [100, 300, 600],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 63, 127],
        'min_child_samples': [5, 10, 20],
        'subsample': [0.6, 0.8, 1.0],
        # LightGBM only applies `subsample` (bagging_fraction) when bagging is
        # switched on through a non-zero `subsample_freq`; the default is 0,
        # which would make every `subsample` value above behave identically.
        'subsample_freq': [1],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }

if HAS_CAT:
    param_spaces['CatBoost'] = {
        'iterations': [100, 300, 600],
        'learning_rate': [0.01, 0.05, 0.1],
        'depth': [4, 6, 8],
        'l2_leaf_reg': [1, 3, 10]
    }


In [8]:


# ---------- Estimators dictionary ----------
# Weak learner for AdaBoost; its depth is tuned via 'estimator__max_depth'.
# Modern sklearn versions (>=1.2) use 'estimator' instead of 'base_estimator'.
weak_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Models that are always available (scikit-learn only). Insertion order is
# the order in which the searches run.
estimators = {
    'RandomForest': RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    'ExtraTrees': ExtraTreesClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE, estimator=weak_tree),
    'GradientBoosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Optional gradient boosters, registered only when their import succeeded.
if HAS_XGB:
    estimators['XGBoost'] = XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=RANDOM_STATE,
    )

if HAS_LGBM:
    estimators['LightGBM'] = LGBMClassifier(random_state=RANDOM_STATE)

if HAS_CAT:
    estimators['CatBoost'] = CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)


In [9]:
# ---------- Run randomized search for each available estimator ----------
# search_sessions maps model name -> fitted RandomizedSearchCV; models without
# a search space, or whose tuning failed, are left out.
search_sessions = {}
for name, est in estimators.items():
    if name in param_spaces:
        # Work on a shallow copy so the shared search-space dict is not
        # handed to the search directly.
        params = dict(param_spaces[name])
        rs, cv_df = run_random_search(
            name,
            est,
            params,
            X_train,
            y_train,
            cv,
            scoring,
            REFIT_METRIC,
            n_iter=N_ITER,
        )
        if rs is not None:
            search_sessions[name] = rs
        else:
            print(f"Skipping further processing for {name} due to errors during tuning.")
    else:
        print(f"Skipping search for {name} (no param space defined).")



=== Tuning RandomForest ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Tuning RandomForest done in 210.7s. Best f1: 0.8606

=== Tuning ExtraTrees ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Tuning ExtraTrees done in 51.0s. Best f1: 0.8590

=== Tuning AdaBoost ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits




Tuning AdaBoost done in 78.0s. Best f1: 0.8540

=== Tuning GradientBoosting ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Tuning GradientBoosting done in 80.3s. Best f1: 0.8595


In [13]:
# ---------- Evaluate best models on test set and save reports ----------
# For every tuned model: predict on the held-out test set, collect the
# positive-class metrics into one summary row, and write the full
# classification report to JSON. The confusion matrix is kept in the summary
# row (and therefore in the CSV), not in a separate file.
eval_rows = []
for name, rs in search_sessions.items():
    best_model = rs.best_estimator_
    try:
        y_pred = best_model.predict(X_test)
        y_proba = None
        test_roc_auc = None
        if hasattr(best_model, "predict_proba"):
            y_proba = best_model.predict_proba(X_test)[:, 1]
            # ROC-AUC is undefined when only one class is present in y_test;
            # leave it as None in that case instead of failing the whole row.
            if y_test.nunique() > 1:
                test_roc_auc = float(roc_auc_score(y_test, y_proba))
        cr = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred)
        # Per-class entries are keyed by the label rendered as a string;
        # '1' is the positive class. Missing class -> metrics stay None.
        positive = cr.get('1', {})
        row = {
            'model': name,
            'test_precision': positive.get('precision'),
            'test_recall': positive.get('recall'),
            'test_f1': positive.get('f1-score'),
            'test_accuracy': cr.get('accuracy', None),
            'test_roc_auc': test_roc_auc,
            'confusion_matrix': cm.tolist()
        }
        eval_rows.append(row)
        # Save the classification report and the evaluated model
        with open(os.path.join(OUT_METRICS_DIR, f"{name}_test_classification_report.json"), "w") as f:
            json.dump(cr, f, indent=2)
        joblib.dump(best_model, os.path.join(OUT_MODELS_DIR, f"{name}_best_final.joblib"))
    except Exception as e:
        # Best-effort: a model that cannot be evaluated is reported and skipped.
        print(f"Failed to evaluate {name} on test set: {e}")

pd.DataFrame(eval_rows).to_csv(os.path.join(OUT_METRICS_DIR, "hyperparam_test_evaluation.csv"), index=False)
print("Hyperparameter tuning notebook finished. Results saved under models/ and metrics/ .")

Hyperparameter tuning notebook finished. Results saved under models/ and metrics/ .


In [10]:
# ---------- Collect all best models into a dictionary ----------
# Model name -> refit best estimator of each search that completed.
model_dict = {name: rs.best_estimator_ for name, rs in search_sessions.items()}

# Persist the whole mapping in one file for later use.
joblib.dump(
    model_dict,
    os.path.join(OUT_MODELS_DIR, "all_best_models.joblib"),
)
print("✅ model_dict saved to ../models/all_best_models.joblib")


✅ model_dict saved to ../models/all_best_models.joblib


In [16]:
# Reload the test-set summary from the same configured metrics folder the
# evaluation step writes to, rather than a second hard-coded copy of the path.
comparison_df = pd.read_csv(os.path.join(OUT_METRICS_DIR, "hyperparam_test_evaluation.csv"))

In [17]:
# ---------- Save the TOP_N models ranked by a test-set metric ----------
# NOTE(review): ranking on test-set scores uses the test set for model
# selection, so the reported test metric of the chosen models is optimistic.
# Consider ranking on the cross-validation score instead -- confirm intent.

# Ranking metric: must be a column of `comparison_df`
# (e.g. "test_f1", "test_precision", "test_recall", "test_accuracy").
metric = "test_f1"

# How many models to keep
TOP_N = 3

if metric not in comparison_df.columns:
    raise KeyError(
        f"Unknown ranking metric {metric!r}; available columns: {list(comparison_df.columns)}"
    )

# Best score first
sorted_df = comparison_df.sort_values(by=metric, ascending=False).reset_index(drop=True)

# Make sure the configured models folder exists
os.makedirs(OUT_MODELS_DIR, exist_ok=True)

# Model name -> path of the file it was saved to
saved_models = {}

for i in range(min(TOP_N, len(sorted_df))):
    row = sorted_df.iloc[i]
    model_name = row["model"]

    # The CSV may come from an earlier run than the in-memory models;
    # fail with an explicit message rather than a bare KeyError.
    if model_name not in model_dict:
        raise KeyError(
            f"Model {model_name!r} is listed in the evaluation CSV but missing from model_dict"
        )

    print(f"🏆 Saving rank {i+1}: {model_name} ({metric} = {row[metric]:.4f})")

    model = model_dict[model_name]  # your trained models dict
    path = f"{OUT_MODELS_DIR}/{i+1:02d}_{model_name.replace(' ', '_')}.pkl"

    joblib.dump(model, path)
    saved_models[model_name] = path

print("✅ Top models saved:")
for name, path in saved_models.items():
    print(f" - {name}: {path}")


🏆 Saving rank 1: RandomForest (test_f1 = 0.8582)
🏆 Saving rank 2: GradientBoosting (test_f1 = 0.8560)
🏆 Saving rank 3: AdaBoost (test_f1 = 0.8510)
✅ Top models saved:
 - RandomForest: ../models/01_RandomForest.pkl
 - GradientBoosting: ../models/02_GradientBoosting.pkl
 - AdaBoost: ../models/03_AdaBoost.pkl
