In [12]:
# 04_stacking_and_meta.ipynb

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- Optional external libraries ---
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAS_LGBM = True
except ImportError:
    HAS_LGBM = False

try:
    from catboost import CatBoostClassifier
    HAS_CAT = True
except ImportError:
    HAS_CAT = False


In [8]:
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier
)


In [9]:
# --- Data loading ---
DATA_DIR = "../preprocessed_tabular_data"

train = pd.read_csv(os.path.join(DATA_DIR, "train_prepared.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test_prepared.csv"))

# Separate features/labels
X = train.drop(columns=["label", "tic_id", "obj_id", "object_name", "star_name"], errors="ignore")
y = train["label"]

X = X.select_dtypes(include=[np.number])

print("Training shape:", X.shape)
print("Features used:", X.columns.tolist())



Training shape: (12445, 6)
Features used: ['period', 'duration', 'depth', 'stellar_radius', 'stellar_mass', 'stellar_mag']


In [10]:
# --- Define base learners for stacking ---

base_learners = []

# Example 1: LightGBM + GradientBoosting (paper’s best combo)
estimators_1 = [
    ("gb", GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42))
]
if HAS_LGBM:
    estimators_1.insert(0, ("lgbm", LGBMClassifier(n_estimators=300, learning_rate=0.1, random_state=42)))
base_learners.append(("Stack_LGBM_GB", estimators_1))


# Example 2: RF + XGB + LGBM + AdaBoost
estimators_2 = [
    ("rf", RandomForestClassifier(n_estimators=300, random_state=42)),
    ("ada", AdaBoostClassifier(n_estimators=200, learning_rate=0.1, random_state=42))
]
if HAS_XGB:
    estimators_2.append(("xgb", XGBClassifier(
        n_estimators=300, learning_rate=0.1, max_depth=5,
        random_state=42, use_label_encoder=False, eval_metric="logloss"
    )))
if HAS_LGBM:
    estimators_2.append(("lgbm", LGBMClassifier(n_estimators=300, learning_rate=0.1, random_state=42)))
base_learners.append(("Stack_RF_XGB_LGBM_Ada", estimators_2))


# Example 3: Diverse learners (always safe, all sklearn built-ins)
estimators_3 = [
    ("rf", RandomForestClassifier(n_estimators=300, random_state=42)),
    ("et", ExtraTreesClassifier(n_estimators=300, random_state=42)),
    ("gb", GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42))
]
base_learners.append(("Stack_Diverse", estimators_3))



In [5]:
# --- Meta-learner ---
meta_learner = LogisticRegression(max_iter=1000)

# --- Cross-validation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

results = []

for stack_name, est_list in base_learners:
    print(f"Training {stack_name}...")
    stack_model = StackingClassifier(
        estimators=est_list,
        final_estimator=meta_learner,
        passthrough=False,   # only meta-learner sees predictions
        cv=5,                # inner CV for stacking
        n_jobs=-1
    )
    
    cv_results = cross_validate(stack_model, X, y, cv=cv, scoring=scoring, return_train_score=False)
    result_summary = {metric: np.mean(cv_results[f'test_{metric}']) for metric in scoring}
    result_summary['model'] = stack_name
    results.append(result_summary)

# Collect results
results_df = pd.DataFrame(results)
print(results_df)

# Save results
os.makedirs("../metrics", exist_ok=True)
results_df.to_csv("../metrics/stacking_results.csv", index=False)

Training Stack_LGBM_GB...
Training Stack_RF_XGB_LGBM_Ada...
Training Stack_Diverse...
   accuracy  precision    recall        f1   roc_auc                  model
0  0.803777   0.809975  0.896530  0.851035  0.870206          Stack_LGBM_GB
1  0.815428   0.827149  0.891003  0.857868  0.882111  Stack_RF_XGB_LGBM_Ada
2  0.816633   0.827424  0.893059  0.858958  0.883891          Stack_Diverse


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd

# --- Meta-learner hyperparameter tuning ---
param_grid = {
    "final_estimator__C": [0.01, 0.1, 1, 10, 100],
    "final_estimator__penalty": ["l2"],
    "final_estimator__solver": ["lbfgs", "saga"]
}

stack_results = []

for stack_name, estimators in base_learners:
    print(f"\n🔎 Tuning meta-learner for {stack_name}...")
    stack = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(max_iter=5000, random_state=42),
        cv=5,
        n_jobs=-1
    )
    
    grid = GridSearchCV(
        stack, param_grid, scoring="f1", cv=5, n_jobs=-1, verbose=1
    )
    grid.fit(X_train, y_train)
    
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]
    
    stack_results.append({
        "model": stack_name,
        "best_params": grid.best_params_,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob)
    })

# --- Compare with tuned single models ---
comparison_df = pd.DataFrame(stack_results + tuned_results)  # tuned_results = from prev notebook
display(comparison_df)

# --- Visualization ---
plt.figure(figsize=(10,6))
for metric in ["accuracy", "precision", "recall", "f1", "roc_auc"]:
    plt.bar(comparison_df["model"], comparison_df[metric], alpha=0.6, label=metric)

plt.xticks(rotation=45)
plt.ylabel("Score")
plt.title("Performance Comparison: Tuned Models vs Stacks")
plt.legend()
plt.save("../plots_of_experiment_on_tabular_dataset/Tuned_model_vs_Stacks")
plt.show()



🔎 Tuning meta-learner for Stack_LGBM_GB...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


NameError: name 'accuracy_score' is not defined