In [1]:
# -----------------------------
# Imports
# -----------------------------
from catboost import CatBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from scipy.stats import randint, uniform
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# -----------------------------
# Base classifiers
# -----------------------------
# All three learners share the same seed (42) so repeated runs are comparable.

# Balanced random forest.
brf = BalancedRandomForestClassifier(n_jobs=-1, random_state=42)

# Gradient-boosted trees (XGBoost), monitored with PR-AUC ("aucpr").
_xgb_settings = dict(
    objective="binary:logistic",
    eval_metric="aucpr",
    n_jobs=-1,
    random_state=42,
    use_label_encoder=False,
)
xgb = XGBClassifier(**_xgb_settings)

# Gradient-boosted trees (CatBoost); verbose=0 silences per-iteration logging.
_cat_settings = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=0,
    random_state=42,
)
cat = CatBoostClassifier(**_cat_settings)

# -----------------------------
# Voting ensemble
# -----------------------------
# Soft voting over the three base learners. The names given here ('brf',
# 'xgb', 'cat') are the prefixes used by the '<name>__<param>' keys of the
# hyper-parameter search space.
_members = [
    ('brf', brf),
    ('xgb', xgb),
    ('cat', cat),
]
ensemble = VotingClassifier(
    estimators=_members,
    voting='soft',
    n_jobs=1,  # Windows-safe
)

# -----------------------------
# Parameter distributions
# -----------------------------
# Search space for the random search, grouped per ensemble member and merged
# into one flat mapping at the end.
#
# scipy conventions used below:
#   randint(low, high)  -> integers drawn from [low, high)   (high excluded)
#   uniform(loc, scale) -> floats drawn from [loc, loc + scale]

# BalancedRandomForest
_brf_space = {
    'brf__n_estimators': randint(300, 1200),
    'brf__max_depth': randint(5, 25),
    'brf__min_samples_split': randint(2, 10),
    'brf__sampling_strategy': uniform(0.2, 0.3),   # 0.2 .. 0.5
}

# XGBoost
_xgb_space = {
    'xgb__n_estimators': randint(400, 1500),
    'xgb__learning_rate': uniform(0.01, 0.2),      # 0.01 .. 0.21
    'xgb__max_depth': randint(3, 10),
    'xgb__scale_pos_weight': uniform(1, 10),       # 1 .. 11
    'xgb__subsample': uniform(0.6, 0.4),           # 0.6 .. 1.0
    'xgb__colsample_bytree': uniform(0.5, 0.5),    # 0.5 .. 1.0
}

# CatBoost
_cat_space = {
    'cat__iterations': randint(500, 2000),
    'cat__depth': randint(4, 10),
    'cat__learning_rate': uniform(0.01, 0.1),      # 0.01 .. 0.11
    'cat__scale_pos_weight': uniform(1, 10),       # 1 .. 11
}

param_distributions = {**_brf_space, **_xgb_space, **_cat_space}

# -----------------------------
# RandomizedSearchCV
# -----------------------------
# Random search over the whole voting ensemble, ranked by average precision
# with 3-fold cross-validation.
_search_options = {
    'n_iter': 10,   # increase for more thorough search
    'scoring': 'average_precision',
    'cv': 3,
    'verbose': 2,
    'n_jobs': 1,    # Windows-safe
    'random_state': 42,
}
search = RandomizedSearchCV(ensemble, param_distributions, **_search_options)

# -----------------------------
# Step 1 + 2: run the search (no early stopping at this stage) and keep the
# winning ensemble. fit() returns the search object itself, so the best
# estimator can be read straight off the call.
# -----------------------------
best_model = search.fit(X_train, y_train).best_estimator_

# -----------------------------
# 3️⃣ Refit XGB and CatBoost with early stopping manually
# -----------------------------
# Early stopping needs a monitoring set. Carve it out of the training data
# instead of using X_test / y_test, so the test set never influences model
# selection and stays valid for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=y_train,   # keep the class ratio identical in both parts
    random_state=42
)

# XGBoost
# get_params() already carries eval_metric='aucpr' (set on the base
# estimator), so it must not be passed again: a duplicated keyword argument
# raises TypeError.
xgb_params = best_model.named_estimators_['xgb'].get_params()
best_xgb = XGBClassifier(**xgb_params)
# NOTE(review): xgboost >= 2.0 expects early_stopping_rounds in the
# constructor rather than in fit() — confirm against the installed version.
best_xgb.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50,
    verbose=20
)

# CatBoost
cat_params = best_model.named_estimators_['cat'].get_params()
best_cat = CatBoostClassifier(**cat_params)
best_cat.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50,
    verbose=20
)

# Best iteration indices are 0-based; +1 turns them into a number of rounds.
xgb_rounds = best_xgb.best_iteration + 1
cat_rounds = best_cat.get_best_iteration() + 1

# -----------------------------
# 4️⃣ Rebuild the final VotingClassifier
# -----------------------------
# VotingClassifier.fit() clones every member and trains the clones from
# scratch, so the fitted state of best_xgb / best_cat (including where early
# stopping cut them off) would be thrown away. Pin the early-stopped number of
# boosting rounds as a hyper-parameter instead; the clones inherit it and are
# then trained on the full training set.
final_ensemble = VotingClassifier(
    estimators=[
        ('brf', best_model.named_estimators_['brf']),
        ('xgb', XGBClassifier(**{**xgb_params, 'n_estimators': xgb_rounds})),
        ('cat', CatBoostClassifier(**{**cat_params, 'iterations': cat_rounds}))
    ],
    voting='soft',
    n_jobs=1
)

# Required: the ensemble cannot predict until it has been fitted.
final_ensemble.fit(X_train, y_train)

# -----------------------------
# 5️⃣ Evaluate / inspect best parameters
# -----------------------------
# Report the cross-validated score of the winning candidate and the sampled
# hyper-parameters that produced it.
_report = (
    ("Best PR-AUC (from RandomizedSearchCV):", search.best_score_),
    ("Best params:", search.best_params_),
)
for _label, _value in _report:
    print(_label, _value)


NameError: name 'X_train' is not defined