In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, ParameterGrid
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, precision_recall_curve
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

from tqdm.auto import tqdm  


df = pd.read_csv('fd.csv')
X = df.drop(columns='Bleaching')
y = df['Bleaching']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

#Impute + SMOTE
imp        = SimpleImputer(strategy='median')
X_imp      = imp.fit_transform(X_train)
sm         = SMOTE(sampling_strategy=1.0, random_state=42)
X_res, y_res = sm.fit_resample(X_imp, y_train)


neg   = (y_res == 0).sum()
pos   = (y_res == 1).sum()
ratio = neg/pos

param_grid = {
    'scale_pos_weight': [ratio, ratio*2, ratio*5, ratio*10, ratio*20],
    'max_depth':        [4, 6, 8],
    'learning_rate':    [0.01, 0.1, 0.2],
    'subsample':        [0.7, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.9, 1.0],
    'n_estimators':     [200, 500]
}

# Loop with tqdm over all combinations
best_score  = -np.inf
best_params = None

print("🔎 Starting manual grid search with tqdm:")

for params in tqdm(list(ParameterGrid(param_grid)), desc="Hyperparam search"):
    model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1,
        **params
    )
    
    scores = cross_val_score(
        model,
        X_res,
        y_res,
        cv=5,
        scoring='recall',
        n_jobs=-1
    )
    mean_score = scores.mean()
    if mean_score > best_score:
        best_score, best_params = mean_score, params

print(f"\n✅ Best params (CV recall={best_score:.3f}): {best_params}")

#Refit on the entire resampled train set
best_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    **best_params
).fit(X_res, y_res)

#Threshold tuning as before
probs = best_model.predict_proba(imp.transform(X_train))[:,1]
prec, rec, thr = precision_recall_curve(y_train, probs)
cands = [(p,r,t) for p,r,t in zip(prec,rec,thr) if r>=0.7]
best_thr = max(cands, key=lambda x: x[0])[2] if cands else 0.5
print(f"👉 Chosen threshold for recall≥0.7: {best_thr:.3f}")

#Final eval
probs_test = best_model.predict_proba(imp.transform(X_test))[:,1]
preds_test = (probs_test >= best_thr).astype(int)
print("\nFinal XGBoost Test Report:")
print(classification_report(y_test, preds_test, digits=3))


🔎 Starting manual grid search with tqdm:


Hyperparam search:   0%|          | 0/810 [00:00<?, ?it/s]


✅ Best params (CV recall=0.997): {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 200, 'scale_pos_weight': np.float64(20.0), 'subsample': 0.7}


Parameters: { "use_label_encoder" } are not used.



👉 Chosen threshold for recall≥0.7: 0.966

Final XGBoost Test Report:
              precision    recall  f1-score   support

         0.0      0.973     0.948     0.960       229
         1.0      0.294     0.455     0.357        11

    accuracy                          0.925       240
   macro avg      0.634     0.701     0.659       240
weighted avg      0.942     0.925     0.933       240



In [4]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split


df = pd.read_csv('fd.csv')
X = df.drop(columns='Bleaching'); y = df['Bleaching']
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


pipe = Pipeline([
    ('imp', SimpleImputer(strategy='median')),
    ('sm',  SMOTE(sampling_strategy=1.0, random_state=42)),
    ('clf', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ))
])


neg, pos = (y_tr==0).sum(), (y_tr==1).sum()
ratio = neg/pos
param_dist = {
    'clf__max_depth':       [4, 6, 8, 12],
    'clf__learning_rate':   [0.01, 0.05, 0.1],
    'clf__subsample':       [0.6, 0.8, 1.0],
    'clf__colsample_bytree':[0.6, 0.8, 1.0],
    'clf__scale_pos_weight':[ratio, ratio*2, ratio*5]
}


search = HalvingRandomSearchCV(
    pipe,
    param_dist,
    resource='clf__n_estimators',
    max_resources=800,
    min_resources=100,
    factor=3,
    scoring='recall',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

search.fit(X_tr, y_tr)
print("✅ Best params:", search.best_params_)
print("Best n_estimators:", search.best_params_['clf__n_estimators'])  # how many trees were used


best = search.best_estimator_
probs = best.predict_proba(best.named_steps['imp'].transform(X_te))[:,1]
from sklearn.metrics import classification_report

print("\nFinal Test Report:")
print(classification_report(y_te, (probs>=0.5).astype(int), digits=3))


n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 100
max_resources_: 800
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 8
n_resources: 100
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 1
n_candidates: 3
n_resources: 300
Fitting 3 folds for each of 3 candidates, totalling 9 fits


Parameters: { "use_label_encoder" } are not used.



✅ Best params: {'clf__subsample': 0.6, 'clf__scale_pos_weight': np.float64(21.325581395348838), 'clf__max_depth': 6, 'clf__learning_rate': 0.01, 'clf__colsample_bytree': 0.6, 'clf__n_estimators': 300}
Best n_estimators: 300

Final Test Report:
              precision    recall  f1-score   support

         0.0      0.990     0.878     0.931       229
         1.0      0.243     0.818     0.375        11

    accuracy                          0.875       240
   macro avg      0.617     0.848     0.653       240
weighted avg      0.956     0.875     0.905       240





In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE


df = pd.read_csv('fd.csv')  


target = 'Bleaching'


drop_cols = ['Year', 'Month']  # Optional, based on domain knowledge
features = df.drop(columns=[target] + drop_cols)
X = features
y = df[target]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


smote = SMOTE(sampling_strategy='minority', random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)


xgb = XGBClassifier(scale_pos_weight=len(y_train) / sum(y_train == 1), random_state=42)  # Adjust for class imbalance
xgb.fit(X_res, y_res)


y_pred = xgb.predict(X_test)

print("\nClassification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.97      0.98       230
         1.0       0.50      0.70      0.58        10

    accuracy                           0.96       240
   macro avg       0.74      0.83      0.78       240
weighted avg       0.97      0.96      0.96       240

