In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, make_scorer, fbeta_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from lightgbm import LGBMClassifier

# 1. Load DS A
df = pd.read_csv('fd.csv')  # your file with the DS A schema
X = df.drop(columns=['Bleaching'])
y = df['Bleaching']

# 2. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# 3. Preprocessing + SMOTE pipeline
imputer = SimpleImputer(strategy='median')
smote   = SMOTE(sampling_strategy='minority', random_state=42)

lgbm_pipe = Pipeline([
    ('impute', imputer),
    ('smote',  smote),
    ('clf',    LGBMClassifier(
        objective='binary',
        random_state=42,
        n_jobs=-1
    ))
])

# 4. Scale_pos_weight for class imbalance
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
ratio = neg / pos

# 5. Hyperparameter search space
param_dist = {
    'clf__n_estimators':    [100, 200, 500, 800],
    'clf__num_leaves':      [31, 50, 100, 150],
    'clf__learning_rate':   [0.01, 0.05, 0.1],
    'clf__feature_fraction':[0.6, 0.8, 1.0],
    'clf__bagging_fraction':[0.6, 0.8, 1.0],
    'clf__scale_pos_weight': [ratio, ratio*2, ratio*5]
}

# 6. F2 scorer to prioritize recall for class 1
f2 = make_scorer(fbeta_score, beta=2, pos_label=1)

# 7. RandomizedSearchCV
search = RandomizedSearchCV(
    lgbm_pipe,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring=f2,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

best_model = search.best_estimator_
print("🔍 Best LightGBM params:", search.best_params_)

# 8. Predict & evaluate
y_pred = best_model.predict(X_test)
print("\nLightGBM Test Report:")
print(classification_report(y_test, y_pred, digits=3))


Fitting 5 folds for each of 30 candidates, totalling 150 fits




[LightGBM] [Info] Number of positive: 917, number of negative: 917
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7098
[LightGBM] [Info] Number of data points in the train set: 1834, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
🔍 Best LightGBM params: {'clf__scale_pos_weight': np.float64(106.62790697674419), 'clf__num_leaves': 31, 'clf__n_estimators': 500, 'clf__learning_rate': 0.05, 'clf__feature_fraction': 0.8, 'clf__bagging_fraction': 0.6}

LightGBM Test Report:
              precision    recall  f1-score   support

         0.0      0.978     0.969     0.974       229
         1.0      0.462     0.545     0.500        11

    accuracy                          0.950       240
   macro avg      0.720     0.757     0.737       240
weighted avg      0.954     0.950     0.952       24



In [2]:
import numpy as np
from sklearn.metrics import precision_recall_curve, fbeta_score

# 1. Assume you have:
#    best_model  = your LGBMPipeline fit on X_train,y_train
#    X_train, y_train, X_test, y_test as before

# 2. Split off a small validation fold BEFORE SMOTE
from sklearn.model_selection import train_test_split
X_tr2, X_val, y_tr2, y_val = train_test_split(
    X_train, y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=42
)

# 3. Re-fit your best_model on X_tr2/y_tr2 (with SMOTE inside pipeline)
best_model.fit(X_tr2, y_tr2)

# 4. Get probabilities on X_val (imputation done inside pipeline)
probs_val = best_model.predict_proba(X_val)[:,1]

# 5. Compute precision‐recall curve
prec, rec, thr = precision_recall_curve(y_val, probs_val)

# 6. Compute F₂ at each threshold and pick best
beta = 2
f2_scores = (1 + beta**2) * (prec * rec) / (beta**2 * prec + rec + 1e-8)
best_idx   = np.nanargmax(f2_scores)
best_thresh = thr[best_idx]
print(f"Picked threshold = {best_thresh:.3f} → val F₂ = {f2_scores[best_idx]:.3f}")

# 7. Apply to your hold-out test set
probs_test = best_model.predict_proba(X_test)[:,1]
y_pred     = (probs_test >= best_thresh).astype(int)

from sklearn.metrics import classification_report
print("Final Test Report (@ tuned threshold):")
print(classification_report(y_test, y_pred, digits=3))




[LightGBM] [Info] Number of positive: 734, number of negative: 734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7470
[LightGBM] [Info] Number of data points in the train set: 1468, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Picked threshold = 0.017 → val F₂ = 0.636
Final Test Report (@ tuned threshold):
              precision    recall  f1-score   support

         0.0      0.982     0.948     0.964       229
         1.0      0.368     0.636     0.467        11

    accuracy                          0.933       240
   macro avg      0.675     0.792     0.716       240
weighted avg      0.954     0.933     0.942       240





In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    precision_recall_curve, fbeta_score, make_scorer, classification_report
)
from imblearn.over_sampling import SMOTE      # pip install imbalanced-learn
from imblearn.pipeline import Pipeline

from lightgbm import LGBMClassifier

# 1. Load DS A
df = pd.read_csv('fd.csv')
X = df.drop(columns=['Bleaching'])
y = df['Bleaching']

# 2. Train / val / test split
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, stratify=y_train_full, random_state=42
)

# 3. Impute + SMOTE for class imbalance
imputer = SimpleImputer(strategy='median')
smote   = SMOTE(sampling_strategy='minority', random_state=42)

# 4. Compute scale_pos_weight for LightGBM
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
ratio = neg / pos

# 5. Pipeline: impute → SMOTE → LGBM
pipe = Pipeline([
    ('impute', imputer),
    ('smote',  smote),
    ('clf',    LGBMClassifier(
        objective='binary',
        random_state=42,
        n_jobs=-1
    ))
])

# 6. F2 scorer (β=2)
f2_scorer = make_scorer(fbeta_score, beta=2, pos_label=1)

# 7. Hyperparameter distributions
param_dist = {
    'clf__n_estimators':     [100, 200, 500, 800],
    'clf__num_leaves':       [31, 50, 100, 150],
    'clf__learning_rate':    [0.01, 0.05, 0.1],
    'clf__feature_fraction': [0.6, 0.8, 1.0],
    'clf__bagging_fraction': [0.6, 0.8, 1.0],
    'clf__scale_pos_weight': [ratio, ratio*2, ratio*5, ratio*10]
}

# 8. RandomizedSearchCV optimizing F2
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring=f2_scorer,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

best_model = search.best_estimator_
print("🔍 Best hyperparameters:", search.best_params_)

# 9. Threshold tuning on validation set
probs_val = best_model.predict_proba(X_val)[:, 1]
prec, rec, thr = precision_recall_curve(y_val, probs_val)
beta = 2
f2_scores = (1 + beta**2) * (prec * rec) / (beta**2 * prec + rec + 1e-8)
best_idx = np.nanargmax(f2_scores)
best_threshold = thr[best_idx]
print(f"🎯 Chosen threshold = {best_threshold:.3f}, Validation F2 = {f2_scores[best_idx]:.3f}")

# 10. Final evaluation on the test set
probs_test = best_model.predict_proba(X_test)[:, 1]
y_pred = (probs_test >= best_threshold).astype(int)

print("\nFinal LightGBM Test Report:")
print(classification_report(y_test, y_pred, digits=3))


Fitting 5 folds for each of 30 candidates, totalling 150 fits




[LightGBM] [Info] Number of positive: 734, number of negative: 734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7470
[LightGBM] [Info] Number of data points in the train set: 1468, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
🔍 Best hyperparameters: {'clf__scale_pos_weight': np.float64(107.94117647058825), 'clf__num_leaves': 50, 'clf__n_estimators': 500, 'clf__learning_rate': 0.01, 'clf__feature_fraction': 0.6, 'clf__bagging_fraction': 1.0}
🎯 Chosen threshold = 0.981, Validation F2 = 0.795

Final LightGBM Test Report:
              precision    recall  f1-score   support

         0.0      0.970     0.987     0.978       229
         1.0      0.571     0.364     0.444        11

    accuracy                          0.958       240
   macro avg      0.771     0.675     0.711       



In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    precision_recall_curve,
    fbeta_score,
    make_scorer,
    classification_report
)
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline

from lightgbm import LGBMClassifier

# 1. Load DS A
df = pd.read_csv('fd.csv')
X = df.drop(columns='Bleaching')
y = df['Bleaching']

# 2. Train / validation / test split
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2,
    stratify=y_train_full, random_state=42
)

# 3. Pipeline: median imputation → SMOTE+ENN
pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('smp',    SMOTEENN(sampling_strategy='auto', random_state=42)),
    ('clf',    LGBMClassifier(objective='binary', random_state=42, n_jobs=-1))
])

# 4. F2 scorer (β=2)
f2 = make_scorer(fbeta_score, beta=2, pos_label=1)

# 5. Hyperparameter grid
param_dist = {
    'clf__boosting_type':    ['gbdt', 'dart'],
    'clf__n_estimators':     [200, 500, 800],
    'clf__num_leaves':       [31, 50, 100, 150],
    'clf__learning_rate':    [0.01, 0.05, 0.1],
    'clf__feature_fraction': [0.6, 0.8, 1.0],
    'clf__bagging_fraction': [0.6, 0.8, 1.0],
    'clf__reg_alpha':        [0, 0.1, 1],
    'clf__reg_lambda':       [0, 0.1, 1],
    # since SMOTEENN balances, leave scale_pos_weight=1
    'clf__scale_pos_weight': [1]
}

# 6. RandomizedSearchCV optimizing F2
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=40,
    cv=5,
    scoring=f2,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=2
)
search.fit(X_train, y_train)

best_model = search.best_estimator_
print("🔍 Best hyperparameters:", search.best_params_)

# 7. Threshold tuning on validation set
probs_val = best_model.predict_proba(X_val)[:, 1]
prec, rec, thr = precision_recall_curve(y_val, probs_val)

beta = 2
f2_scores = (1 + beta**2) * (prec * rec) / (beta**2 * prec + rec + 1e-8)
best_idx = np.nanargmax(f2_scores)
best_threshold = thr[best_idx]
print(f"🎯 Chosen threshold = {best_threshold:.3f} → Val F₂ = {f2_scores[best_idx]:.3f}")

# 8. Final evaluation on the hold‐out test set
probs_test = best_model.predict_proba(X_test)[:, 1]
y_pred = (probs_test >= best_threshold).astype(int)

print("\nFinal LightGBM Test Report:")
print(classification_report(y_test, y_pred, digits=3))


Fitting 5 folds for each of 40 candidates, totalling 200 fits




[LightGBM] [Info] Number of positive: 728, number of negative: 656
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7451
[LightGBM] [Info] Number of data points in the train set: 1384, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.526012 -> initscore=0.104140
[LightGBM] [Info] Start training from score 0.104140
🔍 Best hyperparameters: {'clf__scale_pos_weight': 1, 'clf__reg_lambda': 0.1, 'clf__reg_alpha': 0, 'clf__num_leaves': 50, 'clf__n_estimators': 200, 'clf__learning_rate': 0.1, 'clf__feature_fraction': 1.0, 'clf__boosting_type': 'dart', 'clf__bagging_fraction': 1.0}
🎯 Chosen threshold = 0.956 → Val F₂ = 0.761

Final LightGBM Test Report:
              precision    recall  f1-score   support

         0.0      0.970     0.983     0.976       229
         1.0      0.500     0.364     0.421        11

    accuracy



In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Load your dataset
df = pd.read_csv('fd.csv')

# Define features and target
X = df.drop(columns=['Bleaching'])
y = df['Bleaching']

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (optional but often helps with models like LightGBM)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply SMOTE to the training set to balance the classes
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize the LightGBM classifier
lgbm = lgb.LGBMClassifier(n_estimators=1000, class_weight='balanced', random_state=42)

# Train the model on the resampled data
lgbm.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = lgbm.predict(X_test)

# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("\nClassification Report:")
print(class_report)



[LightGBM] [Info] Number of positive: 916, number of negative: 916
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3301
[LightGBM] [Info] Number of data points in the train set: 1832, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       230
         1.0       0.60      0.60      0.60        10

    accuracy                           0.97       240
   macro avg       0.79      0.79      0.79       240
weighted avg       0.97      0.97      0.97       240



