In [12]:
import os

import matplotlib.pyplot as plt
import numpy as np

"Machine learning tools"
import pickle

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, train_test_split


from classification.datasets import Dataset
from classification.utils.audio_student import AudioUtil, Feature_vector_DS

from classification.utils.plots import (
    plot_decision_boundaries,
    plot_specgram,
    show_confusion_matrix,
)
from classification.utils.utils import accuracy

In [13]:
# Seed NumPy's legacy global random generator so NumPy-based randomness in later cells is repeatable.
np.random.seed(0)

In [14]:
### TO RUN
# Instantiate the project dataset and ask it for its list of class names.
dataset = Dataset()
classnames = dataset.list_classes()

# Show one class name per line.
print("\n".join(classnames))

chainsaw
fire
fireworks
gunshot


In [15]:
### TO RUN
# Output locations (relative to the current working directory); both are
# created if they do not exist yet.
fm_dir = "data/feature_matrices/"  # where to save the features matrices
model_dir = "data/models/xgb_new_features"  # where to save the models
os.makedirs(fm_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

In [16]:
### TO RUN

# The bare string literals in this cell are no-op expression statements used
# as section labels.
"Creation of the dataset"
# Feature-vector view over `dataset`, configured by the keyword arguments below.
myds = Feature_vector_DS(dataset, Nft=512, nmel=20, duration=950, shift_pct=0.0)

"Some attributes..."
# Bare attribute reads: nothing is assigned, and because none of them is the
# last expression of the cell, nothing is displayed either.
myds.nmel
myds.duration
myds.shift_pct
myds.sr
myds.data_aug
myds.ncol

idx = 0

# XGBOOST PARAMETERS
# Read later by the final-model cell when it builds every XGBClassifier.
# NOTE(review): presumably the outcome of an earlier tuning run — confirm.
n_estimators = 123
max_depth = 14
learning_rate = 0.1091
subsample = 0.5191
colsample_bytree = 0.9210

In [17]:

import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
# LabelEncoder lives in sklearn.preprocessing; sklearn.calibration only
# happens to re-export it as an implementation detail.
from sklearn.preprocessing import LabelEncoder, StandardScaler

# NOTE: train_pct and data_aug_factor are not read in this cell; the split
# below hard-codes test_size=0.3. They are kept because other cells may use them.
train_pct = 0.7
data_aug_factor = 1
featveclen = len(myds["fire", 0, "", ""])  # Same for all classes
classnames = ["chainsaw", "fire", "fireworks", "gunshot"]  # Or wherever you store class names
nclass = len(classnames)

# Number of audio files used per class (hard-coded).
naudio_per_class = {"chainsaw" : 70, "fire" : 76, "fireworks" : 75, "gunshot" : 40}


# Allocate the feature matrix (one row per sound) and the label vector.
total_samples_basic = sum(naudio_per_class[c] for c in classnames)
X_basic = np.zeros((total_samples_basic, featveclen))
y_basic = np.zeros((total_samples_basic), dtype=object)

# Fill the feature matrix class by class, in `classnames` order.
idx = 0
for classname in classnames:
    for i in range(naudio_per_class[classname]):
        featvec = myds[classname, i, "", ""]
        X_basic[idx, :] = featvec
        y_basic[idx] = classname
        idx += 1


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_basic, y_basic, test_size=0.3, random_state=42)

# Z-score normalisation: statistics are fitted on the training set only and
# re-used unchanged on the test set.
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

# Encode the class-name strings as integer labels (fitted on the training labels).
le           = LabelEncoder()
y_train     = le.fit_transform(y_train)
y_test      = le.transform(y_test)

# Save the feature matrices and the (already encoded) labels
np.save(os.path.join(fm_dir, "X_train.npy"), X_train)
np.save(os.path.join(fm_dir, "X_test.npy"), X_test)
np.save(os.path.join(fm_dir, "y_train.npy"), y_train)
np.save(os.path.join(fm_dir, "y_test.npy"), y_test)
np.save(os.path.join(fm_dir, "X_train_norm.npy"), X_train_norm)
np.save(os.path.join(fm_dir, "X_test_norm.npy"), X_test_norm)

print(f"Shape of the training matrix : {X_train.shape}")
print(f"Shape of the test matrix : {X_test.shape}")

Shape of the training matrix : (182, 400)
Shape of the test matrix : (79, 400)


We can now build an augmented dataset and check whether the classification results improve.

In [18]:
### AUGMENTED DATASET
list_augmentation = ["original", "noise", "shifting"]
myds.mod_data_aug(list_augmentation)
print("Number of transformations : ", myds.data_aug_factor)


# Prepare the splits
X_train_list, y_train_list = [], []
X_test_list,  y_test_list  = [], []

for classname in classnames:
    n = naudio_per_class[classname]

    # Split the indices of the ORIGINAL sounds, so that every augmented
    # variant of a given sound ends up on the same side of the split.
    original_indices = list(range(n))
    train_idx, test_idx = train_test_split(original_indices, test_size=0.3, random_state=42)

    for i in train_idx:
        for aug in list_augmentation:
            featvec = myds[classname, i, aug, ""]
            X_train_list.append(featvec)
            y_train_list.append(classname)

    # Note: the test set also receives every augmentation of its sounds.
    for i in test_idx:
        for aug in list_augmentation:
            featvec = myds[classname, i, aug, ""]
            X_test_list.append(featvec)
            y_test_list.append(classname)

# Convert to numpy arrays
X_train_aug = np.array(X_train_list)
y_train_aug = np.array(y_train_list, dtype=object)

X_test_aug = np.array(X_test_list)
y_test_aug = np.array(y_test_list, dtype=object)

# Z-score normalisation. The scaler is fitted on the training set ONLY and
# then applied to the test set: calling fit_transform on the test set would
# standardise it with its own statistics, so train and test features would
# no longer live on the same scale.
scaler = StandardScaler()
X_train_aug_norm = scaler.fit_transform(X_train_aug)
X_test_aug_norm = scaler.transform(X_test_aug)

# Encode the class-name strings as integer labels.
y_train_aug     = le.fit_transform(y_train_aug)
y_test_aug      = le.transform(y_test_aug)

# Save features and labels
np.save(os.path.join(fm_dir, "X_train_aug.npy"), X_train_aug)
np.save(os.path.join(fm_dir, "X_test_aug.npy"), X_test_aug)
np.save(os.path.join(fm_dir, "y_train_aug.npy"), y_train_aug)
np.save(os.path.join(fm_dir, "y_test_aug.npy"), y_test_aug)
np.save(os.path.join(fm_dir, "X_train_aug_norm.npy"), X_train_aug_norm)
np.save(os.path.join(fm_dir, "X_test_aug_norm.npy"), X_test_aug_norm)

print(f"Shape of the training matrix : {X_train_aug.shape}")
print(f"Shape of the test matrix : {X_test_aug.shape}")
print(f"------------------------------------------------------------")
print(f"Transformations: {list_augmentation}. Labels aligned dynamically with class sizes.")


Number of transformations :  3
Shape of the training matrix : (546, 400)
Shape of the test matrix : (237, 400)
------------------------------------------------------------
Transformations: ['original', 'noise', 'shifting']. Labels aligned dynamically with class sizes.


In [19]:
# ------------------------------------------------------------------
# PARAMETERS
# ------------------------------------------------------------------
NEW_FEATURES   = True         # enable / disable the variance-feature block
VARIANCE_MODE  = "both"       # "band" | "column" | "both"
N_MELS         = 20           # must match the mel-band count of the melspectrograms
# ------------------------------------------------------------------

if NEW_FEATURES:

    def add_variance_features(X, n_mels=N_MELS, mode="both"):
        """
        Append variance features to each flattened melspectrogram.

        Every row of ``X`` is viewed as an ``(n_mels, n_cols)`` matrix using a
        row-major reshape, with ``n_cols = n_features // n_mels``.
        NOTE(review): this assumes the feature vectors were flattened
        row-major from an (n_mels, n_cols) array — confirm against the
        feature extractor.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_mels * n_cols)
            Original feature matrix (flattened melspectrograms).
        n_mels : int
            Number of rows of the per-sample matrix before flattening.
        mode : {"band", "column", "both"}
            - "band"   : variance along each row      (n_mels values)
            - "column" : variance along each column   (n_cols values)
            - "both"   : band variances followed by column variances

        Returns
        -------
        X_new : ndarray of float32, shape (n_samples, n_features + extra)
            ``X`` with the requested variance columns appended on the right.

        Raises
        ------
        ValueError
            If ``mode`` is unknown, ``X`` is not 2-D, or the number of
            features is not a multiple of ``n_mels``.
        """
        if mode not in {"band", "column", "both"}:
            raise ValueError("mode must be 'band', 'column' or 'both'")

        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")

        n_samples, n_features = X.shape
        if n_mels <= 0 or n_features % n_mels != 0:
            raise ValueError(
                f"n_features ({n_features}) must be a positive multiple of n_mels ({n_mels})"
            )
        n_cols = n_features // n_mels

        # One (n_mels, n_cols) matrix per sample, processed for all samples at
        # once instead of looping over rows in Python. This also handles an
        # empty X, which the row loop could not stack.
        mel = X.reshape(n_samples, n_mels, n_cols)
        blocks = [X]                                   # original vectors first

        if mode in {"band", "both"}:
            blocks.append(mel.var(axis=2, ddof=0))     # (n_samples, n_mels)

        if mode in {"column", "both"}:
            blocks.append(mel.var(axis=1, ddof=0))     # (n_samples, n_cols)

        return np.hstack(blocks).astype(np.float32)

    # ------------------------------------------------------------------
    # APPLY TO ALL EIGHT FEATURE MATRICES
    # ------------------------------------------------------------------
    # Guard against running this cell twice. The matrices are overwritten in
    # place, and an already-extended matrix can still have a width divisible
    # by N_MELS, so a second run would silently reshape the vectors wrongly
    # and append a second set of variance columns.
    for _name, _matrix in (
        ("X_train", X_train),
        ("X_test", X_test),
        ("X_train_aug", X_train_aug),
        ("X_test_aug", X_test_aug),
        ("X_train_norm", X_train_norm),
        ("X_test_norm", X_test_norm),
        ("X_train_aug_norm", X_train_aug_norm),
        ("X_test_aug_norm", X_test_aug_norm),
    ):
        if _matrix.shape[1] != featveclen:
            raise RuntimeError(
                f"{_name} has {_matrix.shape[1]} columns but {featveclen} were expected: "
                "the variance features seem to have been appended already. "
                "Re-run the feature-extraction cells before running this cell again."
            )

    # NOTE(review): for the *_norm matrices the variances are computed on
    # z-scored values and the appended columns are not standardised
    # themselves — confirm this is intended.
    X_train          = add_variance_features(X_train,          N_MELS, VARIANCE_MODE)
    X_test           = add_variance_features(X_test,           N_MELS, VARIANCE_MODE)
    X_train_aug      = add_variance_features(X_train_aug,      N_MELS, VARIANCE_MODE)
    X_test_aug       = add_variance_features(X_test_aug,       N_MELS, VARIANCE_MODE)
    X_train_norm     = add_variance_features(X_train_norm,    N_MELS, VARIANCE_MODE)
    X_test_norm      = add_variance_features(X_test_norm,     N_MELS, VARIANCE_MODE)
    X_train_aug_norm = add_variance_features(X_train_aug_norm, N_MELS, VARIANCE_MODE)
    X_test_aug_norm  = add_variance_features(X_test_aug_norm,  N_MELS, VARIANCE_MODE)
    # ------------------------------------------------------------------

    # Overwrite the matrices saved by the earlier cells with the extended ones.
    np.save(os.path.join(fm_dir, "X_train.npy"), X_train)
    np.save(os.path.join(fm_dir, "X_test.npy"), X_test)
    np.save(os.path.join(fm_dir, "X_train_norm.npy"), X_train_norm)
    np.save(os.path.join(fm_dir, "X_test_norm.npy"), X_test_norm)
    np.save(os.path.join(fm_dir, "X_train_aug.npy"), X_train_aug)
    np.save(os.path.join(fm_dir, "X_test_aug.npy"), X_test_aug)
    np.save(os.path.join(fm_dir, "X_train_aug_norm.npy"), X_train_aug_norm)
    np.save(os.path.join(fm_dir, "X_test_aug_norm.npy"), X_test_aug_norm)

    print(f"✅ Variance ({VARIANCE_MODE}) ajoutée !  "
          f"Nouvelles dimensions : {X_train.shape[1]} features")

✅ Variance (both) ajoutée !  Nouvelles dimensions : 440 features


In [20]:
from classification.utils.plots import plot_specgram_textlabel
# ------------------------------------------------------------------
# Optional export: save one PNG per test-set feature vector.
# Disabled by default (BASIC = False).
BASIC = False
if BASIC:
    # Load the data. y_test.npy holds the label-encoded (integer) classes
    # written by the feature-extraction cell.
    X = np.load(os.path.join(fm_dir, "X_test.npy"), allow_pickle=True)
    y = np.load(os.path.join(fm_dir, "y_test.npy"), allow_pickle=True)

    # Folder where the images are saved
    save_dir = os.path.join("src/classification/soundfiles_melspec")
    os.makedirs(save_dir, exist_ok=True)

    # Per-class counters, used to number the images of each class from 0
    class_counters = {}

    for i in range(len(X)):
        melspec = X[i]
        class_of_spec = y[i]

        if class_of_spec not in class_counters:
            class_counters[class_of_spec] = 0
        class_idx = class_counters[class_of_spec]

        fig, ax = plt.subplots()
        plot_specgram_textlabel(  # ✅ corrected function
            # NOTE(review): (22, 20) requires 440-element vectors, i.e. the
            # matrices AFTER the variance features were appended; the extra
            # entries are then drawn as part of the image — confirm intent.
            melspec.reshape((22, 20)),
            ax=ax,
            is_mel=True,
            title=f"MEL Spectrogram - {class_of_spec} #{class_idx}",
            xlabel="Mel vector",
            textlabel=f"{class_of_spec}",
        )
        plt.tight_layout()
        save_path = os.path.join(save_dir, f"melspec_{class_of_spec}_{class_idx}.png")
        fig.savefig(save_path)
        plt.close(fig)  # release the figure; one is created per sample

        class_counters[class_of_spec] += 1


FINAL MODEL SAVE

In [21]:
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

# Config
# TEST_SET toggles the evaluation at the end of the cell; A..H toggle the
# individual training scenarios.
TEST_SET = True

A = True  # PCA NOAUG NONORM
B = True  # NOPCA NOAUG NONORM
C = True  # PCA AUG NONORM
D = True  # NOPCA AUG NONORM
E = True  # NOPCA NOAUG Z-SCORE (scenario E trains on X_train_norm)
F = True  # PCA NOAUG Z-SCORE
G = True  # PCA AUG Z-SCORE
H = True  # NOPCA AUG Z-SCORE


# === NON-augmented data ===
# Reloaded from disk, i.e. whatever the previous cells last saved there.
X_train        = np.load(os.path.join(fm_dir, "X_train.npy"))
X_test         = np.load(os.path.join(fm_dir, "X_test.npy"))
y_train        = np.load(os.path.join(fm_dir, "y_train.npy"), allow_pickle=True)
y_test         = np.load(os.path.join(fm_dir, "y_test.npy"), allow_pickle=True)

X_train_norm   = np.load(os.path.join(fm_dir, "X_train_norm.npy"))
X_test_norm    = np.load(os.path.join(fm_dir, "X_test_norm.npy"))

# === AUGMENTED data ===
X_train_aug        = np.load(os.path.join(fm_dir, "X_train_aug.npy"))
X_test_aug         = np.load(os.path.join(fm_dir, "X_test_aug.npy"))
y_train_aug        = np.load(os.path.join(fm_dir, "y_train_aug.npy"), allow_pickle=True)
y_test_aug         = np.load(os.path.join(fm_dir, "y_test_aug.npy"), allow_pickle=True)

X_train_aug_norm   = np.load(os.path.join(fm_dir, "X_train_aug_norm.npy"))
X_test_aug_norm    = np.load(os.path.join(fm_dir, "X_test_aug_norm.npy"))


# ==========================================================
# SHARED HELPERS FOR THE EIGHT SCENARIOS
# ==========================================================
def _dump_pickle(obj, filename):
    """Pickle `obj` into the file `filename` inside `model_dir`."""
    with open(os.path.join(model_dir, filename), "wb") as f:
        pickle.dump(obj, f)


def _fit_pca(X_fit, X_apply, filename):
    """Fit PCA(n_components=0.98) on `X_fit`, project both matrices, save the PCA.

    Returns the fitted PCA, the projected `X_fit` and the projected `X_apply`.
    """
    pca = PCA(n_components=0.98)
    X_fit_proj = pca.fit_transform(X_fit)
    X_apply_proj = pca.transform(X_apply)
    _dump_pickle(pca, filename)
    return pca, X_fit_proj, X_apply_proj


def _fit_xgb(X_fit, y_fit, filename):
    """Train an XGBClassifier with the notebook-level hyperparameters and save it.

    Returns the fitted classifier.
    """
    xgb = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                        subsample=subsample, colsample_bytree=colsample_bytree,
                        eval_metric='mlogloss', random_state=42)
    xgb.fit(X_fit, y_fit)
    _dump_pickle(xgb, filename)
    return xgb


# =========================
# SCENARIO A -- PCA, NO AUG, NON-NORM
# =========================
if A:
    pca_A, X_train_A, X_test_A = _fit_pca(X_train, X_test, "pca_noaug_nonorm.pickle")
    xgb_A = _fit_xgb(X_train_A, y_train, "xgb_new_features_pca_noaug_nonorm.pickle")

# =========================
# SCENARIO B -- NOPCA, NO AUG, NON-NORM
# =========================
if B:
    xgb_B = _fit_xgb(X_train, y_train, "xgb_new_features_nopca_noaug_nonorm.pickle")

# =========================
# SCENARIO C -- PCA, AUG, NON-NORM
# =========================
if C:
    pca_C, X_train_C, X_test_C = _fit_pca(X_train_aug, X_test_aug, "pca_aug_nonorm.pickle")
    xgb_C = _fit_xgb(X_train_C, y_train_aug, "xgb_new_features_pca_aug_nonorm.pickle")

# =========================
# SCENARIO D -- NOPCA, AUG, NON-NORM
# =========================
if D:
    xgb_D = _fit_xgb(X_train_aug, y_train_aug, "xgb_new_features_nopca_aug_nonorm.pickle")

# =========================
# SCENARIO E -- NOPCA, NO AUG, NORM
# =========================
if E:
    # This model is trained on the z-scored matrix, so it is saved under the
    # "_norm" name like scenarios F/G/H. It used to be written to
    # "xgb_new_features_nopca_noaug_nonorm_dup.pickle", which wrongly
    # advertised a model for un-normalised inputs.
    xgb_E = _fit_xgb(X_train_norm, y_train, "xgb_new_features_nopca_noaug_norm.pickle")

# =========================
# SCENARIO F -- Z-SCORE, PCA, NO AUG
# =========================
if F:
    pca_F, X_train_F, X_test_F = _fit_pca(X_train_norm, X_test_norm, "pca_noaug_norm.pickle")
    xgb_F = _fit_xgb(X_train_F, y_train, "xgb_new_features_pca_noaug_norm.pickle")

# =========================
# SCENARIO G -- Z-SCORE, PCA, AUG
# =========================
if G:
    pca_G, X_train_G, X_test_G = _fit_pca(X_train_aug_norm, X_test_aug_norm, "pca_aug_norm.pickle")
    xgb_G = _fit_xgb(X_train_G, y_train_aug, "xgb_new_features_pca_aug_norm.pickle")

# =========================
# SCENARIO H -- Z-SCORE, AUG, NOPCA
# =========================
if H:
    xgb_H = _fit_xgb(X_train_aug_norm, y_train_aug, "xgb_new_features_nopca_aug_norm.pickle")

# ==========================================================
# FONCTION D'ÉVALUATION COMMUNE
# ==========================================================
def evaluate_model(model, X_test, y_test, description):
    """Print accuracy, cross-validation and per-class metrics for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test, y_test : features and true labels to evaluate on.
    description : str, title printed above the report.

    Notes
    -----
    The 5-fold cross-validation score is computed by ``cross_val_score`` on
    the data passed in (``X_test``, ``y_test``), not on the training data.
    The per-class "Accuracy" is the confusion-matrix diagonal divided by the
    row sum for that class. Nothing is returned; the report is printed.
    """
    y_hat = model.predict(X_test)
    labels = np.unique(y_test)

    # Per-class scores, ordered like `labels`.
    per_class_precision = precision_score(y_test, y_hat, average=None, labels=labels)
    per_class_recall = recall_score(y_test, y_hat, average=None, labels=labels)
    conf_mat = confusion_matrix(y_test, y_hat, labels=labels)

    per_class_accuracy = [conf_mat[k, k] / conf_mat[k, :].sum() for k in range(len(labels))]

    fold_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='accuracy')
    mean_cv_accuracy = fold_scores.mean()

    print(f"\n=== {description} ===")
    print(f"Overall Test Accuracy: {np.mean(y_hat == y_test):.4f}")
    print(f"Mean CV Accuracy  (5‑fold): {mean_cv_accuracy:.4f}")
    print("Per‑class metrics:")
    for cls, prec, rec, acc in zip(labels, per_class_precision, per_class_recall, per_class_accuracy):
        print(f"  Class {cls}: Precision={prec:.4f}, Recall={rec:.4f}, Accuracy={acc:.4f}")

# ==========================================================
# EVALUATION OF ALL VARIANTS
# ==========================================================
# Each enabled scenario is evaluated on the test matrix that went through the
# same preprocessing as its training matrix (PCA projection and/or z-score,
# augmented or not), with the matching label vector.
if TEST_SET:
    if A: evaluate_model(xgb_A, X_test_A, y_test, "Scenario A : PCA NOAUG NONORM")
    if B: evaluate_model(xgb_B, X_test,    y_test, "Scenario B : NOPCA NOAUG NONORM")
    if C: evaluate_model(xgb_C, X_test_C,  y_test_aug, "Scenario C : PCA AUG NONORM")
    if D: evaluate_model(xgb_D, X_test_aug, y_test_aug, "Scenario D : NOPCA AUG NONORM")
    if E: evaluate_model(xgb_E, X_test_norm,    y_test, "Scenario E : NOPCA NOAUG NORM")
    if F: evaluate_model(xgb_F, X_test_F,  y_test, "Scenario F : PCA NOAUG NORM")
    if G: evaluate_model(xgb_G, X_test_G,  y_test_aug, "Scenario G : PCA AUG NORM")
    if H: evaluate_model(xgb_H, X_test_aug_norm, y_test_aug, "Scenario H : NOPCA AUG NORM")


=== Scenario A : PCA NOAUG NONORM ===
Overall Test Accuracy: 0.6456
Mean CV Accuracy  (5‑fold): 0.5950
Per‑class metrics:
  Class 0: Precision=0.4444, Recall=0.4211, Accuracy=0.4211
  Class 1: Precision=0.7727, Recall=0.6071, Accuracy=0.6071
  Class 2: Precision=0.5556, Recall=0.7895, Accuracy=0.7895
  Class 3: Precision=0.9167, Recall=0.8462, Accuracy=0.8462

=== Scenario B : NOPCA NOAUG NONORM ===
Overall Test Accuracy: 0.8987
Mean CV Accuracy  (5‑fold): 0.7733
Per‑class metrics:
  Class 0: Precision=0.9412, Recall=0.8421, Accuracy=0.8421
  Class 1: Precision=0.8667, Recall=0.9286, Accuracy=0.9286
  Class 2: Precision=0.8421, Recall=0.8421, Accuracy=0.8421
  Class 3: Precision=1.0000, Recall=1.0000, Accuracy=1.0000

=== Scenario C : PCA AUG NONORM ===
Overall Test Accuracy: 0.7173
Mean CV Accuracy  (5‑fold): 0.6071
Per‑class metrics:
  Class 0: Precision=0.6154, Recall=0.5079, Accuracy=0.5079
  Class 1: Precision=0.7500, Recall=0.7826, Accuracy=0.7826
  Class 2: Precision=0.7222, Re

KeyboardInterrupt: 

HYPERPARAMETER TUNING

In [None]:
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from bayes_opt import BayesianOptimization

# Your custom accuracy function
from classification.utils.utils import accuracy

# --- CONFIG FLAGS ---
NORMALIZATION = True   # use the z-scored feature matrices
TRANSFORMATION = True  # use the augmented dataset

# === Load the training data ===
# The file name is built from the two flags independently, so features and
# labels always come from the same dataset variant:
#   TRANSFORMATION -> "_aug" in both the X and y file names
#   NORMALIZATION  -> "_norm" in the X file name only (labels are shared)
# The previous if/elif chain loaded X_train.npy together with y_train_aug.npy
# when TRANSFORMATION was on and NORMALIZATION off.
_aug_tag = "_aug" if TRANSFORMATION else ""
_norm_tag = "_norm" if NORMALIZATION else ""

X = np.load(os.path.join(fm_dir, f"X_train{_aug_tag}{_norm_tag}.npy"))
y = np.load(os.path.join(fm_dir, f"y_train{_aug_tag}.npy"), allow_pickle=True)

if X.shape[0] != y.shape[0]:
    raise ValueError(
        f"Feature/label mismatch: X has {X.shape[0]} rows but y has {y.shape[0]} labels"
    )

# --- STEP 2: Define the Objective Function for Bayesian Optimization ---
def xgb_cv(n_estimators, max_depth, learning_rate, subsample, colsample_bytree):
    """Objective for the Bayesian optimiser: mean 5-fold CV accuracy.

    The optimiser passes every hyperparameter as a float, so the two integer
    ones (`n_estimators`, `max_depth`) are truncated with ``int``. The model is
    cross-validated on the notebook-level training data ``X`` / ``y``.

    Returns
    -------
    float
        Mean accuracy over the 5 folds.
    """
    candidate = XGBClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        eval_metric='mlogloss',
        random_state=42,
    )
    fold_accuracies = cross_val_score(candidate, X, y, cv=5, scoring='accuracy')
    return fold_accuracies.mean()

# --- STEP 3: Define hyperparameter search space ---
# (lower, upper) bounds for each argument of xgb_cv; the keys must match its
# parameter names. n_estimators and max_depth are searched as floats and
# truncated to int inside xgb_cv.
pbounds = {
    'n_estimators': (50, 400),
    'max_depth': (2, 15),
    'learning_rate': (0.01, 0.3),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1)
}

# Optimiser that maximises xgb_cv over `pbounds`, with a fixed seed.
optimizer = BayesianOptimization(
    f=xgb_cv,
    pbounds=pbounds,
    random_state=42
)

# --- STEP 4: Run Bayesian Optimization ---
init_points = 3
n_iter = 20
print("Starting Bayesian Optimization...")

best_score_so_far = -1.0
early_stop_threshold = 0.90

# The whole search (init_points + n_iter evaluations) is requested here.
optimizer.maximize(init_points=init_points, n_iter=n_iter)

# NOTE(review): this loop runs AFTER maximize() has returned, so the `break`
# only stops printing the remaining results; it does not shorten the search
# despite the "Early stopping" message.
for i, res in enumerate(optimizer.res):
    score = res['target']
    print(f"Iteration {i+1}, CV Accuracy: {score:.4f}, Parameters: {res['params']}")
    if score > best_score_so_far:
        best_score_so_far = score
    if best_score_so_far > early_stop_threshold:
        print(f"\nEarly stopping: Found cross-validation accuracy above {early_stop_threshold}\n")
        break

# --- STEP 5: Retrieve best hyperparameters ---
# optimizer.max holds the best 'target' and its 'params'; the two integer
# hyperparameters are truncated the same way xgb_cv does.
best_params = optimizer.max['params']
best_n_estimators = int(best_params['n_estimators'])
best_max_depth = int(best_params['max_depth'])
best_learning_rate = best_params['learning_rate']
best_subsample = best_params['subsample']
best_colsample_bytree = best_params['colsample_bytree']

print("\n=== BEST HYPERPARAMETERS FOUND ===")
print(f"n_estimators = {best_n_estimators}")
print(f"max_depth = {best_max_depth}")
print(f"learning_rate = {best_learning_rate:.4f}")
print(f"subsample = {best_subsample:.4f}")
print(f"colsample_bytree = {best_colsample_bytree:.4f}")
print(f"CV Accuracy = {optimizer.max['target']:.4f}")

# --- STEP 6: Final evaluation on hold-out test set ---
# Pick the test matrix / labels matching the variant selected by the flags.
# Note: this rebinds the notebook-level X_test and y_test.
X_test = np.load(os.path.join(fm_dir, "X_test_aug_norm.npy" if (TRANSFORMATION and NORMALIZATION) else 
                              "X_test_aug.npy" if TRANSFORMATION else
                              "X_test_norm.npy" if NORMALIZATION else
                              "X_test.npy"))
y_test = np.load(os.path.join(fm_dir, "y_test_aug.npy" if TRANSFORMATION else "y_test.npy"), allow_pickle=True)

# Refit on the full training matrix with the best hyperparameters.
# NOTE(review): random_state is 999 here but 42 inside xgb_cv — confirm.
final_model = XGBClassifier(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    eval_metric='mlogloss',
    random_state=999
)

final_model.fit(X, y)
y_pred = final_model.predict(X_test)
test_acc = accuracy(y_pred, y_test)

print("\n=== FINAL EVALUATION ON HOLDOUT TEST SET ===")
print(f"Test Accuracy: {test_acc:.4f}")

# --- Save best model and optionally normalization stats ---
# Written to a local "models" folder, not to `model_dir`.
os.makedirs("models", exist_ok=True)
with open("models/xgb_bayesopt_best_model.pkl", "wb") as f:
    pickle.dump(final_model, f)

if NORMALIZATION:
    # NOTE(review): X is the matrix loaded above, which is already z-scored
    # when NORMALIZATION is on, so this stores the mean/std of the normalised
    # features rather than the statistics needed to normalise raw inputs.
    # Confirm whether the fitted scaler should be saved instead.
    with open("models/zscore_normalization_stats.pickle", "wb") as f:
        pickle.dump((np.mean(X, axis=0), np.std(X, axis=0)), f)

Starting Bayesian Optimization...
|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------


KeyboardInterrupt: 