In [49]:
import os

import matplotlib.pyplot as plt
import numpy as np

"Machine learning tools"
import pickle

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, train_test_split


from classification.datasets import Dataset
from classification.utils.audio_student import AudioUtil, Feature_vector_DS

from classification.utils.plots import (
    plot_decision_boundaries,
    plot_specgram,
    show_confusion_matrix,
)
from classification.utils.utils import accuracy

In [50]:
np.random.seed(0)

In [51]:
### TO RUN
dataset = Dataset()
classnames = dataset.list_classes()

print("\n".join(classnames))

chainsaw
fire
fireworks
gunshot


In [52]:
### TO RUN
fm_dir = "data/feature_matrices/"  # where to save the features matrices
new_dataset_dir = "src/classification/datasets/new_dataset/melvecs/"
model_dir = "data/models/xgb"  # where to save the models
os.makedirs(fm_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

In [53]:
### TO RUN

"Creation of the dataset"
myds = Feature_vector_DS(dataset, Nft=512, nmel=20, duration=950, shift_pct=0.0)

"Some attributes..."
myds.nmel
myds.duration
myds.shift_pct
myds.sr
myds.data_aug
myds.ncol

idx = 0

NONORM = False
zScore = False
MinMax = True

# XGBOOST PARAMETERS
if NONORM:
    n_estimators = 130
    max_depth = 2
    learning_rate = 0.2286
    subsample = 0.5984
    colsample_bytree = 0.6445

if zScore:  
    n_estimators = 311
    max_depth = 3
    learning_rate = 0.199
    subsample = 0.9796
    colsample_bytree = 0.6357
    
if MinMax:
    n_estimators = 130
    max_depth = 2
    learning_rate = 0.2286
    subsample = 0.5984
    colsample_bytree = 0.6445

In [54]:

import numpy as np

train_pct = 0.7
data_aug_factor = 1
featveclen = len(myds["fire", 0, "", ""])  # Same for all classes
classnames = ["chainsaw", "fire", "fireworks", "gunshot"]  # Or wherever you store class names
nclass = len(classnames)

# Determine number of samples per class
naudio_per_class = {"chainsaw" : 70, "fire" : 76, "fireworks" : 75, "gunshot" : 40}


# Allocate feature matrix
total_samples_basic = sum(naudio_per_class[c] for c in classnames)
X_basic = np.zeros((total_samples_basic, featveclen))
y_basic = np.zeros((total_samples_basic), dtype=object)
total_samples_basic
# Fill feature matrix
idx = 0
for class_idx, classname in enumerate(classnames):
    for i in range(naudio_per_class[classname]):
        featvec = myds[classname, i, "", ""]
        X_basic[idx, :] = featvec
        y_basic[idx] = classname
        idx += 1

# Save features and labels
np.save(fm_dir + "X_basic.npy", X_basic)
np.save(fm_dir + "y_basic.npy", y_basic)

print(f"Shape of the basic feature matrix : {X_basic.shape}")
print(f"Number of labels : {y_basic.shape}")


Shape of the basic feature matrix : (261, 400)
Number of labels : (261,)


We can now create a new augmented dataset and observe if the classification results improve. 

In [55]:

### AUGMENTED DATASET
list_augmentation = ["original", "noise", "shifting"]
myds.mod_data_aug(list_augmentation)
print("Number of transformations : ", myds.data_aug_factor)


# Calcul total des échantillons
total_aug_samples = sum(naudio_per_class[c] for c in classnames) * len(list_augmentation)
X_basic_aug = np.zeros((total_aug_samples, featveclen))
y_basic_aug = np.zeros((total_aug_samples), dtype=object)

# Remplissage des features
idx = 0
for aug in list_augmentation:
    for classname in classnames:
        for i in range(naudio_per_class[classname]):
            featvec = myds[classname, i, aug, ""]
            X_basic_aug[idx, :] = featvec
            y_basic_aug[idx] = classname
            idx += 1

# Sauvegarde
np.save(fm_dir + "X_basic_aug.npy", X_basic_aug)
np.save(fm_dir + "y_basic_aug.npy", y_basic_aug)

print(f"Shape of the feature matrix : {X_basic_aug.shape}")
print(f"Number of labels : {y_basic_aug.shape}")
print(f"------------------------------------------------------------")
print(f"Transformations: {list_augmentation}. Labels aligned dynamically with class sizes.")


Number of transformations :  3
Shape of the feature matrix : (783, 400)
Number of labels : (783,)
------------------------------------------------------------
Transformations: ['original', 'noise', 'shifting']. Labels aligned dynamically with class sizes.


In [56]:
# Normalisation L2 - Potentiellement bon pour XGBoost mais pas pour CNN, tester autres normalisations

def L2_normalization(X, method="l2"):
    if method == "l2":
        return np.array([x / np.linalg.norm(x) if np.linalg.norm(x) != 0 else x for x in X])
    elif method == "zscore":
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        std[std == 0] = 1e-8  # éviter division par zéro
        return (X - mean) / std
    elif method == "minmax":
        min_val = np.min(X, axis=0)
        max_val = np.max(X, axis=0)
        range_val = max_val - min_val
        range_val[range_val == 0] = 1e-8
        return (X - min_val) / range_val
    else:
        raise ValueError("Méthode de normalisation inconnue")

# Normalistation Z-score, 2ème méthode de normalisation
def zscore_normalization(X_train, X_test=None):
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    std[std == 0] = 1e-8  # éviter division par 0

    X_train_norm = (X_train - mean) / std
    if X_test is not None:
        X_test_norm = (X_test - mean) / std  # même centrage pour le test
        return X_train_norm, X_test_norm
    return X_train_norm

# Normalisation Min-Max, 3ème méthode de normalisation
def minmax_normalization(X_train, X_test=None):
    min_val = np.min(X_train, axis=0)
    max_val = np.max(X_train, axis=0)
    range_val = max_val - min_val
    range_val[range_val == 0] = 1e-8  # éviter division par 0

    X_train_norm = (X_train - min_val) / range_val
    if X_test is not None:
        X_test_norm = (X_test - min_val) / range_val
        return X_train_norm, X_test_norm
    return X_train_norm


def apply_normalization(X_train, X_test=None, method="l2"):
    if method == "zscore":
        return zscore_normalization(X_train, X_test)
    elif method == "minmax":
        return minmax_normalization(X_train, X_test)
    elif method == "l2":
        if X_test is not None:
            return L2_normalization(X_train), L2_normalization(X_test)
        return L2_normalization(X_train)
    else:
        if X_test is not None:
            return X_train, X_test
        return X_train

In [57]:
AUG = False

if AUG:
    import os
    import numpy as np
    import matplotlib.pyplot as plt
    from classification.utils.plots import plot_specgram_textlabel

    # Charger les données
    X = np.load(os.path.join(fm_dir, "X_basic_aug.npy"), allow_pickle=True)
    y = np.load(os.path.join(fm_dir, "y_basic_aug.npy"), allow_pickle=True)

    # Dossier où sauvegarder les images
    save_dir = os.path.join("src/classification/soundfiles_melspec_augmentation")
    os.makedirs(save_dir, exist_ok=True)

    # Nombre d'exemples de base (avant augmentation)
    length_X_basic = int(len(X) / len(list_augmentation))

    # Boucle de sauvegarde
    for i in range(length_X_basic):
        for j, aug_name in enumerate(list_augmentation):
            idx = i + j * length_X_basic
            melspec = X[idx]
            class_of_spec = y[idx]

            fig, ax = plt.subplots()
            plot_specgram_textlabel(
                melspec.reshape((20, 20)),
                ax=ax,
                is_mel=True,
                title=f"MEL Spectrogram #{i} - {aug_name}",
                xlabel="Mel vector",
                textlabel=f"{class_of_spec} (aug: {aug_name})",
            )
            plt.tight_layout()
            save_path = os.path.join(save_dir, f"melspec_{i}_{aug_name}.png")
            fig.savefig(save_path)
            plt.close(fig)


FINAL MODEL SAVE

In [58]:

import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

TEST_SET = True

A = True # PCA NOAUG NONORM
B = True # NOPCA NOAUG NONORM
C = True # PCA AUG NONORM
D = True # NOPCA AUG NONORM
E = True # NOPCA NOAUG NONORM
F = True # PCA NOAUG NORM
G = True # PCA AUG NORM
H = True # NOPCA AUG NORM
NORMALIZATION_METHOD = "zscore"  # ou "zscore", "minmax", ou None


# Load datasets
X_basic_aug = np.load(os.path.join(fm_dir, "X_basic_aug.npy"))
y_basic_aug = np.load(os.path.join(fm_dir, "y_basic_aug.npy"), allow_pickle=True)

X_basic = np.load(os.path.join(fm_dir, "X_basic.npy"))
y_basic = np.load(os.path.join(fm_dir, "y_basic.npy"), allow_pickle=True)

# Encode labels
label_encoder = LabelEncoder()
y_basic = label_encoder.fit_transform(y_basic)
y_basic_aug = label_encoder.transform(y_basic_aug)


# Split the dataset into training and testing sets
if TEST_SET:
    X_train, X_test, y_train, y_test = train_test_split(X_basic, y_basic, test_size=0.3, random_state=42)
    X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_basic_aug, y_basic_aug, test_size=0.3, random_state=42)
else:
    X_train = X_basic
    y_train = y_basic
    X_train_aug = X_basic_aug
    y_train_aug = y_basic_aug

# =========================
# SCENARIO A: WITH PCA (no aug)
# =========================
if A:
    pca = PCA(n_components=0.98)
    X_train_pca = pca.fit_transform(X_train)
    if TEST_SET:
        X_test_pca = pca.transform(X_test)

    pca_filename = os.path.join(model_dir, "pca_noaug_nonorm.pickle")
    with open(pca_filename, "wb") as f:
        pickle.dump(pca, f)

    xgb_pca = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1,
                            subsample=0.8, colsample_bytree=0.8, eval_metric='mlogloss', random_state=42)
    xgb_pca.fit(X_train_pca, y_train)

    model_filename = os.path.join(model_dir, "xgb_pca_noaug_nonorm.pickle")
    with open(model_filename, "wb") as f:
        pickle.dump(xgb_pca, f)

# =========================
# SCENARIO B: WITHOUT PCA (no aug)
# =========================
if B:
    xgb_no_pca = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1,
                            subsample=0.8, colsample_bytree=0.8, eval_metric='mlogloss', random_state=42)
    xgb_no_pca.fit(X_train, y_train)

    model_filename = os.path.join(model_dir, "xgb_nopca_noaug_nonorm.pickle")
    with open(model_filename, "wb") as f:
        pickle.dump(xgb_no_pca, f)

# =========================
# SCENARIO C: WITH PCA (aug)
# =========================
if C :
    pca = PCA(n_components=0.98)
    X_train_aug_pca = pca.fit_transform(X_train_aug)
    if TEST_SET:
        X_test_aug_pca = pca.transform(X_test_aug)

    pca_filename = os.path.join(model_dir, "pca_aug_nonorm.pickle")
    with open(pca_filename, "wb") as f:
        pickle.dump(pca, f)

    xgb_model_pca = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                                subsample=subsample, colsample_bytree=colsample_bytree,
                                eval_metric='mlogloss', random_state=42)
    xgb_model_pca.fit(X_train_aug_pca, y_train_aug)

    model_filename = os.path.join(model_dir, "xgb_pca_aug_nonorm.pickle")
    with open(model_filename, "wb") as f:
        pickle.dump(xgb_model_pca, f)

# =========================
# SCENARIO D: WITHOUT PCA (aug)
# =========================
if D :
    xgb_model_no_pca = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                                    subsample=subsample, colsample_bytree=colsample_bytree,
                                    eval_metric='mlogloss', random_state=42)
    xgb_model_no_pca.fit(X_train_aug, y_train_aug)

    model_filename = os.path.join(model_dir, "xgb_nopca_aug_nonorm.pickle")
    with open(model_filename, "wb") as f:
        pickle.dump(xgb_model_no_pca, f)

# =========================
# SCENARIO E: NO DATA TRANSFORMATION (no aug)
# =========================
if E :
    xgb_model_no_transform = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                                        subsample=subsample, colsample_bytree=colsample_bytree,
                                        eval_metric='mlogloss', random_state=42)
    xgb_model_no_transform.fit(X_train, y_train)

    model_filename = os.path.join(model_dir, "xgb_nopca_noaug_nonorm.pickle")
    with open(model_filename, "wb") as f:
        pickle.dump(xgb_model_no_transform, f)

# =========================
# SCENARIO F: NORMALIZATION + PCA (no aug)
# =========================
if F :
    X_train_norm = np.array([x/np.linalg.norm(x) if np.linalg.norm(x) != 0 else x for x in X_train])
    if TEST_SET:
        X_test_norm = np.array([x/np.linalg.norm(x) if np.linalg.norm(x) != 0 else x for x in X_test])

    pca = PCA(n_components=0.98)
    X_train_norm_pca = pca.fit_transform(X_train_norm)
    if TEST_SET:
        X_test_norm_pca = pca.transform(X_test_norm)

    pca_filename = os.path.join(model_dir, "pca_noaug_norm.pickle")
    with open(pca_filename, "wb") as f:
        pickle.dump(pca, f)

    xgb_model_norm_pca = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                                    subsample=subsample, colsample_bytree=colsample_bytree,
                                    eval_metric='mlogloss', random_state=42)
    xgb_model_norm_pca.fit(X_train_norm_pca, y_train)

    model_filename = os.path.join(model_dir, "xgb_pca_noaug_norm.pickle")
    with open(model_filename, "wb") as f:
        pickle.dump(xgb_model_norm_pca, f)

# =========================
# SCENARIO G: NORMALIZATION + AUG + PCA
# =========================
if G : 
    X_train_aug_norm, X_test_aug_norm = apply_normalization(X_train_aug, X_test_aug, method=NORMALIZATION_METHOD)

    pca = PCA(n_components=0.98)
    X_train_aug_norm_pca = pca.fit_transform(X_train_aug_norm)
    if TEST_SET:
        X_test_aug_norm_pca = pca.transform(X_test_aug_norm)

    pca_filename = os.path.join(model_dir, "pca_aug_norm.pickle")
    with open(pca_filename, "wb") as f:
        pickle.dump(pca, f)

    xgb_model_norm_aug_pca = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                                        subsample=subsample, colsample_bytree=colsample_bytree,
                                        eval_metric='mlogloss', random_state=42)
    xgb_model_norm_aug_pca.fit(X_train_aug_norm_pca, y_train_aug)

    model_filename = os.path.join(model_dir, "xgb_pca_aug_norm.pickle")
    with open(model_filename, "wb") as f:
        pickle.dump(xgb_model_norm_aug_pca, f)


# =========================
# SCENARIO H: NORMALIZATION + AUG (no PCA)
# =========================
if H:
    X_train_aug_norm, X_test_aug_norm = apply_normalization(X_train_aug, X_test_aug, method=NORMALIZATION_METHOD)


    xgb_model_aug_norm = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1,
                                       subsample=0.8, colsample_bytree=0.8,
                                       eval_metric='mlogloss', random_state=42)
    xgb_model_aug_norm.fit(X_train_aug_norm, y_train_aug)

    model_filename = os.path.join(model_dir, "xgb_nopca_aug_norm.pickle")
    with open(model_filename, "wb") as f:
        pickle.dump(xgb_model_aug_norm, f)


# =========================
# EVALUATION FUNCTION
# =========================
def evaluate_model(model, X_test, y_test, description):
    predict = model.predict(X_test)

    classes = np.unique(y_test)
    precision_per_class = precision_score(y_test, predict, average=None, labels=classes)
    recall_per_class = recall_score(y_test, predict, average=None, labels=classes)
    test_accuracy_per_class = []
    conf_matrix = confusion_matrix(y_test, predict, labels=classes)

    for i, cls in enumerate(classes):
        acc = conf_matrix[i, i] / conf_matrix[i, :].sum()
        test_accuracy_per_class.append(acc)

    cv_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='accuracy')
    mean_cv_accuracy = np.mean(cv_scores)

    print(f"\n=== {description} ===")
    print(f"Test Accuracy (Overall): {np.mean(predict == y_test):.4f}")
    print(f"Mean CV Accuracy: {mean_cv_accuracy:.4f}")

    print("\nPer-Class Metrics:")
    for i, cls in enumerate(classes):
        print(f"Class {cls}: Precision={precision_per_class[i]:.4f}, Recall={recall_per_class[i]:.4f}, Accuracy={test_accuracy_per_class[i]:.4f}")

# =========================
# EVALUATE ALL MODELS
# =========================
if TEST_SET:
    if A :
        evaluate_model(xgb_pca, X_test_pca, y_test, "Scenario A: PCA NOAUG NONORM")
    if B :
        evaluate_model(xgb_no_pca, X_test, y_test, "Scenario B: NOPCA NOAUG NONORM")
    if C:
        evaluate_model(xgb_model_pca, X_test_aug_pca, y_test_aug, "Scenario C: PCA AUG NONORM")
    if D:
        evaluate_model(xgb_model_no_pca, X_test, y_test, "Scenario D: NOPCA AUG NONORM")
    if E:
        evaluate_model(xgb_model_no_transform, X_test, y_test, "Scenario E: NOPCA NOAUG NONORM")
    if F:
        evaluate_model(xgb_model_norm_pca, X_test_norm_pca, y_test, "Scenario F: PCA NOAUG NORM")
    if G:
        evaluate_model(xgb_model_norm_aug_pca, X_test_aug_norm_pca, y_test_aug, "Scenario G: PCA AUG NORM")
    if H:
        evaluate_model(xgb_model_aug_norm, X_test_aug_norm, y_test_aug, "Scenario H: NOPCA AUG NORM")


=== Scenario A: PCA NOAUG NONORM ===
Test Accuracy (Overall): 0.7089
Mean CV Accuracy: 0.6975

Per-Class Metrics:
Class 0: Precision=0.7778, Recall=0.7368, Accuracy=0.7368
Class 1: Precision=0.7857, Recall=0.7857, Accuracy=0.7857
Class 2: Precision=0.5000, Recall=0.6316, Accuracy=0.6316
Class 3: Precision=0.8889, Recall=0.6154, Accuracy=0.6154

=== Scenario B: NOPCA NOAUG NONORM ===
Test Accuracy (Overall): 0.8608
Mean CV Accuracy: 0.7858

Per-Class Metrics:
Class 0: Precision=0.8421, Recall=0.8421, Accuracy=0.8421
Class 1: Precision=0.8621, Recall=0.8929, Accuracy=0.8929
Class 2: Precision=0.8000, Recall=0.8421, Accuracy=0.8421
Class 3: Precision=1.0000, Recall=0.8462, Accuracy=0.8462

=== Scenario C: PCA AUG NONORM ===
Test Accuracy (Overall): 0.8511
Mean CV Accuracy: 0.7702

Per-Class Metrics:
Class 0: Precision=0.8548, Recall=0.8281, Accuracy=0.8281
Class 1: Precision=0.9178, Recall=0.9178, Accuracy=0.9178
Class 2: Precision=0.7571, Recall=0.8413, Accuracy=0.8413
Class 3: Precisio

HYPERPARAMETER TUNING

In [59]:
# Modified Bayesian Optimization script for XGBoost with consistent normalization

import os
import numpy as np
import pickle
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from bayes_opt import BayesianOptimization

from classification.utils.utils import accuracy

# --- CONFIG FLAGS ---
TRANSFORMATION = True
NORMALIZATION = True
# --- STEP 1: Load/Select Data ---
if TRANSFORMATION:
    try:
        X = X_basic_aug
        y = y_basic_aug
    except NameError:
        raise ValueError("X_basic_aug and y_basic_aug must be defined before running this script.")
else:
    try:
        X = X_basic
        y = y_basic
    except NameError:
        raise ValueError("X_basic and y_basic must be defined before running this script.")

# --- STEP 2: Normalize if needed ---
if NORMALIZATION and NORMALIZATION_METHOD:
    X = apply_normalization(X, method=NORMALIZATION_METHOD)

# --- STEP 3: Define the Objective Function for Bayesian Optimization ---
def xgb_cv(
    n_estimators,
    max_depth,
    learning_rate,
    subsample,
    colsample_bytree
):
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)

    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        eval_metric='mlogloss',
        random_state=42
    )
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    return cv_scores.mean()

# --- STEP 4: Set Up the Bayesian Optimizer ---
pbounds = {
    'n_estimators': (50, 400),
    'max_depth': (2, 15),
    'learning_rate': (0.01, 0.3),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1)
}

optimizer = BayesianOptimization(
    f=xgb_cv,
    pbounds=pbounds,
    random_state=42
)

print("Starting Bayesian Optimization...")
optimizer.maximize(init_points=3, n_iter=20)

# --- STEP 5: Final Training with Best Params ---
best_params = optimizer.max['params']

final_model = XGBClassifier(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    eval_metric='mlogloss',
    random_state=999
)

# === Train/Test Split + re-normalisation ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=999)
if NORMALIZATION and NORMALIZATION_METHOD:
    X_train, X_test = apply_normalization(X_train, X_test, method=NORMALIZATION_METHOD)

final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)
test_acc = accuracy(y_pred, y_test)

print("\n=== FINAL EVALUATION ON HOLDOUT TEST SET ===")
print(f"Test Accuracy: {test_acc:.4f}")


Starting Bayesian Optimization...
|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------


KeyboardInterrupt: 