In [9]:
import os

import matplotlib.pyplot as plt
import numpy as np

"Machine learning tools"
import pickle

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier

from classification.datasets import Dataset
from classification.utils.audio_student import AudioUtil, Feature_vector_DS

from classification.utils.plots import (
    plot_decision_boundaries,
    plot_specgram,
    show_confusion_matrix,
)
from classification.utils.utils import accuracy

In [10]:
np.random.seed(0)

In [11]:
### TO RUN
dataset = Dataset()
classnames = dataset.list_classes()

print("\n".join(classnames))

chainsaw
fire
fireworks
gunshot


In [12]:
### TO RUN
fm_dir = "data/feature_matrices/"  # where to save the features matrices
new_dataset_dir = "src/classification/datasets/new_dataset/melvecs/"
model_dir = "data/models/random_forest"  # where to save the models
os.makedirs(fm_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

In [13]:
### TO RUN

"Creation of the dataset"
myds = Feature_vector_DS(dataset, Nft=512, nmel=20, duration=950, shift_pct=0.0)

"Some attributes..."
myds.nmel
myds.duration
myds.shift_pct
myds.sr
myds.data_aug
myds.ncol


idx = 0


In [14]:
# TRANSFORMATION ON FEATURE VECTOR

def add_noise(feature_vector, snr_db=20):
    """Adds white noise to a feature vector based on the given SNR (Signal-to-Noise Ratio)."""
    power_signal = np.mean(feature_vector ** 2)
    power_noise = power_signal / (10 ** (snr_db / 10))
    noise = np.random.normal(0, np.sqrt(power_noise), feature_vector.shape)
    return feature_vector + noise

def shifting(feature_vector, shift_max=20):
    """Shifts mel spectrogram feature vectors along the time axis by a random shift between 0 and shift_max."""
    shift = np.random.randint(0, shift_max)
    return np.roll(feature_vector, shift, axis=0)  # Rolling along the first axis

In [15]:

import numpy as np

train_pct = 0.7
data_aug_factor = 1
featveclen = len(myds["fire", 0, "", ""])  # Same for all classes
classnames = ["chainsaw", "fire", "fireworks", "gunshot"]  # Or wherever you store class names
nclass = len(classnames)

# Determine number of samples per class
naudio_per_class = {"chainsaw" : 76, "fire" : 76, "fireworks" : 76, "gunshot" : 40}


# Allocate feature matrix
total_samples_basic = sum(naudio_per_class[c] for c in classnames)
X_basic = np.zeros((total_samples_basic, featveclen))
y_basic = np.zeros((total_samples_basic), dtype=object)
total_samples_basic
# Fill feature matrix
idx = 0
for class_idx, classname in enumerate(classnames):
    for i in range(naudio_per_class[classname]):
        featvec = myds[classname, i, "", ""]
        X_basic[idx, :] = featvec
        y_basic[idx] = classname
        idx += 1

# Save features and labels
np.save(fm_dir + "X_basic.npy", X_basic)
np.save(fm_dir + "y_basic.npy", y_basic)

print(f"Shape of the basic feature matrix : {X_basic.shape}")
print(f"Number of labels : {y_basic.shape}")


Shape of the basic feature matrix : (268, 400)
Number of labels : (268,)


We can now create a new augmented dataset and observe if the classification results improve. 

In [16]:

### AUGMENTED DATASET
list_augmentation = ["original", "noise", "shifting"]
myds.mod_data_aug(list_augmentation)
print("Number of transformations : ", myds.data_aug_factor)


# Calcul total des échantillons
total_aug_samples = sum(naudio_per_class[c] for c in classnames) * len(list_augmentation)
X_basic_aug = np.zeros((total_aug_samples, featveclen))
y_basic_aug = np.zeros((total_aug_samples), dtype=object)

# Remplissage des features
idx = 0
for aug in list_augmentation:
    for classname in classnames:
        for i in range(naudio_per_class[classname]):
            featvec = myds[classname, i, aug, ""]
            X_basic_aug[idx, :] = featvec
            y_basic_aug[idx] = classname
            idx += 1

# Sauvegarde
np.save(fm_dir + "X_basic_aug.npy", X_basic_aug)
np.save(fm_dir + "y_basic_aug.npy", y_basic_aug)

print(f"Shape of the feature matrix : {X_basic_aug.shape}")
print(f"Number of labels : {y_basic_aug.shape}")
print(f"------------------------------------------------------------")
print(f"Transformations: {list_augmentation}. Labels aligned dynamically with class sizes.")


Number of transformations :  3
Shape of the feature matrix : (804, 400)
Number of labels : (804,)
------------------------------------------------------------
Transformations: ['original', 'noise', 'shifting']. Labels aligned dynamically with class sizes.


FINAL MODEL SAVE

In [18]:
import os
import numpy as np
import pickle

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# =========================
# 1) HYPERPARAMS & PATHS
# =========================
# Define or ensure these variables exist:
# fm_dir = "/path/to/features/"
# model_dir = "/path/to/save/models/"

n_estimators = 400
max_depth = 20
min_samples_split = 5
min_samples_leaf = 2
random_state = 42

# =========================
# 2) LOAD DATA
# =========================
X_basic_aug = np.load(os.path.join(fm_dir, "X_basic_aug.npy"))
y_basic_aug = np.load(os.path.join(fm_dir, "y_basic_aug.npy"), allow_pickle=True)

X_basic = np.load(os.path.join(fm_dir, "X_basic.npy"))
y_basic = np.load(os.path.join(fm_dir, "y_basic.npy"), allow_pickle=True)

# =========================
# 3) SPLIT DATASETS
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X_basic, y_basic, 
    test_size=0.3, 
    stratify=y_basic, 
    random_state=random_state
)

X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(
    X_basic_aug, y_basic_aug, 
    test_size=0.3, 
    stratify=y_basic_aug, 
    random_state=random_state
)

# ==============================================
# SCENARIO A: PCA + NO AUG + NO NORMALIZATION
# ==============================================
pca = PCA(n_components=0.99)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

pca_path = os.path.join(model_dir, "pca_noaug_nonorm.pickle")
with open(pca_path, "wb") as f:
    pickle.dump(pca, f)

rf_pca_noaug_nonorm = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state
)
rf_pca_noaug_nonorm.fit(X_train_pca, y_train)

model_path = os.path.join(model_dir, "rf_pca_noaug_nonorm.pickle")
with open(model_path, "wb") as f:
    pickle.dump(rf_pca_noaug_nonorm, f)

# ===========================================
# SCENARIO B: NO PCA + NO AUG + NO NORMALIZATION
# ===========================================
rf_nopca_noaug_nonorm = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state
)
rf_nopca_noaug_nonorm.fit(X_train, y_train)

model_path = os.path.join(model_dir, "rf_nopca_noaug_nonorm.pickle")
with open(model_path, "wb") as f:
    pickle.dump(rf_nopca_noaug_nonorm, f)

# ===========================================
# SCENARIO C: PCA + AUG + NO NORMALIZATION
# ===========================================
pca_aug = PCA(n_components=0.99)
X_train_aug_pca = pca_aug.fit_transform(X_train_aug)
X_test_aug_pca = pca_aug.transform(X_test_aug)

pca_path = os.path.join(model_dir, "pca_aug_nonorm.pickle")
with open(pca_path, "wb") as f:
    pickle.dump(pca_aug, f)

rf_pca_aug_nonorm = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state
)
rf_pca_aug_nonorm.fit(X_train_aug_pca, y_train_aug)

model_path = os.path.join(model_dir, "rf_pca_aug_nonorm.pickle")
with open(model_path, "wb") as f:
    pickle.dump(rf_pca_aug_nonorm, f)

# ===========================================
# SCENARIO D: NO PCA + AUG + NO NORMALIZATION
# ===========================================
rf_nopca_aug_nonorm = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state
)
rf_nopca_aug_nonorm.fit(X_train_aug, y_train_aug)

model_path = os.path.join(model_dir, "rf_nopca_aug_nonorm.pickle")
with open(model_path, "wb") as f:
    pickle.dump(rf_nopca_aug_nonorm, f)

# ===========================================
# SCENARIO E: NO DATA TRANSFORMATION (NO AUG) 
#             - effectively the same as B
# ===========================================
# If you want a separate reference model for "no data transformation," 
# we simply reuse X_train, y_train:
rf_no_transform = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state
)
rf_no_transform.fit(X_train, y_train)

model_path = os.path.join(model_dir, "rf_nopca_noaug_nonorm.pickle")
with open(model_path, "wb") as f:
    pickle.dump(rf_no_transform, f)

# ===========================================
# SCENARIO F: NORMALIZATION + PCA (NO AUG)
# ===========================================
X_train_norm = np.array([
    x / np.linalg.norm(x) if np.linalg.norm(x) != 0 else x
    for x in X_train
])
X_test_norm = np.array([
    x / np.linalg.norm(x) if np.linalg.norm(x) != 0 else x
    for x in X_test
])

pca_norm = PCA(n_components=0.99)
X_train_norm_pca = pca_norm.fit_transform(X_train_norm)
X_test_norm_pca = pca_norm.transform(X_test_norm)

pca_path = os.path.join(model_dir, "pca_noaug_norm.pickle")
with open(pca_path, "wb") as f:
    pickle.dump(pca_norm, f)

rf_pca_noaug_norm = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state
)
rf_pca_noaug_norm.fit(X_train_norm_pca, y_train)

model_path = os.path.join(model_dir, "rf_pca_noaug_norm.pickle")
with open(model_path, "wb") as f:
    pickle.dump(rf_pca_noaug_norm, f)

# ===========================================
# SCENARIO G: NORMALIZATION + PCA (AUG)
# ===========================================
X_train_aug_norm = np.array([
    x / np.linalg.norm(x) if np.linalg.norm(x) != 0 else x
    for x in X_train_aug
])
X_test_aug_norm = np.array([
    x / np.linalg.norm(x) if np.linalg.norm(x) != 0 else x
    for x in X_test_aug
])

pca_aug_norm = PCA(n_components=0.99)
X_train_aug_norm_pca = pca_aug_norm.fit_transform(X_train_aug_norm)
X_test_aug_norm_pca = pca_aug_norm.transform(X_test_aug_norm)

pca_path = os.path.join(model_dir, "pca_aug_norm.pickle")
with open(pca_path, "wb") as f:
    pickle.dump(pca_aug_norm, f)

rf_pca_aug_norm = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state
)
rf_pca_aug_norm.fit(X_train_aug_norm_pca, y_train_aug)

model_path = os.path.join(model_dir, "rf_pca_aug_norm.pickle")
with open(model_path, "wb") as f:
    pickle.dump(rf_pca_aug_norm, f)

# ===========================================
# EVALUATION FUNCTION
# ===========================================
def evaluate_model(model, X_test, y_test, description):
    predictions = model.predict(X_test)
    classes = np.unique(y_test)

    # Overall accuracy
    overall_accuracy = np.mean(predictions == y_test)

    # Per-class precision & recall
    precision_per_class = precision_score(y_test, predictions, average=None, labels=classes)
    recall_per_class = recall_score(y_test, predictions, average=None, labels=classes)

    # Confusion Matrix -> per-class accuracy
    conf_matrix = confusion_matrix(y_test, predictions, labels=classes)
    accuracy_per_class = []
    for i, cls in enumerate(classes):
        acc_cls = conf_matrix[i, i] / conf_matrix[i, :].sum()
        accuracy_per_class.append(acc_cls)

    # Cross-validation (on the test set, typically not done, but for demonstration):
    cv_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='accuracy')
    mean_cv_accuracy = np.mean(cv_scores)

    # Print results
    print(f"\n=== {description} ===")
    print(f"Overall Test Accuracy: {overall_accuracy:.4f}")
    print(f"Mean CV Accuracy (5-fold on test): {mean_cv_accuracy:.4f}")
    print("Per-Class Metrics:")
    for i, cls in enumerate(classes):
        print(f"  Class {cls}: Precision={precision_per_class[i]:.4f}, "
              f"Recall={recall_per_class[i]:.4f}, Accuracy={accuracy_per_class[i]:.4f}")

# ===========================================
#  EVALUATE ALL MODELS
# ===========================================
evaluate_model(rf_pca_noaug_nonorm,  X_test_pca,      y_test,     "Scenario A: PCA NOAUG NONORM")
evaluate_model(rf_nopca_noaug_nonorm, X_test,         y_test,     "Scenario B: NOPCA NOAUG NONORM")
evaluate_model(rf_pca_aug_nonorm,     X_test_aug_pca, y_test_aug, "Scenario C: PCA AUG NONORM")
evaluate_model(rf_nopca_aug_nonorm,   X_test_aug,     y_test_aug, "Scenario D: NOPCA AUG NONORM")
evaluate_model(rf_no_transform,       X_test,         y_test,     "Scenario E: NOPCA NOAUG NONORM")
evaluate_model(rf_pca_noaug_norm,     X_test_norm_pca, y_test,    "Scenario F: PCA NOAUG NORM")
evaluate_model(rf_pca_aug_norm,       X_test_aug_norm_pca, y_test_aug, "Scenario G: PCA AUG NORM")



=== Scenario A: PCA NOAUG NONORM ===
Overall Test Accuracy: 0.6296
Mean CV Accuracy (5-fold on test): 0.6691
Per-Class Metrics:
  Class chainsaw: Precision=0.5143, Recall=0.7826, Accuracy=0.7826
  Class fire: Precision=0.7826, Recall=0.7826, Accuracy=0.7826
  Class fireworks: Precision=0.6000, Recall=0.5217, Accuracy=0.5217
  Class gunshot: Precision=1.0000, Recall=0.2500, Accuracy=0.2500

=== Scenario B: NOPCA NOAUG NONORM ===
Overall Test Accuracy: 0.7654
Mean CV Accuracy (5-fold on test): 0.7551
Per-Class Metrics:
  Class chainsaw: Precision=0.5938, Recall=0.8261, Accuracy=0.8261
  Class fire: Precision=0.8571, Recall=0.7826, Accuracy=0.7826
  Class fireworks: Precision=0.9375, Recall=0.6522, Accuracy=0.6522
  Class gunshot: Precision=0.8333, Recall=0.8333, Accuracy=0.8333

=== Scenario C: PCA AUG NONORM ===
Overall Test Accuracy: 0.8554
Mean CV Accuracy (5-fold on test): 0.7188
Per-Class Metrics:
  Class chainsaw: Precision=0.7532, Recall=0.8529, Accuracy=0.8529
  Class fire: Prec

MEAN ACCURACY ON 20 ITERATIONS

In [None]:
"""
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from classification.utils.utils import accuracy

NORMALIZATION = True
TRANSFORMATION = True

# Ensure dataset (X_aug, y_aug) exists

if TRANSFORMATION:
    try:
        X = X_basic_aug
        y = y_basic_aug
    except NameError:
        raise ValueError("X_aug and y_aug must be defined before running this script.")
else:
    try:
        X = X_basic
        y = y_basic
    except NameError:
        raise ValueError("X and y must be defined before running this script.")
if NORMALIZATION:
    X = np.array([x/np.linalg.norm(x) if np.linalg.norm(x) != 0 else x for x in X])
    


# Number of iterations
num_iterations = 20

# Lists to store scores
accuracy_scores = []
cv_accuracy_scores = []

for i in range(num_iterations):
    print(f"\nIteration {i + 1}/{num_iterations}")

    # Split the dataset into training and testing subsets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=i  # Different splits per iteration
    )

    # Train the Random Forest model
    model = RandomForestClassifier(
        n_estimators=400,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=i  # Different initialization per iteration
    )
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Compute overall accuracy
    test_accuracy = accuracy(y_pred, y_test)
    accuracy_scores.append(test_accuracy)

    # Perform cross-validation on the training set
    cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    mean_cv_accuracy = np.mean(cv_scores)
    cv_accuracy_scores.append(mean_cv_accuracy)

    print(f"Test Accuracy: {test_accuracy:.4f} | Mean CV Accuracy: {mean_cv_accuracy:.4f}")

# Compute overall statistics
mean_test_accuracy = np.mean(accuracy_scores)
std_test_accuracy = np.std(accuracy_scores)

mean_cv_accuracy = np.mean(cv_accuracy_scores)
std_cv_accuracy = np.std(cv_accuracy_scores)

# Print final results
print("\n=== FINAL RESULTS AFTER 20 ITERATIONS ===")
print(f"Mean Test Accuracy: {mean_test_accuracy:.4f} ± {std_test_accuracy:.4f}")
print(f"Mean Cross-Validation Accuracy: {mean_cv_accuracy:.4f} ± {std_cv_accuracy:.4f}")

"""

'\nimport os\nimport numpy as np\nimport pickle\nimport matplotlib.pyplot as plt\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split, cross_val_score\nfrom sklearn.metrics import confusion_matrix, precision_score, recall_score\nfrom classification.utils.utils import accuracy\n\nNORMALIZATION = True\nTRANSFORMATION = True\n\n# Ensure dataset (X_aug, y_aug) exists\n\nif TRANSFORMATION:\n    try:\n        X = X_basic_aug\n        y = y_basic_aug\n    except NameError:\n        raise ValueError("X_aug and y_aug must be defined before running this script.")\nelse:\n    try:\n        X = X_basic\n        y = y_basic\n    except NameError:\n        raise ValueError("X and y must be defined before running this script.")\nif NORMALIZATION:\n    X = np.array([x/np.linalg.norm(x) if np.linalg.norm(x) != 0 else x for x in X])\n    \n\n\n# Number of iterations\nnum_iterations = 20\n\n# Lists to store scores\naccuracy_scores = []\ncv_accuracy_sc