In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np

"Machine learning tools"
import pickle

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier

from classification.datasets import Dataset
from classification.utils.audio_student import AudioUtil, Feature_vector_DS

from classification.utils.plots import (
    plot_decision_boundaries,
    plot_specgram,
    show_confusion_matrix,
)
from classification.utils.utils import accuracy

In [2]:
np.random.seed(0)

In [3]:
### TO RUN
dataset = Dataset()
classnames = dataset.list_classes()

print("\n".join(classnames))

chainsaw
fire
fireworks
gunshot


In [4]:
### TO RUN
fm_dir = "data/feature_matrices/"  # where to save the features matrices
model_dir = "data/models/"  # where to save the models
os.makedirs(fm_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

In [5]:
### TO RUN

"Creation of the dataset"
myds = Feature_vector_DS(dataset, Nft=512, nmel=20, duration=950, shift_pct=0.0)

"Some attributes..."
myds.nmel
myds.duration
myds.shift_pct
myds.sr
myds.data_aug
myds.ncol


idx = 0


In [6]:
### TO RUN
"Random split of 70:30 between training and validation"
train_pct = 0.7

featveclen = len(myds["fire", 0, ""])  # number of items in a feature vector
nitems = len(myds)  # number of sounds in the dataset
naudio = dataset.naudio  # number of audio files in each class
nclass = dataset.nclass  # number of classes
nlearn = round(naudio * train_pct)  # number of sounds among naudio for training

data_aug_factor = 1
class_ids_aug = np.repeat(classnames, naudio * data_aug_factor)

"""
X = np.zeros((data_aug_factor * nclass * naudio, featveclen))
for s in range(data_aug_factor):
    for class_idx, classname in enumerate(classnames):
        for idx in range(naudio):
            featvec = myds[classname, idx, ""]
            X[s * nclass * naudio + class_idx * naudio + idx, :] = featvec
np.save(fm_dir + "X_basic.npy", X)
y = class_ids_aug.copy()
np.save(fm_dir + "y_basic.npy", y)
"""

X = np.load(fm_dir+"X_basic.npy")
y = np.load(fm_dir+"y_basic.npy")

print(f"Shape of the feature matrix : {X.shape}")
print(f"Number of labels : {len(y)}")

Shape of the feature matrix : (200, 400)
Number of labels : 200


We can now create a new augmented dataset and observe if the classification results improve. 

In [7]:
### AUGMENTED DATASET
list_augmentation = ["original", "noise", "echo", "shifting"]
myds.mod_data_aug(list_augmentation)
print("Number of transformations : ", myds.data_aug_factor)
y_aug = np.repeat(classnames, dataset.naudio * myds.data_aug_factor)
X_aug = np.zeros((myds.data_aug_factor * nclass * naudio, featveclen))

for s in range(len(list_augmentation)):
    aug = list_augmentation[s]
    for idx in range(dataset.naudio):
        for class_idx, classname in enumerate(classnames):
            featvec = myds[classname, idx, aug]
            X_aug[s * nclass * naudio + class_idx * naudio + idx, :] = featvec
            y_aug[s * nclass * naudio + class_idx * naudio + idx] = classname


print(f"Shape of the feature matrix : {X_aug.shape}")
print(f"------------------------------------------------------------")
print(f"200 of each transformation. Order : chainsaw1, fire1, fireworks1, gun1, chainsaw2, fire2, ...")


Number of transformations :  4
Shape of the feature matrix : (640, 400)
------------------------------------------------------------
200 of each transformation. Order : chainsaw1, fire1, fireworks1, gun1, chainsaw2, fire2, ...


FINAL MODEL SAVE

In [10]:
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Directories
fm_dir = "data/feature_matrices/"
model_dir = "data/models/"
os.makedirs(fm_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

# Load dataset
X = X_aug  # Ensure this is defined in your script
y = y_aug

# Shuffle then split the dataset into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Apply PCA separately
pca = PCA(n_components=0.98)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)  # Apply the same transformation to test data
print(f"Number of principal features kept: {pca.n_components_}")

# Save the PCA model separately
pca_filename = os.path.join(model_dir, "pca.pickle")
with open(pca_filename, "wb") as pca_file:
    pickle.dump(pca, pca_file)
print(f"\nPCA model saved as {pca_filename}")

# Train the Random Forest model on PCA-reduced features
rf_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

rf_model.fit(X_train_pca, y_train)

# Evaluate model
predict = rf_model.predict(X_test_pca)

# Compute metrics for each class
classes = np.unique(y_test)
precision_per_class = precision_score(y_test, predict, average=None, labels=classes)
recall_per_class = recall_score(y_test, predict, average=None, labels=classes)

test_accuracy_per_class = []
conf_matrix = confusion_matrix(y_test, predict, labels=classes)

for i, cls in enumerate(classes):
    class_accuracy = conf_matrix[i, i] / conf_matrix[i, :].sum()
    test_accuracy_per_class.append(class_accuracy)

# Cross-validation accuracy
cv_scores = cross_val_score(rf_model, X_train_pca, y_train, cv=10, scoring='accuracy')
mean_cv_accuracy = np.mean(cv_scores)

# Print results
print("\nOverall Metrics:")
print(f"Test Accuracy (Overall): {np.mean(predict == y_test):.4f}")
print(f"Mean CV Accuracy: {mean_cv_accuracy:.4f}")

print("\nPer-Class Metrics:")
for i, cls in enumerate(classes):
    print(f"Class {cls}:")
    print(f"  Precision: {precision_per_class[i]:.4f}")
    print(f"  Recall: {recall_per_class[i]:.4f}")
    print(f"  Accuracy: {test_accuracy_per_class[i]:.4f}")

# Save the trained Random Forest model separately
rf_filename = os.path.join(model_dir, "model.pickle")
with open(rf_filename, "wb") as rf_file:
    pickle.dump(rf_model, rf_file)

print(f"\nRandom Forest model saved as {rf_filename}")

Number of principal features kept: 121

PCA model saved as data/models/pca.pickle

Overall Metrics:
Test Accuracy (Overall): 0.7917
Mean CV Accuracy: 0.7896

Per-Class Metrics:
Class chainsaw:
  Precision: 0.8605
  Recall: 0.7708
  Accuracy: 0.7708
Class fire:
  Precision: 0.6610
  Recall: 0.8125
  Accuracy: 0.8125
Class fireworks:
  Precision: 0.8780
  Recall: 0.7500
  Accuracy: 0.7500
Class gunshot:
  Precision: 0.8163
  Recall: 0.8333
  Accuracy: 0.8333

Random Forest model saved as data/models/model.pickle


MEAN ACCURACY ON 100 ITERATIONS

In [9]:
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from classification.utils.utils import accuracy

# Directories for saving models
model_dir = "data/models/"
os.makedirs(model_dir, exist_ok=True)

# Ensure dataset (X_aug, y_aug) exists
try:
    X = X_aug
    y = y_aug
except NameError:
    raise ValueError("X_aug and y_aug must be defined before running this script.")

# Number of iterations
num_iterations = 20

# Lists to store scores
accuracy_scores = []
cv_accuracy_scores = []

for i in range(num_iterations):
    print(f"\nIteration {i + 1}/{num_iterations}")

    # Split the dataset into training and testing subsets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=i  # Different splits per iteration
    )

    # Train the Random Forest model
    model = RandomForestClassifier(
        n_estimators=400,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=i  # Different initialization per iteration
    )
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Compute overall accuracy
    test_accuracy = accuracy(y_pred, y_test)
    accuracy_scores.append(test_accuracy)

    # Perform cross-validation on the training set
    cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    mean_cv_accuracy = np.mean(cv_scores)
    cv_accuracy_scores.append(mean_cv_accuracy)

    print(f"Test Accuracy: {test_accuracy:.4f} | Mean CV Accuracy: {mean_cv_accuracy:.4f}")

# Compute overall statistics
mean_test_accuracy = np.mean(accuracy_scores)
std_test_accuracy = np.std(accuracy_scores)

mean_cv_accuracy = np.mean(cv_accuracy_scores)
std_cv_accuracy = np.std(cv_accuracy_scores)

# Print final results
print("\n=== FINAL RESULTS AFTER 100 ITERATIONS ===")
print(f"Mean Test Accuracy: {mean_test_accuracy:.4f} ± {std_test_accuracy:.4f}")
print(f"Mean Cross-Validation Accuracy: {mean_cv_accuracy:.4f} ± {std_cv_accuracy:.4f}")



Iteration 1/20
Test Accuracy: 0.7552 | Mean CV Accuracy: 0.7881

Iteration 2/20


KeyboardInterrupt: 