## CNN Creation and development

In [6]:
# Import standard libraries
import os
import numpy as np
import pandas as pd

# Audio processing libraries
import librosa

# Machine learning libraries
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

# Deep learning libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For saving and loading data
import pickle

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')


ImportError: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

Definir constantes e caminhos (paths)

In [None]:
# Audio preprocessing constants shared by the loading and feature-extraction cells
TARGET_SAMPLE_RATE = 22050  # Sampling rate (Hz) every clip is resampled to on load
TARGET_LENGTH = 4  # Target clip length in seconds (shorter clips are zero-padded, longer ones trimmed)
N_FFT = 2048  # Number of samples in each FFT window used for the mel spectrogram
HOP_LENGTH = 512  # Number of samples between successive spectrogram frames

# Paths to dataset and output files
FOLDS_PATH = 'UrbanSound8K/audio'  # Directory scanned for the per-fold audio sub-directories
DATA_INFO_PATH = 'UrbanSound8K/metadata/UrbanSound8K.csv'  # Metadata CSV read with pandas
OUTPUT_PKL = 'urbansound8k.pkl'  # Pickle cache holding the extracted features, labels and fold names


In [3]:
# Load the dataset metadata CSV into a DataFrame; the feature-extraction cell
# looks up each clip's `classID` by its `slice_file_name` in this table.
data_info = pd.read_csv(DATA_INFO_PATH)
# Bare last expression so the notebook renders the first rows as the cell output.
data_info.head()

NameError: name 'pd' is not defined

Normalizar o áudio para manter a duração padrão de 4 segundos

In [None]:
def load_and_pad_audio(file_path, target_sr, target_length):
    """
    Read an audio file at the requested sample rate and force it to a fixed duration.

    Parameters:
        file_path (str): Path of the audio file handed to ``librosa.load``.
        target_sr (int): Sample rate requested from ``librosa.load``.
        target_length (float): Desired duration in seconds.

    Returns:
        np.ndarray: Signal of ``int(target_length * sr)`` samples. Recordings
        longer than that keep only their beginning; shorter ones are
        right-padded with zeros (silence).
    """
    signal, rate = librosa.load(file_path, sr=target_sr)
    wanted = int(target_length * rate)

    # Positive overflow means the recording is too long: keep only the head.
    overflow = len(signal) - wanted
    if overflow > 0:
        return signal[:wanted]

    # Otherwise append the missing samples as zeros (possibly none at all).
    return np.pad(signal, (0, -overflow), 'constant')


Data augmentation

In [None]:
import random

def augment_audio(y, sr):
    """
    Apply random augmentation techniques to an audio signal.

    Each of the three augmentations (time stretch, pitch shift, additive
    noise) is applied independently with probability 0.5, so the returned
    list holds between zero and three signals.

    Parameters:
        y (np.ndarray): Audio time series (1D).
        sr (int): Sampling rate of y.

    Returns:
        List[np.ndarray]: Augmented signals, each with the same number of
        samples as ``y`` so that features extracted from them share one shape.
    """
    n_samples = len(y)
    augmented_audios = []

    # Time Stretching
    if random.choice([True, False]):
        rate = random.uniform(0.8, 1.2)
        # `rate` must be passed by keyword: it is keyword-only in librosa >= 0.10.
        y_stretch = librosa.effects.time_stretch(y, rate=rate)
        # Stretching changes the sample count; trim or zero-pad back to the
        # input length, otherwise the feature matrices cannot be stacked.
        y_stretch = y_stretch[:n_samples]
        y_stretch = np.pad(y_stretch, (0, n_samples - len(y_stretch)), 'constant')
        augmented_audios.append(y_stretch)

    # Pitch Shifting
    if random.choice([True, False]):
        # Exclude 0 semitones: it would only add a near-duplicate of the input.
        n_steps = random.choice([-2, -1, 1, 2])
        # `sr` and `n_steps` are keyword-only in librosa >= 0.10.
        y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
        augmented_audios.append(y_shift)

    # Adding Noise
    if random.choice([True, False]):
        # Gaussian noise scaled to at most 0.5% of the signal's maximum value.
        noise_amp = 0.005 * np.random.uniform() * np.amax(y)
        y_noise = y + noise_amp * np.random.normal(size=y.shape[0])
        augmented_audios.append(y_noise)

    return augmented_audios


Processamos os ficheiros de áudio e extraímos as features. Iteramos sobre cada ficheiro de áudio, carregamo-lo e extraímos as features para usar mais tarde.

In [None]:
def extract_features(y, sr, n_mfcc=40):
    """
    Extracts multiple audio features from a given audio signal.

    A log-scaled mel spectrogram and MFCCs are computed with the same
    framing (``N_FFT`` / ``HOP_LENGTH``) and stacked along the feature axis.

    Parameters:
        y (np.ndarray): Audio time series.
        sr (int): Sampling rate of y.
        n_mfcc (int): Number of MFCC rows to compute (default 40).

    Returns:
        np.ndarray: 2D array with the mel-spectrogram rows (in dB) on top
        and the MFCC rows below; columns are time frames.
    """
    # Mel Spectrogram, converted to dB relative to its own maximum
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # MFCCs: pass the framing explicitly so the frame count always matches the
    # mel spectrogram above, even if N_FFT / HOP_LENGTH are changed from
    # librosa's defaults (a mismatch would make the vstack below fail).
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=N_FFT, hop_length=HOP_LENGTH)

    # Stack features vertically (along the feature axis)
    features = np.vstack((mel_spec_db, mfcc))

    return features


In [None]:
# Containers filled during extraction: one entry per original or augmented clip.
feature_list = []
label_list = []
fold_list = []

if not os.path.exists(OUTPUT_PKL):
    # Build the file-name -> classID lookup once instead of scanning the metadata
    # frame for every audio file. Keeping the first row per name matches the
    # previous `.values[0]` behaviour.
    class_by_file = (
        data_info.drop_duplicates('slice_file_name')
        .set_index('slice_file_name')['classID']
        .to_dict()
    )

    # Sorted so the sample order (and therefore the cache) is reproducible;
    # os.listdir makes no ordering guarantee.
    folds = sorted(fold for fold in os.listdir(FOLDS_PATH) if "fold" in fold)
    for fold in folds:
        print(f"Processing {fold}...")
        audio_files = librosa.util.find_files(os.path.join(FOLDS_PATH, fold))
        for i, audio in enumerate(audio_files):
            file_name = os.path.basename(audio)
            # Retrieve class ID from metadata
            if file_name not in class_by_file:
                print(f"No matching classID found for {file_name}")
                continue  # Skip this file
            classid = class_by_file[file_name]

            # Load and pad audio. Named `signal` so it does not clash with the
            # label array `y` created after the loop.
            signal = load_and_pad_audio(audio, TARGET_SAMPLE_RATE, TARGET_LENGTH)

            # The original clip first, followed by its augmented variants.
            clips = [signal]
            for aug_y in augment_audio(signal, TARGET_SAMPLE_RATE):
                # Time stretching changes the sample count; force every variant
                # back to the original length so all feature matrices share
                # one shape and can be stacked.
                aug_y = aug_y[:len(signal)]
                aug_y = np.pad(aug_y, (0, len(signal) - len(aug_y)), 'constant')
                clips.append(aug_y)

            for clip in clips:
                feature_list.append(extract_features(clip, TARGET_SAMPLE_RATE))
                label_list.append(classid)
                fold_list.append(fold)

            # Progress logging
            if i % 100 == 0:
                print(f"{i} files processed in {fold}")
        print(f"Finished processing {fold}")
    print("Feature extraction and augmentation completed.")

    # Convert lists to numpy arrays. float32 halves the memory of the float64
    # features produced by the noise augmentation, and the explicit dtype makes
    # numpy raise immediately if the feature shapes are not uniform.
    X = np.asarray(feature_list, dtype=np.float32)
    y = np.array(label_list)
    folds_array = np.array(fold_list)

    # Save the data
    with open(OUTPUT_PKL, 'wb') as f:
        pickle.dump((X, y, folds_array), f)
    print("Processed data saved successfully.")
else:
    # Load processed data from the pickle file.
    # SECURITY: unpickling executes arbitrary code; only load a cache file that
    # this notebook produced itself, never one obtained from an untrusted source.
    with open(OUTPUT_PKL, 'rb') as f:
        X, y, folds_array = pickle.load(f)
    print("Processed data loaded successfully.")


Agora normalizamos as features: convertemos as listas em arrays e aplicamos min-max scaling

In [None]:
# Normalize features to [0, 1].
# This used to be guarded by `if not os.path.exists(OUTPUT_PKL)`, which is never
# true once the extraction cell has written or found the cache, so the scaling
# never ran. It also passed the 3D feature array straight to MinMaxScaler,
# which only accepts 2D input.
if X.ndim != 3:
    raise ValueError(
        f"Expected X with shape (samples, feature_rows, frames), got {X.shape}; "
        "all clips must yield feature matrices of the same shape."
    )

n_samples, n_rows, n_frames = X.shape

# Scale each feature row (mel band / MFCC coefficient) with its own min and max,
# taken over all samples and time frames: move the feature axis last, flatten
# (sample, frame) into rows for the scaler, then restore the original layout.
scaler = MinMaxScaler()
X_scaled = (
    scaler.fit_transform(X.transpose(0, 2, 1).reshape(-1, n_rows))
    .reshape(n_samples, n_frames, n_rows)
    .transpose(0, 2, 1)
)

# NOTE(review): the scaler is fitted on the whole dataset before any
# train/test split, so test statistics influence the scaling.
# TODO: the cells that build the CNN input still read `X`; point them at
# `X_scaled` for this normalization to take effect.


Preparação dos dados para o treino do modelo

In [None]:
# Report the container shape; with uniform features this is
# (num_samples, num_feature_rows, num_frames).
print(f"Feature shape before reshaping: {X.shape}")

# Gather every sample's shape so mismatches are visible before stacking.
feature_shapes = list(map(np.shape, X))
unique_shapes = set(feature_shapes)
print(f"Unique feature shapes: {unique_shapes}")

# Stack the per-sample 2D matrices into one 3D array (np.stack raises if the
# shapes above are not all identical).
X_array = np.stack(X, axis=0)
print(f"Features shape after stacking: {X_array.shape}")  # (num_samples, height, width)

# Append a trailing channel axis for the CNN: (num_samples, height, width, 1).
X_array = X_array[..., np.newaxis]
print(f"Features shape after expanding dimensions: {X_array.shape}")


Aplicamos one-hot encoding para que as classes fiquem num formato adequado ao treino

In [None]:
# One-Hot Encode labels.
# The `sparse=False` argument was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed `sparse_output`). Keeping the default sparse output and
# densifying it with `.toarray()` works on every scikit-learn version.
encoder = OneHotEncoder()
y_encoded = encoder.fit_transform(y.reshape(-1, 1)).toarray()
print(f'Encoded labels shape: {y_encoded.shape}')


Divisão dos dados em conjuntos de treino, validação e teste

In [None]:
# Split by the dataset's predefined folds instead of a random stratified split.
# Slices cut from the same recording, and the augmented copies of a clip, all
# carry the same fold name in `folds_array`; a random shuffle would spread them
# across train and test and inflate the reported accuracy.
fold_names = sorted(np.unique(folds_array))
num_folds = len(fold_names)
if num_folds < 3:
    raise ValueError(
        f"Need at least 3 distinct folds for train/validation/test, found {num_folds}."
    )

# Initialize lists to store results
fold_metrics = []
fold_no = 1

for test_pos, test_fold in enumerate(fold_names):
    # The fold following the test fold (cyclically) is held out for validation,
    # so early stopping is also judged on recordings unseen during training.
    val_fold = fold_names[(test_pos + 1) % num_folds]

    print(f'\nTraining fold {fold_no}...')
    test_index = np.flatnonzero(folds_array == test_fold)
    val_index = np.flatnonzero(folds_array == val_fold)
    train_index = np.flatnonzero((folds_array != test_fold) & (folds_array != val_fold))

    X_train, X_val, X_test = X_array[train_index], X_array[val_index], X_array[test_index]
    y_train, y_val, y_test = y_encoded[train_index], y_encoded[val_index], y_encoded[test_index]

    print(f'Training samples: {X_train.shape[0]}, Validation samples: {X_val.shape[0]}, Testing samples: {X_test.shape[0]}')

    # Proceed to model definition and training
    # ...

    # Break after one fold for demonstration
    fold_no += 1
    break


CNN Model

In [None]:
def create_cnn_model(input_shape, num_classes=10):
    """
    Assemble the CNN classifier: three conv blocks followed by a dense head.

    Parameters:
        input_shape (tuple): Shape of one input sample (height, width, channels).
        num_classes (int): Number of units in the softmax output layer.

    Returns:
        keras.Model: The Sequential model. It is returned uncompiled; the
        caller is expected to call ``compile`` on it.
    """
    model = Sequential()

    # Conv blocks share one layout (Conv -> BatchNorm -> MaxPool -> Dropout)
    # and differ only in filter count; the first one also declares the input shape.
    for block_index, n_filters in enumerate((32, 64, 128)):
        conv_kwargs = {'input_shape': input_shape} if block_index == 0 else {}
        model.add(Conv2D(n_filters, (3, 3), activation='relu', **conv_kwargs))
        model.add(BatchNormalization())
        model.add(MaxPooling2D((2, 2)))
        model.add(Dropout(0.3))

    # Classifier head: flatten, one L2-regularised hidden layer, softmax output.
    weight_penalty = tf.keras.regularizers.l2(0.001)
    model.add(Flatten())
    model.add(Dense(256, activation='relu', kernel_regularizer=weight_penalty))
    model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='softmax'))

    return model

# Create the model
input_shape = X_train.shape[1:]  # (height, width, channels)
# Size the softmax layer from the one-hot label width rather than assuming 10
# classes; a mismatch would make categorical cross-entropy fail at fit time.
model = create_cnn_model(input_shape, num_classes=y_train.shape[1])
model.summary()


Compilação do Modelo

In [None]:
# Configure training: Adam optimizer with learning rate 0.001, categorical
# cross-entropy loss (the labels passed to `fit` are one-hot encoded), and
# accuracy as the reported metric.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


Definir callbacks

In [None]:
# Define callbacks
# Stop training once `val_loss` has not improved for 5 epochs and restore the
# weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Halve the learning rate whenever `val_loss` has not improved for 3 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)


Treinar o modelo

In [None]:
# Train the model for at most 50 epochs in mini-batches of 32, validating on the
# held-out validation split after every epoch. The callbacks may end training
# early or lower the learning rate. `history` keeps the per-epoch metrics that
# the plotting cell reads.
history = model.fit(X_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr],
                    verbose=1)


Avaliar o modelo

In [None]:
# Evaluate the model on the test set; `evaluate` returns the loss followed by
# the metrics given at compile time (accuracy). The accuracy is printed as a
# percentage.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {test_accuracy*100:.2f}%')


Gráficos

In [None]:
# Training curves: accuracy on the left panel, loss on the right, each showing
# the training and validation series recorded in `history`.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

panels = (
    (acc_ax, 'accuracy', 'Model Accuracy', 'Accuracy'),
    (loss_ax, 'loss', 'Model Loss', 'Loss'),
)
for ax, metric, title, ylabel in panels:
    # Keras stores the validation series under the metric name prefixed with 'val_'.
    ax.plot(history.history[metric], label='Train')
    ax.plot(history.history['val_' + metric], label='Validation')
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend()

plt.show()


Classification report e confusion matrix

In [None]:
# Predict on test set and collapse the softmax / one-hot rows to class indices
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Class IDs in the column order used by the one-hot encoder
class_labels = encoder.categories_[0]
# Pass every encoded class index explicitly. Otherwise scikit-learn infers the
# label set from the data: the report raises when a class is absent from this
# split (label count != len(target_names)), and the confusion matrix comes out
# smaller than the tick labels drawn on it.
label_indices = np.arange(len(class_labels))

# Classification report
print('Classification Report')
print(classification_report(y_true, y_pred_classes,
                            labels=label_indices,
                            target_names=class_labels.astype(str)))

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred_classes, labels=label_indices)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
