# Imports

In [None]:
import pandas as pd
import numpy as np
import re
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from itertools import product

# Load the Dataset

In [None]:
# Read the ten feature CSVs (numbered 1-10); later cells use each table as one
# cross-validation fold.
datasets = [
    pd.read_csv(f'datasets/urbansounds_features_{fold_number}.csv')
    for fold_number in range(1, 11)
]

Clean the dataset

In [None]:
def calculate_mean_from_string(string):
    """Return the mean of every number found in a stringified numeric array.

    The cleaning cell applies this to non-numeric feature columns, whose cells
    hold text such as ``"[ 1.5 -2.0\n  3.1e-02]"``.

    Parameters
    ----------
    string : str
        Text containing zero or more numbers. Signed values, bare integers,
        trailing-dot floats (``"1."``) and scientific notation are recognised.
        Non-string values (e.g. a NaN in an object column) are converted with
        ``float()`` instead of being parsed.

    Returns
    -------
    float
        Mean of the parsed numbers, or ``nan`` when no number is present.
    """
    if not isinstance(string, str):
        # Already-numeric cells (including missing values) need no parsing.
        return float(string)

    # One pattern for the whole token: optional sign, mantissa, optional exponent.
    # Newlines are left in place so that numbers on adjacent lines cannot be
    # fused together into a single bogus token.
    numbers = re.findall(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?", string)
    if not numbers:
        # Avoid numpy's "mean of empty slice" warning; the result is still NaN.
        return np.nan

    return np.mean(np.array(numbers, dtype=float))

In [None]:
# Convert every column of every fold table to a numeric dtype, in place.
# The numeric check uses pandas' dtype helper rather than comparing against the
# builtin float/int: those comparisons only match the platform default widths,
# so e.g. float32 or int32 columns would be sent to the string parser and crash.
for df in datasets:
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            # Already numeric (this also makes re-running the cell harmless,
            # since a converted 'Label' column is skipped here).
            continue

        if column == 'Label':
            # The label text is dash-separated; the second token is kept as the
            # integer class id (presumably an UrbanSound8K file name — confirm).
            df[column] = df[column].str.split('-').str[1].astype(int)
        else:
            # Stringified arrays are collapsed to their mean value.
            df[column] = df[column].apply(calculate_mean_from_string)

# Classification

# Using TensorFlow

Finding the best hyperparameters

In [None]:
# Learning rate kept as a number (it was previously the string '0.1'); a
# numeric value is what an optimizer's learning-rate argument expects.
# NOTE(review): no visible cell passes this value to an optimizer — confirm
# whether it should be wired into model.compile() or removed.
learning_rate = 0.1

In [None]:
# 10-way stratified splitter, shuffled with a fixed seed for reproducibility.
stratified_kfold = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# Class labels of every fold table, joined into one flat array.
all_labels = np.concatenate([frame['Label'].values for frame in datasets])

In [None]:
# Search space for the grid search: every combination of the values below is tried.
batch_sizes = [32, 64, 128]
optimizers = ['adam', 'sgd', 'adagrad']
patience_values = [3, 6]

# Evenly spaced floating-point ranges (the upper bound is excluded).
regulizers_value = np.arange(0.0, 0.1, 0.01)   # 0.00, 0.01, ..., 0.09
dropout_values = np.arange(0.0, 1.0, 0.01)     # 0.00, 0.01, ..., 0.99

In [None]:
def _prepare_fold_data(fold):
    """Build the train/validation arrays for the fold whose table is ``datasets[fold]``.

    Training data is every other table concatenated, balanced with SMOTE and
    standardized. Validation data is the held-out table, scaled with the
    training statistics and left un-resampled so that accuracy is measured on
    real samples only.

    Returns
    -------
    tuple
        ``(X_train_scaled, y_resampled, X_val_scaled, y_val)``.
    """
    validation_dataset = datasets[fold]
    training_datasets = [dataset for index, dataset in enumerate(datasets) if index != fold]
    combined_df = pd.concat(training_datasets, ignore_index=True)

    X_train = combined_df.drop('Label', axis=1)
    y_train = combined_df['Label']

    # Oversample minority classes in the training data only.
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Fit the scaler on training data and reuse its statistics for validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_resampled)
    X_val_scaled = scaler.transform(validation_dataset.drop('Label', axis=1))
    y_val = validation_dataset['Label'].to_numpy()

    return X_train_scaled, y_resampled, X_val_scaled, y_val


def _build_mlp_model(num_input_neurons, num_output_neurons, dropout, regulizer, optimizer):
    """Create and compile the two-hidden-layer MLP used by the grid search."""
    # Layer widths are derived from the input/output sizes.
    neurons_hidden_layer = int(2 / 3 * num_input_neurons + 1 / 3 * num_output_neurons)
    mean_neurons = (num_input_neurons + num_output_neurons) // 2

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=neurons_hidden_layer, activation='relu',
                              input_shape=(num_input_neurons,),
                              kernel_regularizer=tf.keras.regularizers.l1_l2(l1=regulizer, l2=regulizer)),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(units=mean_neurons, activation='relu'),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(units=num_output_neurons, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)
    return model


def grid_search_best_parameters(epochs=1):
    """Exhaustively search the hyperparameter grid with leave-one-table-out CV.

    Reads the module-level ``datasets`` list and the grid definitions
    (``dropout_values``, ``regulizers_value``, ``optimizers``,
    ``patience_values``, ``batch_sizes``). For every combination, one model is
    trained per fold and the mean validation accuracy is recorded.

    Parameters
    ----------
    epochs : int, default 1
        Maximum training epochs per model. The default reproduces the previous
        behaviour, where ``fit`` was called without ``epochs`` and therefore
        ran a single epoch.
        NOTE(review): early stopping (and hence ``patience``) can only take
        effect when ``epochs`` is larger than the patience value.

    Returns
    -------
    dict
        Keys ``'dropout'``, ``'patience'``, ``'optimizer'``, ``'regulizer'``,
        ``'batch_size'`` and ``'average_accuracy'`` for the best combination.
        The hyperparameter entries stay ``None`` if no combination scored
        above 0.
    """
    # Best results tracker (same keys as the dict stored on improvement).
    best_result = {
        'dropout': None,
        'patience': None,
        'optimizer': None,
        'regulizer': None,
        'batch_size': None,
        'average_accuracy': 0
    }

    # Fold preparation does not depend on the hyperparameters, so do it once
    # instead of once per combination.
    fold_data = [_prepare_fold_data(fold) for fold in range(len(datasets))]

    # Same iteration order as nested loops: dropout is the outermost dimension.
    hyperparameter_grid = list(product(dropout_values, regulizers_value, optimizers,
                                       patience_values, batch_sizes))

    # The bar advances once per trained model, i.e. per (combination, fold).
    total_iterations = len(hyperparameter_grid) * len(fold_data)

    with tqdm(total=total_iterations, desc="Grid Search Progress") as pbar:
        for dropout, regulizer, optimizer, patience, batch_size in hyperparameter_grid:
            cv_scores = []

            for X_train_scaled, y_resampled, X_val_scaled, y_val in fold_data:
                # Release graph state of earlier models; thousands are built here.
                tf.keras.backend.clear_session()

                model = _build_mlp_model(
                    num_input_neurons=X_train_scaled.shape[1],
                    num_output_neurons=len(np.unique(y_resampled)),
                    dropout=dropout,
                    regulizer=regulizer,
                    optimizer=optimizer,
                )

                early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)

                # Keras logging is silenced so the tqdm bar stays readable.
                model.fit(X_train_scaled, y_resampled, validation_data=(X_val_scaled, y_val),
                          batch_size=batch_size, epochs=epochs, callbacks=[early_stopping],
                          verbose=0)

                # Predicted class = index of the largest softmax output.
                y_val_pred = np.argmax(model.predict(X_val_scaled, verbose=0), axis=1)
                cv_scores.append(accuracy_score(y_val, y_val_pred))

                pbar.update(1)

            overall_average_accuracy = np.mean(cv_scores)

            # Strict '>' keeps the first combination on ties.
            if overall_average_accuracy > best_result['average_accuracy']:
                best_result = {
                    'dropout': dropout,
                    'patience': patience,
                    'optimizer': optimizer,
                    'regulizer': regulizer,
                    'batch_size': batch_size,
                    'average_accuracy': overall_average_accuracy
                }

    return best_result

In [None]:
# Run the grid search, then expose each winning setting under its own
# module-level name; the following cells read these names directly.
best_result = grid_search_best_parameters()
dropout, patience, optimizer, regulizer, batch_size = (
    best_result[setting]
    for setting in ('dropout', 'patience', 'optimizer', 'regulizer', 'batch_size')
)

In [None]:
# Multi-line report of the selected hyperparameters, one field per line.
best_result_strint = "".join((
    f"Best Hyperparameters and Accuracy:\n",
    f"dropout: {dropout}\n",
    f"patience: {patience}\n",
    f"optimizer: {optimizer}\n",
    f"regulizer: {regulizer}\n",
    f"batch_size: {batch_size}\n",
    f"average_accuracy: {best_result['average_accuracy']}\n",
))

print(best_result_strint)

Now let's search for the best number of epochs

In [None]:
def find_best_epochs(epochs_range=(150, 350), accuracy_threshold=0.1):
    """Increase the epoch budget until cross-validated accuracy stops improving.

    Reads the module-level ``datasets`` list and the selected hyperparameters
    (``dropout``, ``regulizer``, ``optimizer``, ``patience``, ``batch_size``).
    For each epoch budget in ``epochs_range`` (inclusive) a model is trained on
    every fold and the mean validation accuracy is compared with the mean
    obtained for the previous budget.

    Parameters
    ----------
    epochs_range : tuple of int, default (150, 350)
        First and last epoch budget to test, both inclusive.
    accuracy_threshold : float, default 0.1
        Minimum gain in mean accuracy over the previous budget required to keep
        searching. Accuracy is a fraction in [0, 1], so the default asks for a
        gain of ten percentage points.
        NOTE(review): that default will usually stop the search at the second
        budget — confirm the intended value.

    Returns
    -------
    int
        When the gain falls below the threshold: the better of the current and
        the previous epoch budget. Otherwise ``epochs_range[1]``.
    """
    # Fold preparation does not depend on the epoch budget, so do it once.
    fold_data = []
    for fold, validation_dataset in enumerate(datasets):
        # Every other table forms the training set for this fold.
        training_datasets = [dataset for index, dataset in enumerate(datasets) if index != fold]
        combined_df = pd.concat(training_datasets, ignore_index=True)

        X_train = combined_df.drop('Label', axis=1)
        y_train = combined_df['Label']

        # Oversample the training data only; validation stays real data.
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

        # Standardize with statistics learned from the training data.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_resampled)
        X_val_scaled = scaler.transform(validation_dataset.drop('Label', axis=1))
        y_val = validation_dataset['Label'].to_numpy()

        fold_data.append((X_train_scaled, y_resampled, X_val_scaled, y_val))

    previous_average_accuracy = None

    for num_epochs in range(epochs_range[0], epochs_range[1] + 1):
        print(f"Testing with {num_epochs} epochs...")

        # Scores are collected per epoch budget, not accumulated across budgets.
        cv_scores = []

        for X_train_scaled, y_resampled, X_val_scaled, y_val in fold_data:
            # Release graph state left behind by previously built models.
            tf.keras.backend.clear_session()

            num_input_neurons = X_train_scaled.shape[1]
            num_output_neurons = len(np.unique(y_resampled))
            mean_neurons = (num_input_neurons + num_output_neurons) // 2
            neurons_hidden_layer = int(2 / 3 * num_input_neurons + 1 / 3 * num_output_neurons)

            # Define and compile the model with the selected hyperparameters
            model = tf.keras.Sequential([
                tf.keras.layers.Dense(units=neurons_hidden_layer, activation='relu',
                                      input_shape=(num_input_neurons,),
                                      kernel_regularizer=tf.keras.regularizers.l1_l2(l1=regulizer, l2=regulizer)),
                tf.keras.layers.Dropout(dropout),
                tf.keras.layers.Dense(units=mean_neurons, activation='relu'),
                tf.keras.layers.Dropout(dropout),
                tf.keras.layers.Dense(units=num_output_neurons, activation='softmax')
            ])
            model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

            early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)

            model.fit(X_train_scaled, y_resampled, validation_data=(X_val_scaled, y_val),
                      batch_size=batch_size, epochs=num_epochs, callbacks=[early_stopping])

            # Predicted class = index of the largest softmax output.
            y_val_pred = np.argmax(model.predict(X_val_scaled), axis=1)
            cv_scores.append(accuracy_score(y_val, y_val_pred))

        overall_average_accuracy = np.mean(cv_scores)

        # The first budget has nothing to be compared against.
        if previous_average_accuracy is not None:
            improvement = overall_average_accuracy - previous_average_accuracy
            if improvement < accuracy_threshold:
                print(f"Stopped testing at {num_epochs} epochs.")
                # Keep whichever of the last two budgets scored higher.
                return num_epochs if improvement > 0 else num_epochs - 1

        previous_average_accuracy = overall_average_accuracy

    print("Maximum number of epochs tested. Consider increasing the range.")
    return epochs_range[1]


In [None]:
# Run the epoch search; `best_epochs` is read by the final training cell below.
best_epochs = find_best_epochs()
print(f"The best number of epochs is: {best_epochs}")

Now let's train the model with the best parameters

In [None]:
# Final evaluation: train one model per fold with the selected hyperparameters
# and `best_epochs`, holding out one table at a time.
cv_scores = []

# Iterate over the tables directly: each one is the validation fold exactly once.
for fold, validation_dataset in enumerate(datasets):
    # Release graph state left behind by the previous fold's model.
    tf.keras.backend.clear_session()

    # Combine the remaining datasets as the training set
    training_datasets = [dataset for index, dataset in enumerate(datasets) if index != fold]
    combined_df = pd.concat(training_datasets, ignore_index=True)

    X_train = combined_df.drop('Label', axis=1)
    y_train = combined_df['Label']

    # Oversample the training data only, so that validation accuracy is
    # measured on real samples rather than synthetic ones.
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Standardize with statistics learned from the training data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_resampled)

    X_val = validation_dataset.drop('Label', axis=1)
    y_val = validation_dataset['Label'].to_numpy()
    X_val_scaled = scaler.transform(X_val)

    # Layer widths are derived from the input/output sizes.
    # (`mean_neurons` is also read by the scikit-learn cell further down.)
    num_input_neurons = X_train_scaled.shape[1]
    num_output_neurons = len(np.unique(y_resampled))
    mean_neurons = (num_input_neurons + num_output_neurons) // 2
    neurons_hidden_layer = int(2 / 3 * num_input_neurons + 1 / 3 * num_output_neurons)

    # Define and compile the model with hyperparameters
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=neurons_hidden_layer, activation='relu',
                              input_shape=(num_input_neurons,),
                              kernel_regularizer=tf.keras.regularizers.l1_l2(l1=regulizer, l2=regulizer)),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(units=mean_neurons, activation='relu'),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(units=num_output_neurons, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)

    # Train the model
    model.fit(X_train_scaled, y_resampled, validation_data=(X_val_scaled, y_val),
              batch_size=batch_size, epochs=best_epochs, callbacks=[early_stopping])

    # Predicted class = index of the largest softmax output
    y_val_pred_probs = model.predict(X_val_scaled)
    y_val_pred = np.argmax(y_val_pred_probs, axis=1)

    # Calculate and store accuracy for this fold
    fold_accuracy = accuracy_score(y_val, y_val_pred)
    cv_scores.append(fold_accuracy)

# Mean accuracy over all folds, reported so the result is visible in the notebook
overall_average_accuracy = np.mean(cv_scores)
print(f"\nOverall Average Accuracy: {overall_average_accuracy:.4f}")

# Using Scikit-Learn

In [None]:
# Leave-one-table-out cross-validation of a scikit-learn MLP.
# Each table in `datasets` is the validation fold once; the other tables are
# concatenated into the training set. Features are every column except
# 'Label', and the targets are the class ids stored in 'Label'.

# Store accuracy scores for each fold
cv_scores = []

for fold, validation_dataset in enumerate(datasets):
    # Split the data into training and validation sets
    training_df = pd.concat(
        [dataset for index, dataset in enumerate(datasets) if index != fold],
        ignore_index=True,
    )
    X_train_combined = training_df.drop('Label', axis=1)
    y_train_combined = training_df['Label']

    X_val = validation_dataset.drop('Label', axis=1)
    y_val = validation_dataset['Label'].to_numpy()

    # Oversample the training data only; validation stays real data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_combined, y_train_combined)

    # Standardize with statistics learned from the training data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_val_scaled = scaler.transform(X_val)

    # Hidden-layer width: mean of the input and output sizes, computed here so
    # the cell does not depend on a variable left over from the TensorFlow section.
    mean_neurons = (X_train_scaled.shape[1] + len(np.unique(y_train_resampled))) // 2

    # A fresh classifier per fold; the fixed seed keeps runs reproducible.
    mlp_classifier = MLPClassifier(hidden_layer_sizes=(mean_neurons,), activation="relu",
                                   max_iter=300, random_state=42)
    mlp_classifier.fit(X_train_scaled, y_train_resampled)

    # predict() already returns class labels, so no argmax is needed
    y_val_pred = mlp_classifier.predict(X_val_scaled)

    # Calculate and store accuracy for this fold
    fold_accuracy = accuracy_score(y_val, y_val_pred)
    cv_scores.append(fold_accuracy)

# Mean accuracy over all folds
overall_average_accuracy = np.mean(cv_scores)
print(f"\nOverall Average Accuracy: {overall_average_accuracy:.4f}")

```
### CNN
# Reshape data for CNN
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1], 1))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1], 1))

# Convert labels to categorical one-hot encoding
y_train_onehot = to_categorical(y_train_encoded)
y_test_onehot = to_categorical(y_test_encoded)
# Define the CNN model with different activation functions for hidden layers
activation_functions = ['tanh', 'relu', 'sigmoid']

for activation1 in activation_functions:
    for activation2 in activation_functions:
        # Define the CNN model
        model = Sequential()
        model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(1, X_train_scaled.shape[1], 1)))
        model.add(MaxPooling2D((2, 2)))
        model.add(Conv2D(64, (3, 3), activation=activation1))
        model.add(MaxPooling2D((2, 2)))
        model.add(Conv2D(64, (3, 3), activation=activation2))
        model.add(Flatten())
        model.add(Dense(64, activation=activation1))
        model.add(Dense(y_train_onehot.shape[1], activation='sigmoid'))  # Sigmoid for the output layer
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train_reshaped, y_train_onehot, epochs=10, validation_split=0.2)
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test_reshaped, y_test_onehot)
print(f'Test 
```