# Imports

In [1]:
import re
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Load the Dataset

In [2]:
# Build the ten feature-file paths (numbered 1 through 10), then read each
# CSV into its own DataFrame; list position = file number - 1.
feature_csv_paths = [f'datasets/urbansounds_features_{i}.csv' for i in range(1, 11)]
datasets = [pd.read_csv(csv_path) for csv_path in feature_csv_paths]

# Clean the Dataset

In [3]:
def calculate_mean_from_string(string):
    """Return the arithmetic mean of all numbers found in ``string``.

    Used to collapse a text cell that holds several numbers (presumably the
    printed form of a NumPy array, e.g. ``'[-1.  2.5e-01  3.]'`` -- confirm
    against the CSV files) into a single float.

    Parameters
    ----------
    string : str
        Text containing zero or more decimal numbers. Line breaks are allowed.

    Returns
    -------
    float
        Mean of the parsed numbers, or ``nan`` when no number is present.
    """
    # The sign and the exponent must be part of each match: otherwise "-5" and
    # "-1." lose their sign and "1.5e-05" is read as the two numbers 1.5 and 5.
    # Accepted mantissas: "12", "12.", "12.5" and ".5".
    number_pattern = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

    # Newlines are deliberately left in place; deleting them could fuse the
    # last number of one line with the first number of the next.
    numbers = re.findall(number_pattern, string)
    if not numbers:
        # np.mean([]) would also give nan, but emits a RuntimeWarning.
        return float('nan')

    return np.mean(np.array(numbers, dtype=float))

In [4]:
# Normalise every DataFrame in `datasets` in place:
#   * non-numeric feature columns are reduced to the mean of the numbers
#     contained in each text cell;
#   * 'Label' is converted from '-'-separated text to an integer id.
for df in datasets:
    for column in df.columns:
        # Dtype-family check instead of `dtype != float and dtype != int`,
        # which misses float32/int32 and platform-dependent integer widths.
        column_is_numeric = pd.api.types.is_numeric_dtype(df[column])
        if column != 'Label':
            if not column_is_numeric:
                df[column] = df[column].apply(calculate_mean_from_string)
        elif not column_is_numeric:
            # Keep the second '-'-separated field as the integer label.
            # Skipping labels that are already numeric makes this cell safe
            # to re-run without restarting the kernel.
            df[column] = df[column].str.split('-').str[1].astype(int)

# Classification

In [5]:
def heatmap(test, pred):
    """Draw the confusion matrix of ``pred`` against ``test`` as a heatmap.

    Parameters
    ----------
    test : array-like
        True labels, passed as the first argument to ``confusion_matrix``
        (y-axis, labelled 'Actual').
    pred : array-like
        Predicted labels, passed as the second argument (x-axis, labelled
        'Predicted').

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # `confusion_matrix` comes from sklearn.metrics (imported in the imports cell).
    cm = confusion_matrix(test, pred)

    # Explicit axes so the labels are attached to this figure rather than to
    # whatever pyplot currently considers the active one.
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# Using TensorFlow

In [6]:
# Seeded, shuffled 10-way stratified splitter
stratified_kfold = StratifiedKFold(shuffle=True, random_state=42, n_splits=10)

# One flat array holding the 'Label' values of every dataset, in list order
all_labels = np.concatenate([fold_df['Label'].values for fold_df in datasets])

In [7]:
def oversample_features(X, y):
    """Oversample ``(X, y)`` with SMOTE.

    A new ``SMOTE(random_state=42)`` sampler is created on every call.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.
    """
    balanced_X, balanced_y = SMOTE(random_state=42).fit_resample(X, y)
    return balanced_X, balanced_y


def standardize_features(X):
    """Return ``X`` transformed by a ``StandardScaler`` fitted on ``X`` itself.

    The scaler lives only inside this call, so its fitted statistics are not
    available for transforming any other dataset.
    """
    return StandardScaler().fit_transform(X)

In [8]:
def preprocess_data(dataset):
    """Split ``dataset`` into features and labels, oversample, then scale.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'Label' column; every other column is used as a feature.

    Returns
    -------
    tuple
        ``(X_scaled, y_resampled)``: the standardized, oversampled features
        and the matching oversampled labels.
    """
    labels = dataset['Label']
    features = dataset.drop('Label', axis=1)

    # Order matters: resample first, then standardize the resampled features.
    balanced_features, balanced_labels = oversample_features(features, labels)
    return standardize_features(balanced_features), balanced_labels

In [9]:
def prepare_datasets(fold):
    """Build the training and validation arrays for one cross-validation fold.

    ``datasets[fold]`` is held out for validation; every other entry of the
    module-level ``datasets`` list is concatenated to form the training set.

    Parameters
    ----------
    fold : int
        Index into ``datasets`` of the held-out dataset.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)``. The training split is
        SMOTE-oversampled and standardized. The validation split keeps its
        original samples and is standardized with the training statistics.
    """
    # Prepare training dataset
    training_datasets = [dataset for index, dataset in enumerate(datasets) if index != fold]
    combined_df = pd.concat(training_datasets, ignore_index=True)
    validation_dataset = datasets[fold]

    # Select features by name so both splits use the same column order.
    feature_columns = combined_df.columns.drop('Label')

    # Oversample the training split only. Synthetic samples in the held-out
    # fold would make the reported accuracy describe data that never existed.
    X_train_resampled, y_train = oversample_features(combined_df[feature_columns],
                                                     combined_df['Label'])

    # Fit the scaler on training data and reuse it for validation. Fitting a
    # second scaler on the validation fold would put the two splits on
    # different scales and leak validation statistics into preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_resampled)
    X_val = scaler.transform(validation_dataset[feature_columns])
    y_val = validation_dataset['Label']

    return X_train, y_train, X_val, y_val

In [10]:
def define_model(X_train, y_train):
    """Build and compile a two-hidden-layer dense classifier.

    Layer widths are derived from the data: the input width is
    ``X_train.shape[1]`` and the output width is the number of distinct values
    in ``y_train``. The dropout rate, regularisation strength and optimizer
    are read from the module-level names ``dropout``, ``regulizer`` and
    ``optimizer``.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with the sparse categorical cross-entropy loss.
    """
    n_inputs = X_train.shape[1]
    n_classes = len(np.unique(y_train))

    # First hidden layer: 2/3 of the inputs plus 1/3 of the outputs (truncated).
    first_hidden_units = int(2 / 3 * n_inputs + 1 / 3 * n_classes)
    # Second hidden layer: integer midpoint between input and output widths.
    second_hidden_units = (n_inputs + n_classes) // 2

    network = tf.keras.Sequential()
    network.add(tf.keras.layers.Dense(
        units=first_hidden_units,
        activation='relu',
        input_shape=(n_inputs,),
        # The same strength is applied to both the L1 and the L2 penalty.
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=regulizer, l2=regulizer),
    ))
    network.add(tf.keras.layers.Dropout(dropout))
    network.add(tf.keras.layers.Dense(units=second_hidden_units, activation='relu'))
    network.add(tf.keras.layers.Dropout(dropout))
    network.add(tf.keras.layers.Dense(units=n_classes, activation='softmax'))

    network.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)
    return network

In [11]:
def build_and_train_model(X_train, y_train, X_val, y_val, num_epochs):
    """Create a model with ``define_model`` and fit it with early stopping.

    Training runs for at most ``num_epochs`` epochs and stops once
    ``val_loss`` has not improved for ``patience`` epochs. ``patience`` and
    ``batch_size`` are read from module-level names.

    Returns
    -------
    The fitted model.
    """
    stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)

    network = define_model(X_train, y_train)
    network.fit(
        X_train,
        y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[stopper],
    )
    return network

In [12]:
def evaluate_model(model, X_val, y_val):
    """Return the accuracy of ``model`` on ``(X_val, y_val)``.

    The predicted class of each sample is the column index with the highest
    value in ``model.predict(X_val)``.
    """
    predicted_labels = np.argmax(model.predict(X_val), axis=1)
    return accuracy_score(y_val, predicted_labels)

After executing a grid search, we can see that the best hyperparameters are:

![parameter_search.png](parameter_search.png)

In [17]:
# Selected hyperparameters (the best values reported by the grid search above)
num_epochs = 201   # upper bound on epochs passed to the training function
dropout = 0.06     # rate used by both Dropout layers
# Stored as a number rather than the string '0.1'.
# NOTE(review): nothing in the visible cells reads `learning_rate`; the model is
# compiled with `optimizer` given by name -- confirm whether this should be wired in.
learning_rate = 0.1
patience = 15      # early-stopping patience, in epochs of `val_loss`
optimizer = 'adam'
regulizer = 0.1    # strength applied to both the L1 and the L2 kernel penalty
batch_size = 64

In [18]:
# Cross-validation over the dataset files: each entry of `datasets` is held
# out once as the validation fold while the others are used for training.
# The fold index is all `prepare_datasets` needs, so the loop runs over the
# dataset positions directly rather than over splitter indices it never uses.
cv_scores = []
for fold in range(len(datasets)):
    # Prepare training and validation datasets
    X_train, y_train, X_val, y_val = prepare_datasets(fold)

    # Build, compile, and train the model
    model = build_and_train_model(X_train, y_train, X_val, y_val, num_epochs)

    # Evaluate and store accuracy for this fold
    fold_accuracy = evaluate_model(model, X_val, y_val)
    cv_scores.append(fold_accuracy)

# Calculate and store the average accuracy for these hyperparameters
overall_average_accuracy = np.mean(cv_scores)
print(f"\nOverall Average Accuracy: {overall_average_accuracy:.4f}")

Epoch 1/201
Epoch 2/201
Epoch 3/201
Epoch 4/201
Epoch 5/201
Epoch 6/201
Epoch 7/201
Epoch 8/201
Epoch 9/201
Epoch 10/201
Epoch 11/201
Epoch 12/201
Epoch 13/201
Epoch 14/201
Epoch 15/201
Epoch 16/201
Epoch 17/201
Epoch 18/201
Epoch 19/201
Epoch 20/201
Epoch 21/201
Epoch 22/201
Epoch 23/201
Epoch 24/201
Epoch 25/201
Epoch 26/201
Epoch 27/201
Epoch 28/201
Epoch 29/201
Epoch 30/201
Epoch 31/201
Epoch 32/201
Epoch 33/201
Epoch 34/201
Epoch 35/201
Epoch 36/201
Epoch 37/201
Epoch 38/201
Epoch 39/201
Epoch 40/201
Epoch 41/201
Epoch 42/201
Epoch 43/201
Epoch 44/201
Epoch 45/201
Epoch 46/201
Epoch 47/201
Epoch 48/201
Epoch 49/201
Epoch 50/201
Epoch 51/201
Epoch 1/201
Epoch 2/201
Epoch 3/201
Epoch 4/201
Epoch 5/201
Epoch 6/201
Epoch 7/201
Epoch 8/201
Epoch 9/201
Epoch 10/201
Epoch 11/201
Epoch 12/201
Epoch 13/201
Epoch 14/201
Epoch 15/201
Epoch 16/201
Epoch 17/201
Epoch 18/201
Epoch 19/201
Epoch 20/201
Epoch 21/201
Epoch 22/201
Epoch 23/201
Epoch 24/201
Epoch 25/201
Epoch 26/201
Epoch 27/201
Epoc

```
### CNN
# Reshape data for CNN: (samples, features) -> (samples, 1, features, 1)
# NOTE(review): X_train_scaled, X_test_scaled, y_train_encoded, y_test_encoded,
# to_categorical, Sequential, Conv2D, MaxPooling2D, Flatten and Dense are not
# defined or imported in the visible part of this notebook -- confirm where they come from.
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1], 1))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1], 1))

# Convert labels to categorical one-hot encoding
y_train_onehot = to_categorical(y_train_encoded)
y_test_onehot = to_categorical(y_test_encoded)
# Define the CNN model with different activation functions for hidden layers
activation_functions = ['tanh', 'relu', 'sigmoid']

# Every (activation1, activation2) pair builds a new model and rebinds `model`.
for activation1 in activation_functions:
    for activation2 in activation_functions:
        # Define the CNN model
        # NOTE(review): the input height is 1 while kernels are 3x3 and pools are
        # 2x2 with default padding -- verify that this stack can actually be built.
        model = Sequential()
        model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(1, X_train_scaled.shape[1], 1)))
        model.add(MaxPooling2D((2, 2)))
        model.add(Conv2D(64, (3, 3), activation=activation1))
        model.add(MaxPooling2D((2, 2)))
        model.add(Conv2D(64, (3, 3), activation=activation2))
        model.add(Flatten())
        model.add(Dense(64, activation=activation1))
        # NOTE(review): sigmoid output is paired with categorical_crossentropy below;
        # confirm whether softmax was intended for one-hot multi-class labels.
        model.add(Dense(y_train_onehot.shape[1], activation='sigmoid'))  # Sigmoid for the output layer
# NOTE(review): the statements below sit outside both loops, so only the model
# built in the final iteration is compiled, trained and evaluated.
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train_reshaped, y_train_onehot, epochs=10, validation_split=0.2)
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test_reshaped, y_test_onehot)
print(f'Test 
```