# Imports

In [None]:
import pandas as pd
import numpy as np
import re
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Dataset

In [None]:
# Read the ten feature CSVs (files numbered 1-10) into a list of DataFrames,
# kept in file-number order so that list index k holds file k + 1.
feature_paths = [f'datasets/urbansounds_features_{i}.csv' for i in range(1, 11)]
datasets = [pd.read_csv(path) for path in feature_paths]

# Clean the Dataset

In [None]:
def calculate_mean_from_string(string):
    """Return the mean of every numeric literal found in *string*.

    Used to collapse a text cell holding several numbers (presumably the
    printed form of a NumPy array, e.g. ``"[-1.5e-03  2.  -7]"``) into a
    single scalar feature.

    Args:
        string: Text containing zero or more numbers. Anything that is not
            part of a number (brackets, whitespace, newlines) acts as a
            separator.

    Returns:
        The arithmetic mean of the parsed numbers, or NaN when the text
        contains no number at all.
    """
    # One token per number: an optional sign that applies to integers as well
    # as decimals, a mantissa ("12", "12.", "12.5" or ".5"), and an optional
    # exponent so that scientific notation such as "1.5e-05" stays one value
    # instead of being split into "1.5" and "05".
    tokens = re.findall(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?", string)
    if not tokens:
        # Explicit NaN rather than np.mean([]) and its RuntimeWarning.
        return np.nan
    return np.mean([float(token) for token in tokens])

In [None]:
# Normalise every fold in place so that all columns are numeric:
#   * 'Label' strings are split on '-' and the second field is kept as an int;
#   * any other non-numeric column is reduced to the mean of the numbers in
#     each cell via calculate_mean_from_string.
# Columns that are already numeric are left untouched, which also makes this
# cell safe to re-run after the labels have been converted.
for df in datasets:
    for column in df.columns:
        already_numeric = pd.api.types.is_numeric_dtype(df[column])
        if already_numeric:
            continue
        if column == 'Label':
            df[column] = df[column].str.split('-').str[1].astype(int)
        else:
            df[column] = df[column].apply(calculate_mean_from_string)

In [None]:
def plot_learning_curve(history):
    """Display the accuracy curves, then the loss curves, of a training run.

    Args:
        history: Object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch values
            (e.g. the value returned by a Keras ``fit`` call).
    """
    # (training key, validation key, figure title, y-axis label) per figure.
    panels = (
        ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Model loss', 'Loss'),
    )
    curves = history.history
    for train_key, val_key, title, y_label in panels:
        plt.plot(curves[train_key])
        plt.plot(curves[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()


In [None]:
def plot_confusion_matrix(y_true, y_pred, class_labels):
    """Draw the confusion matrix of a set of predictions as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        class_labels: Tick labels used on both axes of the heatmap.
    """
    counts = confusion_matrix(y_true, y_pred)
    heatmap_options = {
        'annot': True,   # write the count inside every cell
        'fmt': 'd',      # counts are integers
        'cmap': 'Blues',
        'xticklabels': class_labels,
        'yticklabels': class_labels,
    }
    plt.figure(figsize=(8, 6))
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# Classification

## MLP

In [None]:
# Settings for the fully connected (MLP) classifier.
hyperparameters_mlp = dict(
    input_neurons=None,  # placeholder; determined dynamically
    output_neurons=10,
    n_layers=4,
    neurons_per_layer=[256, 128, 64, 32],
    dropout=0.5,
    learning_rate=0.0005,
    batch_size=128,
    epochs=20,
    regularizer=None,
    patience=10,
)


In [None]:
# 10-fold cross-validation of the MLP: each entry of `datasets` is held out
# once as the test fold while the remaining nine are used for training.
hyperparameters = hyperparameters_mlp

accuracies = []

for fold in range(10):
    test_data = datasets[fold]
    test_labels = test_data['Label'].values
    test_data = test_data.drop(columns=['Label']).values

    train_datasets = datasets[:fold] + datasets[fold + 1:]

    train_data = pd.concat(train_datasets)
    train_labels = train_data['Label'].values
    train_data = train_data.drop(columns=['Label']).values

    # Carve out the validation set BEFORE oversampling. Keras'
    # `validation_split` takes the trailing rows of the array, and SMOTE
    # appends its synthetic rows at the end, so splitting afterwards would
    # validate (and early-stop) almost exclusively on synthetic samples.
    train_data, val_data, train_labels, val_labels = train_test_split(
        train_data, train_labels,
        test_size=0.2,
        stratify=train_labels,
        random_state=42,
    )

    smote = SMOTE(random_state=42)
    scaler = StandardScaler()

    # Oversample and fit the scaler on the training portion only, then apply
    # the same scaling to validation and test data; the network must see the
    # held-out fold in the feature space it was trained on.
    train_data, train_labels = smote.fit_resample(train_data, train_labels)
    train_data = scaler.fit_transform(train_data)
    val_data = scaler.transform(val_data)
    test_data = scaler.transform(test_data)

    hyperparameters['input_neurons'] = train_data.shape[1]

    model = tf.keras.Sequential()

    model.add(tf.keras.layers.Flatten(input_shape=(hyperparameters['input_neurons'],)))

    # One Dense + Dropout pair per configured hidden layer width.
    for neurons in hyperparameters['neurons_per_layer']:
        model.add(tf.keras.layers.Dense(
            neurons,
            activation='relu',
            kernel_regularizer=hyperparameters['regularizer']
        ))
        model.add(tf.keras.layers.Dropout(hyperparameters['dropout']))

    model.add(tf.keras.layers.Dense(hyperparameters['output_neurons'], activation='softmax'))

    model.compile(
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=hyperparameters['learning_rate']),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Stop when validation loss stalls and roll back to the best epoch.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=hyperparameters['patience'],
        restore_best_weights=True
    )

    history = model.fit(
        train_data, train_labels,
        epochs=hyperparameters['epochs'],
        batch_size=hyperparameters['batch_size'],
        validation_data=(val_data, val_labels),
        callbacks=[early_stopping],
        verbose=0
    )
    #plot_learning_curve(history)

    predictions = model.predict(test_data)
    predicted_labels = np.argmax(predictions, axis=1)
    #plot_confusion_matrix(test_labels, predicted_labels, class_labels=['0', '1', '2', '3','4', '5', '6', '7','8','9'])

    accuracy = accuracy_score(test_labels, predicted_labels)
    accuracies.append(accuracy)


mean_accuracy = np.mean(accuracies)

print(f"Mean Accuracy across folds (MLP): {mean_accuracy}")

## RNN

In [None]:
# Settings for the recurrent (LSTM) classifier.
hyperparameters_rnn = dict(
    input_neurons=None,  # placeholder; determined dynamically
    output_neurons=10,
    lstm_units=64,
    dropout=0.3,
    learning_rate=0.0005,
    batch_size=128,
    epochs=20,
    regularizer=None,
    patience=10,
)


In [None]:
# 10-fold cross-validation of the LSTM: each entry of `datasets` is held out
# once as the test fold while the remaining nine are used for training.
hyperparameters = hyperparameters_rnn

accuracies = []

for fold in range(10):
    test_data = datasets[fold]
    test_labels = test_data['Label'].values
    test_data = test_data.drop(columns=['Label']).values

    train_datasets = datasets[:fold] + datasets[fold + 1:]

    train_data = pd.concat(train_datasets)
    train_labels = train_data['Label'].values
    train_data = train_data.drop(columns=['Label']).values

    # Carve out the validation set BEFORE oversampling. Keras'
    # `validation_split` takes the trailing rows of the array, and SMOTE
    # appends its synthetic rows at the end, so splitting afterwards would
    # validate (and early-stop) almost exclusively on synthetic samples.
    train_data, val_data, train_labels, val_labels = train_test_split(
        train_data, train_labels,
        test_size=0.2,
        stratify=train_labels,
        random_state=42,
    )

    smote = SMOTE(random_state=42)
    train_data, train_labels = smote.fit_resample(train_data, train_labels)

    # Fit the scaler on the training portion only, then apply the same
    # scaling to validation and test data; the network must see the held-out
    # fold in the feature space it was trained on.
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    val_data = scaler.transform(val_data)
    test_data = scaler.transform(test_data)

    # The LSTM expects 3-D input (samples, timesteps, features): every
    # feature column becomes one timestep carrying a single value.
    n_features = train_data.shape[1]
    train_data = train_data.reshape(-1, n_features, 1)
    val_data = val_data.reshape(-1, n_features, 1)
    test_data = test_data.reshape(-1, n_features, 1)

    hyperparameters['input_neurons'] = n_features

    model = tf.keras.Sequential()

    model.add(tf.keras.layers.LSTM(
        hyperparameters['lstm_units'],
        input_shape=(n_features, 1),
        dropout=hyperparameters['dropout'],
        return_sequences=False
    ))

    model.add(tf.keras.layers.Dense(hyperparameters['output_neurons'], activation='softmax'))

    model.compile(
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=hyperparameters['learning_rate']),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Stop when validation loss stalls and roll back to the best epoch.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=hyperparameters['patience'],
        restore_best_weights=True
    )

    history = model.fit(
        train_data, train_labels,
        epochs=hyperparameters['epochs'],
        batch_size=hyperparameters['batch_size'],
        validation_data=(val_data, val_labels),
        callbacks=[early_stopping],
        verbose=0
    )
    #plot_learning_curve(history)

    predictions = model.predict(test_data)
    predicted_labels = np.argmax(predictions, axis=1)
    #plot_confusion_matrix(test_labels, predicted_labels, class_labels=['0', '1', '2', '3','4', '5', '6', '7','8','9'])

    accuracy = accuracy_score(test_labels, predicted_labels)
    accuracies.append(accuracy)

mean_accuracy = np.mean(accuracies)

print(f"Mean Accuracy across folds (RNN): {mean_accuracy}")
