# Imports

In [None]:
import pandas as pd
import numpy as np
import re
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


# Data pre-processing and preparation

### Obtaining the data

We obtained the data by running the following Python script:

In [None]:
# python features.py

### Unifying the class labels with the data

In [None]:
# Load the feature table from 'urbansounds_features.csv' into a DataFrame.
df = pd.read_csv('urbansounds_features.csv')

In [None]:
# Split every 'Label' string on '-' and keep only the second field (index 1).
# The result is still a string at this point; it is cast to int64 further down.
df['Label'] = df['Label'].str.split('-').str[1]

# Display the first few rows of the updated DataFrame
print(df.head())

### Check for object values

In [None]:
# Names of all columns whose dtype is 'object' (i.e. not stored as numbers).
object_columns = df.select_dtypes(include=['object']).columns
# Bare expression so the notebook displays the resulting Index.
object_columns

### Convert the columns with object values to numeric

In [None]:
# One numeric token: optional sign, digits with an optional fractional part
# (or a bare fraction such as ".5"), and an optional exponent ("1.e-05").
_NUMBER_PATTERN = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")


def calculate_mean_from_string(string):
    """Return the mean of all numeric tokens found in ``string``.

    The text is scanned for numbers (signed integers, decimals and
    scientific notation); everything else -- brackets, whitespace, line
    breaks, letters -- is ignored and acts as a separator.

    Args:
        string: Text containing numbers, e.g. ``"[ 1.5 -2.\\n 3.e-01]"``.

    Returns:
        The arithmetic mean of the extracted numbers as a float, or NaN
        when ``string`` contains no number at all.
    """
    # Newlines are deliberately left in place: removing them could glue the
    # last number of one line to the first number of the next ("1.5\n2.5"
    # would become "1.52.5"), whereas the pattern never matches across them.
    tokens = _NUMBER_PATTERN.findall(string)
    if not tokens:
        # Explicit NaN instead of np.mean([]), which yields the same value
        # but only after emitting a RuntimeWarning.
        return np.nan
    return np.mean(np.array(tokens, dtype=float))

In [None]:
# Cast the 'Label' column to 64-bit integers.
df['Label'] = df['Label'].astype('int64')

In [None]:
# Replace each 'fourier_tempogram' string by a single number: the mean of the
# numeric values calculate_mean_from_string extracts from it.
df['fourier_tempogram'] = df['fourier_tempogram'].apply(calculate_mean_from_string)

In [None]:
# Bare expression so the notebook displays the current DataFrame.
df

### Check interval of the values per column

In [None]:
# From the describe() summary keep only the 'min' and 'max' rows, giving the
# value range of each summarised column.
column_intervals = df.describe().loc[['min', 'max']]
# Bare expression so the notebook displays the table.
column_intervals

### Check the distribution of the classes

In [None]:
# Count how many samples fall into each class, then keep the class ids and
# their counts as separate sequences for plotting.
class_counts = df['Label'].value_counts()
class_labels, class_values = class_counts.index, class_counts.values

# Bar chart: one bar per class, bar height = number of samples.
plt.figure(figsize=(8, 6))
plt.bar(class_labels, class_values, color='skyblue')
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

### The classes are neither balanced nor normalized, so we will need to address that in the training set

In [None]:
# Target: the class id of every sample.
y = df['Label']
# Features: every column except the target.
X = df.drop(columns='Label')

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps the class proportions of the full dataset in both splits;
# with imbalanced classes a purely random split can leave the test set with
# too few (or no) samples of the rarer classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Oversample the training data with SMOTE. Only the training split is passed
# to fit_resample; the test split is left untouched.
smote = SMOTE(random_state=42)  # fixed seed for reproducible resampling
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Plot the per-class sample counts of the resampled training labels.
class_counts = y_train_resampled.value_counts()
class_labels, class_values = class_counts.index, class_counts.values

# Bar chart: one bar per class, bar height = number of samples.
plt.figure(figsize=(8, 6))
plt.bar(class_labels, class_values, color='skyblue')
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

In [None]:
# Standardize the feature values.
# The scaler is fitted on the resampled training data only and then applied,
# with those same statistics, to both the training and the test features.
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Model architecture definition

The classifiers we chose were the following: a classifier based on multilayer perceptron (MLP) and a convolutional neural network (CNN).

### MLP classifier

For the MLP classifier, we need to define the following parameters: number of layers, number of neurons
per layer and the activation function for each layer.

# Training strategy

Optimizer
Learning hyperparameters
Regularization techniques
Possibility of using transfer learning

Choosing the best optimizer involves experimenting and comparing the results of different optimizers to see what works best. We tested the following: SGD, Adam, and Adagrad.
A traditional default value for the learning rate is 0.1 or 0.01. We chose 0.05 as a learning rate for our MLP classifier.
Batch size - ??
Too few epochs may result in underfitting, while too many epochs may lead to overfitting, so we tried 300 and got good results.
We used early stopping to prevent overfitting and chose 0.1 as the dropout rate.
 

## CNN classifier