# Recurrent Neural Network

In this file, we will be addressing our take on the problem using a **recurrent neural network**.

We will begin by importing the necessary modules:

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm # progress bar on long runs
from scipy.io import wavfile as wav
import librosa
import os
import matplotlib.pyplot as plt
%matplotlib inline 

import tensorflow as tf
from keras import Sequential
from keras import regularizers as reg
from keras import layers

import warnings
warnings.filterwarnings('ignore')

ImportError: Unable to import required dependencies:
numpy: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

In [5]:
# Load the UrbanSound8K metadata CSV into a dataframe
metadata_csv = '../UrbanSound8K/metadata/UrbanSound8K.csv'
df = pd.read_csv(metadata_csv)

# Preview the first rows
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


As mentioned in the **project statement**, the target variable corresponds to the correct labeling of the sound. There are 10 different possible sounds in the dataset:

 - air conditioner
 - car horn
 - children playing
 - dog bark
 - drilling
 - engine idling
 - gun shot
 - jackhammer
 - siren
 - street music


We can already find the `classID` column, which essentially represents each label as an integer, from 0 to 9:


In [6]:
# One row per distinct (classID, class) combination, ordered by the integer ID
label_columns = df[['classID', 'class']]
class_id_pairs = label_columns.drop_duplicates().sort_values(by="classID")

# Print the ID -> name mapping, walking both columns in lockstep
for class_id, class_name in zip(class_id_pairs["classID"], class_id_pairs["class"]):
    print(f'classID: {class_id}, class: {class_name}')

classID: 0, class: air_conditioner
classID: 1, class: car_horn
classID: 2, class: children_playing
classID: 3, class: dog_bark
classID: 4, class: drilling
classID: 5, class: engine_idling
classID: 6, class: gun_shot
classID: 7, class: jackhammer
classID: 8, class: siren
classID: 9, class: street_music


This means that we can remove the last column and begin working with our dataset, which we already determined is slightly unbalanced for the `car_horn` and `gun_shot` classes: 

In [7]:
# 'classID' already encodes the label, so the textual 'class' column is dropped
df = df.drop(columns=['class'])

# Preview the remaining columns
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2


## Data Preprocessing

Since not every `.wav` file is 4 seconds long, we will apply **zero-padding** to ensure that all files meet this requirement.

In [8]:
# Zero-padding helper: brings every clip to the same number of samples so that
# the features extracted from it have a fixed shape.
def zero_pad_audio(audio, sample_rate, duration=4.0):
    """Pad (or trim) an audio signal to exactly ``duration`` seconds.

    The target length is ``round(sample_rate * duration)`` samples, measured
    along the last axis of ``audio``. Shorter signals get zeros appended at
    the end; longer signals are cut at the target length, so the result
    always has the same size on its last axis.

    Args:
        audio: Array-like of samples; the last axis is treated as time.
        sample_rate: Samples per second of ``audio``.
        duration: Target length in seconds (defaults to 4.0, the clip length
            this notebook standardises on).

    Returns:
        A numpy array with ``round(sample_rate * duration)`` samples on its
        last axis.
    """
    audio = np.asarray(audio)
    target_len = int(round(sample_rate * duration))
    current_len = audio.shape[-1]

    if current_len >= target_len:
        # Nothing to pad; trim so that every output has the same length.
        return audio[..., :target_len]

    # Pad only the end of the time axis and leave any leading axes untouched.
    pad_width = [(0, 0)] * (audio.ndim - 1) + [(0, target_len - current_len)]
    return np.pad(audio, pad_width, mode="constant")

In order to improve dataset consistency and training data quality, we also decided to create 2 new datasets: 

 - `df_22`: resamples data to 22050Hz
 - `df_44`: resamples data to 44100Hz

## NEED HELP

We are not sure whether resampling should be done before feature extraction or during it. Doing it during feature extraction would probably help automate extraction at different sound qualities, but the choice also depends on when the zero-padding is applied.

Librosa extracts MFCCs on different scales for different .wav files. This is due to the fact that lower frequencies are emphasized during this process, potentially creating bias issues as a consequence of heterogeneous distributions of frequencies throughout each file.

To address this issue, we can apply feature scaling to the new dataframes, in order to improve data quality for our modeling purposes:

In [None]:
# Uses sklearn's MinMax scaler, rescales values to be in a range of [0,1]
from sklearn.preprocessing import MinMaxScaler

# Scaler instance; in this cell it is only referenced by the disabled loop below
scaler = MinMaxScaler()

# /////////////////// NEEDS REVISION ///////////////////

# example = df_2d.iloc[0]["feature"][0]
# print("First arrray of the first entry in the 2D dataset: \n", example)

# NOTE(review): the triple-quoted string below is disabled code, kept as a
# string literal so that it does not run. Points to settle before enabling it:
#  - it reads row['feature'] from `df`, but the `df` loaded from the metadata
#    CSV shows no 'feature' column in this notebook -- presumably it should
#    iterate over the dataframe of extracted features instead; confirm.
#  - scaler.fit_transform is called once per row, so the scaler is re-fitted
#    on every sample rather than once on the whole dataset.
#  - .flatten() is applied whatever the input rank, so a 2D feature array
#    would be stored back as 1D.
'''
# Iterates over all original dataframe rows (predicts approximate runtime)
for index_num,row in tqdm(df.iterrows(), total=len(df), desc="Processing", unit="row"):
    # Get the "features" array for the current row
    features_array = row['feature']

    # Ensure the features_array is a 2D array (in case it is 1D)
    # If it's a 1D array of shape (40,) for example, reshape it into (40, 1) for scaling
    if isinstance(features_array, np.ndarray):  # Check if the element is a numpy array
        if features_array.ndim == 1:
            # Reshape the 1D array to 2D for scaling
            features_array = features_array.reshape(-1, 1)
        
        # Apply Min-Max scaling to the array
        scaled_features = scaler.fit_transform(features_array).flatten()  # Flatten to maintain 1D structure after scaling
        
        # Update the "features" column with the scaled array (in-place)
        df.at[index_num, 'feature'] = scaled_features
'''
# /////////////////// NEEDS REVISION ///////////////////

'\n# Iterates over all original dataframe rows (predicts approximate runtime)\nfor index_num,row in tqdm(df.iterrows(), total=len(df), desc="Processing", unit="row"):\n    # Get the "features" array for the current row\n    features_array = row[\'feature\']\n\n    # Ensure the features_array is a 2D array (in case it is 1D)\n    # If it\'s a 1D array of shape (40,) for example, reshape it into (40, 1) for scaling\n    if isinstance(features_array, np.ndarray):  # Check if the element is a numpy array\n        if features_array.ndim == 1:\n            # Reshape the 1D array to 2D for scaling\n            features_array = features_array.reshape(-1, 1)\n        \n        # Apply Min-Max scaling to the array\n        scaled_features = scaler.fit_transform(features_array).flatten()  # Flatten to maintain 1D structure after scaling\n        \n        # Update the "features" column with the scaled array (in-place)\n        df.at[index_num, \'feature\'] = scaled_features\n'

## Feature extraction
The **librosa** library has a built-in method for feature extraction, called [Mel-Frequency Cepstral Coefficients](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum), that summarises the frequency distribution across the time window.

In order to build the new dataset, we developed the following functions, which are capable of extracting **1D or 2D** features.

These feature extractor functions will represent the frequencies found in the wav files as **np arrays**, while using MFCCs in order to obtain features similar to the way humans perceive sounds.

In [10]:
# Averages the MFCCs over the time axis; audio is loaded with librosa's defaults
def features_extractor_1D(file):
    """Summarise an audio file as a single vector of 40 time-averaged MFCCs.

    Args:
        file: Path of the audio file, passed straight to ``librosa.load``.

    Returns:
        1D numpy array holding the mean of each of the 40 MFCCs over all
        frames.

    NOTE(review): ``librosa.load`` is called without ``sr=``, so the signal
    is loaded at librosa's default target rate (22050 Hz according to the
    librosa docs), not at the file's native rate -- confirm this is intended.
    """
    signal, rate = librosa.load(file)
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)
    # Transpose so that frames run along axis 0, then average them away
    return coefficients.T.mean(axis=0)

# Uses both Time and Frequency axis, custom sample rate
def features_extractor_2D(file, sample_rate):
    """Return the 40-coefficient MFCC matrix of an audio file at a given rate.

    Args:
        file: Path of the audio file, passed straight to ``librosa.load``.
        sample_rate: Sample rate in Hz (e.g. 22050 or 44100) that the audio
            is resampled to while loading and that the MFCCs are computed at.

    Returns:
        2D numpy array of MFCCs as returned by ``librosa.feature.mfcc`` with
        ``n_mfcc=40`` (coefficients x frames per the librosa docs).
    """
    # Pass sr= so librosa resamples to the requested rate. Without it the
    # audio is loaded at librosa's default rate and the mfcc call below would
    # be told a sample rate that does not match the samples it receives.
    audio, _ = librosa.load(file, sr=sample_rate)
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    return mfccs_features

Now that we know how to transform audio files into usable data types, we must associate each numpy array to their respective entry inside the df dataframe.

This will allow for important pre-processing steps to be applied accordingly, as well as proper Neural Network training and testing.

In [None]:
# Identify path containing all folds
audio_dataset_path='../UrbanSound8K/audio/'
# Accumulators for [feature, classID] pairs, one list per sample rate
extracted_features22=[]
extracted_features44=[]

# NOTE(review): the triple-quoted string below is disabled code, kept as a
# string literal so that it does not run. Points to settle before enabling it:
#  - the active file_name line joins the fold directory with '\\', a
#    Windows-only separator; the commented-out variant above it uses '/'.
#  - df_1d / df_2d both hold 2D features (22050 Hz and 44100 Hz respectively),
#    so the names are misleading; the CSV file names are the accurate ones.
#  - features_extractor_2D is called twice per file, once per sample rate.
'''

# Iterates over all original dataframe rows (predicts approximate runtime)
for index_num,row in tqdm(df.iterrows(), total=len(df), desc="Processing", unit="row"):
    # Identifies wav file name, concatenates to respective fold: accesses original .wav file
    #file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str(row["fold"])+'\\',str(row["slice_file_name"]))
    
    # Adds associated sound label
    final_class_labels=row["classID"]
    
    # 22050Hz sample rate
    data1=features_extractor_2D(file_name, 22050) 
    extracted_features22.append([data1,final_class_labels])

    # 44100hHz sample rate
    data2=features_extractor_2D(file_name, 44100) 
    extracted_features44.append([data2,final_class_labels])
    

# Convert extracted_features to Pandas dataframe
df_1d =pd.DataFrame(extracted_features22,columns=['feature','class'])
df_2d =pd.DataFrame(extracted_features44,columns=['feature','class'])

df_1d.to_csv("rnn_2d_22.csv", index=False)
df_2d.to_csv("rnn_2d_44.csv", index=False)

'''

Processing: 100%|██████████| 8732/8732 [03:25<00:00, 42.51row/s]


## Model Development

In order to develop an effective **Recurrent Neural Network**, the group decided to explore the concept of **Long Short Term Memory** (LSTM) networks. LSTMs are a type of RNN that are designed to handle sequential data pattern recognition. 

We consider this approach could be the most effective in order to classify the sounds, since continuous sounds or repetitive rhythms are sequential. These time-dependent aspects are characteristics which LSTMs are capable of recognizing and "remembering" throughout training.

In this case, each LSTM layer 

#### Neural Network Architecture

In order to build this LSTM model, we decided to do some research and attempt to look into the topology suggested in a few published papers, before attempting to change and improve classification performance.

The [first paper](https://dergi.neu.edu.tr/index.php/aiit/article/download/740/327/3147) suggests the following:

 - 22050Hz sample rate
 - 1 LSTM layer of size 128
 - followed by another LSTM layer of size 64
 - SoftMax for prediction
 - 50 epochs
 - Adam optimization
 - Dropout Rate = 0.2

This topology takes the large input vector size into consideration, meaning that it should maintain robustness and generally avoid major overfitting, while still allowing the model to identify patterns during an appropriate amount of time. It also uses a sufficiently good sound quality for environmental noise purposes, although we would like to verify if the difference in sound quality allows for better classification. A dropout rate of 0.2 (the rate at which neurons are randomly disabled) is used in order to prevent overfitting.

Overall, this model aims for efficiency and computation speed, in an effort to reduce training time while preserving model quality and performance, hence why we chose this as our starter model topology.


[The second paper](https://annals-csis.org/Volume_18/drp/pdf/185.pdf) proposes almost identical topology (similar to most of the projects found), with the small changes of:
 - 44100Hz sample rate
 - 64 epochs
 - Dropout Rate = 0.25

We expect this model to take longer to train (due to the higher number of epochs).
It could lead to overfitting, which is why the **dropout rate** has also been **adjusted** to the slightly higher value of 0.25. If it is correctly trained, it **should demonstrate better results**, compared to the previous settings.

Overall, it is a generally "riskier" model, and it was chosen to assess whether the combination of an **increase in epochs** and **improved sound quality** can compensate for the extra training time, rendering the training problems relatively irrelevant if the generalization and classification abilities of the resulting model show significant improvements.

### Model Performance Predictions


We expect the **first model** to be trained faster due to inferior sound quality and lower epochs, in contrast with the **second model**, which could overfit but benefits from higher sound quality and dropout rate, potentially mitigating such problems, overall being preferable.

Both results will be used as a type of foundation for the final topology decisions.

## REVIEW !!!

If we have time, we can try:

 - 2 LSTM layers of size 128 (might take a while)
 - SoftMax for combination of 2D results
 - Sigmoid or Tanh for classification

### Model Development

Following the previously stated network topology and parameters, and using the available **tensorflow** and **keras** modules, we can start implementing an LSTM neural network, using available tools like:

 - `keras.Sequential()`: 
 - `layers.LSTM`: 
 - `layers.Dropout`:
 - `layers.TimeDistributed`: 
 - `layers.Dropout`:

We have created a Neural Network generating function, which is based on the following concepts: 

 - By ensuring that the input has a fixed shape and frequency, we can now use a Sequential keras model. This will allow for simple layer stacking and parameter configuration;
 - A dropout rate of 0.2 is employed in between each layer (with the exception of flatten and classification layers);
 - Time Distributed layers allow for easier handling of the extracted sequential features, by preserving the sequential time dimension features while reducing dimensionality;
 - In order to potentially improve our classifier's robustness, we decided to employ **L2 regularization** (also known as Ridge regularization; Lasso is the L1 variant), due to our dataset's reduced size;



In [None]:
# Easily configurable to generate with different architecture
# Implicitly creates first model, explained above

def generate_lstm(x, y, lstm1_size=128, lstm2_size=64, dropout=0.2):
    """Build the (uncompiled) stacked-LSTM sound classifier.

    Topology: LSTM -> LSTM -> three TimeDistributed Dense layers (64, 32 and
    16 units, tanh, L2-regularised) -> Flatten -> Dense softmax over the 10
    classes. A Dropout layer follows every LSTM and TimeDistributed layer.

    Args:
        x: First dimension of one input sample. Keras LSTM layers read
            ``input_shape`` as (time steps, features), so this is the number
            of time steps. It must be a fixed integer because the model ends
            in a Flatten layer.
        y: Second dimension of one input sample (features per time step).
        lstm1_size: Number of units in the first LSTM layer.
        lstm2_size: Number of units in the second LSTM layer.
        dropout: Rate used by every Dropout layer.

    Returns:
        A ``keras.Sequential`` model. It is NOT compiled: call
        ``model.compile(...)`` (optimizer, loss, metrics) before training.
    """
    # Shape of a single sample (the batch dimension is implicit)
    train_shape = (x, y)

    # Initializes sequential model
    model = Sequential()

    # First LSTM layer: returns the full sequence so the next LSTM can consume it
    model.add(layers.LSTM(lstm1_size, input_shape=train_shape, return_sequences=True, activation='tanh'))
    model.add(layers.Dropout(dropout))

    # Second LSTM layer: must ALSO return the full sequence. The
    # TimeDistributed layers below apply a Dense layer to every time step and
    # therefore need 3D input (batch, time, features); with
    # return_sequences=False the output is 2D and building the model fails.
    model.add(layers.LSTM(lstm2_size, return_sequences=True, activation='tanh'))
    model.add(layers.Dropout(dropout))

    # Intermediate dense layers applied per time step: shrink the feature
    # dimension 64 -> 32 -> 16 while keeping the time dimension intact
    for units in (64, 32, 16):
        model.add(layers.TimeDistributed(
            layers.Dense(units, activation='tanh', kernel_regularizer=reg.l2(0.01))
        ))
        model.add(layers.Dropout(dropout))

    # Flattens the (time, 16) output of the last TimeDistributed layer
    model.add(layers.Flatten())

    # Dense output layer -> classification over the 10 sound classes
    model.add(layers.Dense(10, activation='softmax'))

    return model


## STILL NEEDS TO BE COMPILED AND ADAM OPTIMIZED!!!

# References

Many-to-Many LSTM for Sequence Prediction with TimeDistributed layers - [link](https://machinelearningmastery.com/timedistributed-layer-for-long-short-term-memory-networks-in-python/)