# Import libraries

In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import time

# for audio
from IPython.display import Audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# for modeling
import tensorflow as tf
from sklearn.metrics import classification_report

# For visualization
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# For drive access
from google.colab import drive
import os
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define classes to frame audio files and extract features from them

In [None]:
class Framed:
  """
  Split a metadata DataFrame into train/val sets and attach framed audio to each.

  After construction, `train_df` and `val_df` each carry a 'framed' column that
  holds, per recording, the result of `tf.signal.frame` over the loaded audio.
  """

  # Directory holding the pre-loaded .npy audio arrays referenced by 'filename_npy'
  DEFAULT_DATA_DIR = '/content/drive/MyDrive/207/207-Project/data/train/librosa_loaded/'

  # Audio with at most this many samples is time-stretched before other augmentations
  SHORT_AUDIO_SAMPLES = 100000

  # Lower bound of the random time-stretch rate. The output length is roughly
  # len(audio) / rate, so a rate close to 0 would produce an enormous array
  # (and librosa requires rate > 0).
  MIN_STRETCH_RATE = 0.5

  def __init__(self,
               dataframe,
               sr=16000,
               window_size_s=8.0,
               hop_size_s=4.0,
               augment=True,
               data_dir=DEFAULT_DATA_DIR
               ):
    """
    Instantiate the Framed class, create train and val dfs
    The extract_framed() method is automatically called to extract the framed audios and add to the train and val dfs

    Alternatively, the extract_framed() method can be called separately by passing a dataframe to it

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information;
        must have a 'data' column ('train'/'val') and a 'filename_npy' column
      sr (int): Sample rate of the audio files. Default = 16000
      window_size_s (float): Window size in seconds for framing the audio. Default = 8.0
      hop_size_s (float): Hop size in seconds for framing the audio. Default = 4.0
      augment (bool): Whether to randomly augment the training audio before framing.
        Validation audio is never augmented. Default = True
      data_dir (str): Directory that the 'filename_npy' paths are relative to.
        Default = DEFAULT_DATA_DIR
    """

    self.sr = sr
    self.data_dir = data_dir
    self.frame_length = int(window_size_s * sr)
    self.frame_step = int(hop_size_s * sr)

    self.train_df, self.val_df = self._split_train_val(dataframe)

    self.extract_framed(self.train_df, augment=augment)
    self.extract_framed(self.val_df, augment=False)

  def _split_train_val(self, dataframe):
    """
    Split the dataframe to train and val based on predefined train/val column in the dataframe

    Each split is shuffled with a fixed random_state so the row order is reproducible.

    Returns:
      tuple: (train_df, val_df)
    """
    train_df = dataframe[dataframe['data'] == 'train']
    train_df = train_df.sample(frac=1, random_state=1234)

    val_df = dataframe[dataframe['data'] == 'val']
    val_df = val_df.sample(frac=1, random_state=1234)

    return train_df, val_df

  def augment(self, audio_array):
    """
    Apply one randomly chosen augmentation to an audio array

    Short audio (<= SHORT_AUDIO_SAMPLES samples) is first slowed down by a random
    rate in [MIN_STRETCH_RATE, 1). Then one of the following is applied:
      'original': no further change
      'noise': add Gaussian noise scaled by a random fraction of the peak sample
      'shift': circularly roll the samples by a random offset within +/- 5000 samples
      'pitch': pitch-shift by a random number of steps in [0, 1)

    Parameters:
      audio_array (np.ndarray): 1-D audio samples

    Returns:
      np.ndarray: the augmented audio
    """
    choice = np.random.choice(['original','noise','shift','pitch'])

    # if the audio is short, stretch it first, otherwise just apply other augmentation techniques
    if len(audio_array) <= self.SHORT_AUDIO_SAMPLES:
      stretch_rate = np.random.uniform(low=self.MIN_STRETCH_RATE, high=1)
      audio_array = librosa.effects.time_stretch(y=audio_array, rate=stretch_rate)

    if choice == 'noise':
      noise_amp = np.random.normal(loc=0.0, scale=0.05)*np.amax(audio_array)
      audio_array = audio_array + noise_amp*np.random.normal(size=audio_array.shape[0])
    elif choice == 'shift':
      shift_range = int(np.random.uniform(low=-5, high=5)*1000)
      audio_array = np.roll(audio_array, shift_range)
    elif choice == 'pitch':
      audio_array = librosa.effects.pitch_shift(y=audio_array, sr=self.sr, n_steps=np.random.uniform(low=0, high=1))

    return audio_array

  def extract_framed(self, dataframe, augment):
    """
    Load audio files from the given DataFrame, extract framed audios, and add the framed audios to the given DataFrame

    The DataFrame is modified in place: a 'framed' column is added (or overwritten).

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information
      augment (bool): augmentation is applied only when this is exactly True
    """

    framed = []
    for filename in dataframe['filename_npy']:
      audio = np.load(os.path.join(self.data_dir, filename))
      if augment is True:
        audio = self.augment(audio)
      # pad_end=False: trailing samples that do not fill a whole window are dropped
      framed.append(tf.signal.frame(audio, self.frame_length, self.frame_step, pad_end=False))
    assert len(framed) == len(dataframe)

    # add framed audios to df
    dataframe['framed'] = framed

In [None]:
class Extraction:
  """
  Extract labels and per-frame features from the framed train/val DataFrames.

  After construction, `train_y`/`val_y` hold one label per frame and
  `train_features`/`val_features` map each requested feature name to an array
  with one entry per frame.
  """

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=['mfcc'],
               normalize=True,
               avgpool=True
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames
    The extract_features() method is automatically called to extract the labels and features from the given DataFrame

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data (needs 'primary_label' and 'framed' columns)
      val_df (pd.DataFrame): DataFrame containing validation data (same columns)
      sr (int): Sample rate of the audio files. Default = 16000
      n_mfccs (int): Number of MFCC coefficients to extract. Default = 20
      n_mels (int): Number of mel bands for the mel spectrogram. Default = 20
      n_chroma (int): Number of chroma bins. Default = 12
      features (list): List features to extract. Default = ['mfcc']
        acceptable features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating'
      normalize (bool): Whether to min-max normalize the time-series features
      avgpool (bool): Whether to average-pool the time-series features over time
    """
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List"
    # copy so the instance never aliases the caller's list or the shared default list
    self.features = list(features)

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating']
    for each in self.features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating' are acceptable features"

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # normalize and/or average-pool the features
    if normalize is True:
      self.train_features, self.val_features = self.normalize_features(self.train_features, self.val_features)
    if avgpool is True:
      self.train_features, self.val_features = self.avgpool_features(self.train_features, self.val_features)


  # each of the extract_<feature>() methods is looked up by name in extract_features() based on the requested features
  # each of the features are extracted from librosa and transposed to represent (time_shape, n_features)
  def extract_mfcc(self, each):
    # NOTE(review): n_mels is not passed here, so librosa uses its own default number of
    # mel bands while n_fft=128 yields few frequency bins; this is presumably the source of
    # the 'mel_basis' warning printed during extraction -- TODO confirm
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs, n_fft=128, hop_length=64))

  def extract_chroma(self, each):
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma, n_fft=128, hop_length=64))

  def extract_rms(self, each):
    # NOTE(review): no frame_length is passed, unlike the n_fft=128 used by the other extractors -- confirm this is intended
    return np.transpose(librosa.feature.rms(y=np.array(each), hop_length=64))

  def extract_spectral_centroid(self, each):
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr, n_fft=128, hop_length=64))

  def extract_melspectrogram(self, each):
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels, n_fft=128, hop_length=64)
    return np.transpose(librosa.power_to_db(mel))


  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Min-max scale one feature to (0, 1), fitting on train only and applying to both sets

    Each sample is flattened to n_time * n_features columns before scaling, so every
    (time step, feature) position gets its own min/max taken from the training set.

    Returns:
      tuple: (train_X, val_X) reshaped back to (-1, n_time, n_features)
    """
    flat_width = n_time * n_features
    train_flat = train_X.reshape(-1, flat_width)
    val_flat = val_X.reshape(-1, flat_width)

    scaler = MinMaxScaler((0,1)).fit(train_flat)

    train_X = scaler.transform(train_flat).reshape(-1, n_time, n_features)
    val_X = scaler.transform(val_flat).reshape(-1, n_time, n_features)

    return train_X, val_X


  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average one feature over the time axis with a Keras GlobalAveragePooling1D layer

    Returns:
      tuple: (train_X, val_X) as returned by the pooling model's predict()
    """
    inputs = tf.keras.layers.Input(shape=(n_time, n_features))
    pooled = tf.keras.layers.GlobalAveragePooling1D()(inputs)
    pooling_model = tf.keras.models.Model(inputs=inputs, outputs=pooled)

    train_X = pooling_model.predict(train_X)
    val_X = pooling_model.predict(val_X)

    return train_X, val_X


  def _feature_widths(self):
    """Map each time-series feature name to its number of channels, in processing order."""
    return {
      'mfcc': self.n_mfccs,
      'chroma': self.n_chroma,
      'rms': 1,
      'spectral_centroid': 1,
      'melspectrogram': self.n_mels,
    }

  def _apply_per_feature(self, transform, done_suffix, train_features_dict, val_features_dict):
    """Run transform on every time-series feature present, updating both dicts in place."""
    for name, width in self._feature_widths().items():
      if name in train_features_dict:
        train_feature = train_features_dict[name]
        val_feature = val_features_dict[name]
        train_features_dict[name], val_features_dict[name] = transform(
          train_feature, val_feature, n_time=train_feature.shape[1], n_features=width)
        display(f'{name} {done_suffix}')


  # normalize the features if needed
  # 'continent', 'rating' do not need to be normalized
  def normalize_features(self, train_features_dict, val_features_dict):
    """
    Normalize every time-series feature present in the dicts (in place) and return both dicts
    """
    start_time = time.time()
    display('features normalization started ---------->')

    self._apply_per_feature(self.normalization, 'normalized', train_features_dict, val_features_dict)

    end_time = time.time()
    display(f'features normalization took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  # average pool the features if needed
  # 'continent', 'rating' do not need to be pooled
  def avgpool_features(self, train_features_dict, val_features_dict):
    """
    Average-pool every time-series feature present in the dicts (in place) and return both dicts
    """
    start_time = time.time()
    display('features average pooling started ---------->')

    self._apply_per_feature(self.avgpooling, 'avgpooled', train_features_dict, val_features_dict)

    end_time = time.time()
    display(f'features average pooling took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Every frame of every row yields one label and one entry per requested feature.
    'continent' and 'rating' are copied from the row; the other features are computed
    by the matching extract_<feature>() method.

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    display('feature extraction started ---------->')

    y = []
    features_dict = {item: [] for item in self.features}

    # resolve the extractor methods once instead of once per frame
    extractors = {}
    for feature in self.features:
      func = getattr(self, f"extract_{feature}", None)
      if callable(func):
        extractors[feature] = func

    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        if 'continent' in self.features:
          features_dict['continent'].append(row['continent'])
        if 'rating' in self.features:
          features_dict['rating'].append(row['rating'])

        for feature, func in extractors.items():
          features_dict[feature].append(func(each))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])

    y = np.array(y)

    end_time = time.time()
    display(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load the train/validation metadata CSV file

In [None]:
# One row per recording; the 'data' column marks each row as 'train' or 'val'
df = pd.read_csv('/content/drive/MyDrive/207/207-Project/notebooks/RG/3_species/train_val.csv')

df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,duration_secs_32000,country,continent,data
0,eaywag1,eaywag1/XC718442.ogg,blank,eaywag1/XC718442.npy,good,12.538781,FR,EUROPE,train
1,eaywag1,eaywag1/XC675682.ogg,call,eaywag1/XC675682.npy,good,35.657,RU,EUROPE,train
2,eaywag1,eaywag1/XC722533.ogg,blank,eaywag1/XC722533.npy,good,58.104,RU,EUROPE,train
3,eaywag1,eaywag1/XC673617.ogg,call,eaywag1/XC673617.npy,poor,18.756,GB,EUROPE,train
4,eaywag1,eaywag1/XC675935.ogg,call,eaywag1/XC675935.npy,good,16.666,RU,EUROPE,train


In [None]:
len(df)

940

In [None]:
# drop the samples with less than 8 seconds in duration:
# framing below uses an 8 s window with pad_end=False, so shorter clips would yield no frames
df = df[df['duration_secs_32000'] >= 8]

In [None]:
len(df)

794

# Extract train and val dfs with framed audios

In [None]:
%%time

# augment=False: frame the raw training audio without random augmentation
# (Framed never augments the validation audio)
framed = Framed(df,
               window_size_s=8.0,
               hop_size_s=4.0,
               augment=False)

CPU times: user 6.73 s, sys: 6.37 s, total: 13.1 s
Wall time: 39.4 s


In [None]:
framed.train_df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,duration_secs_32000,country,continent,data,framed
654,barswa,barswa/XC659151.ogg,call,barswa/XC659151.npy,good,47.438375,PT,EUROPE,train,"((tf.Tensor(-2.4908164e-05, shape=(), dtype=fl..."
33,eaywag1,eaywag1/XC738300.ogg,blank,eaywag1/XC738300.npy,good,18.408,FR,EUROPE,train,"((tf.Tensor(7.4012205e-06, shape=(), dtype=flo..."
206,eaywag1,eaywag1/XC675971.ogg,song,eaywag1/XC675971.npy,good,18.364,RU,EUROPE,train,"((tf.Tensor(1.6528298e-05, shape=(), dtype=flo..."
240,comsan,comsan/XC606397.ogg,call,comsan/XC606397.npy,good,52.584,FR,EUROPE,train,"((tf.Tensor(-4.419009e-06, shape=(), dtype=flo..."
433,comsan,comsan/XC469618.ogg,call,comsan/XC469618.npy,poor,37.27675,PL,EUROPE,train,"((tf.Tensor(-1.4243425e-05, shape=(), dtype=fl..."


In [None]:
framed.val_df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,duration_secs_32000,country,continent,data,framed
789,comsan,comsan/XC157141.ogg,call,comsan/XC157141.npy,good,46.92,NO,EUROPE,val,"((tf.Tensor(3.7471045e-06, shape=(), dtype=flo..."
910,barswa,barswa/XC480227.ogg,call,barswa/XC480227.npy,good,13.824,US,AMERICAS,val,"((tf.Tensor(2.1201558e-06, shape=(), dtype=flo..."
664,eaywag1,eaywag1/XC417163.ogg,song,eaywag1/XC417163.npy,good,91.896,UA,EUROPE,val,"((tf.Tensor(-5.267575e-06, shape=(), dtype=flo..."
800,comsan,comsan/XC638592.ogg,call,comsan/XC638592.npy,good,12.624,GB,EUROPE,val,"((tf.Tensor(-3.496225e-07, shape=(), dtype=flo..."
716,eaywag1,eaywag1/XC571987.ogg,call,eaywag1/XC571987.npy,good,57.573875,BE,EUROPE,val,"((tf.Tensor(-1.1569877e-05, shape=(), dtype=fl..."


# Extract features from train and val dfs

In [None]:
# features to extract; each name must be one of the features accepted by Extraction
features_list = ['mfcc', 'spectral_centroid']

In [None]:
%%time

# avgpool=False keeps the time axis of every feature, which the LSTM below consumes
features = Extraction(framed.train_df,
                      framed.val_df,
                      features=features_list,
                      normalize=True,
                      avgpool=False)

'feature extraction started ---------->'

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


'features extraction took 2.9 mins'

'feature extraction started ---------->'

'features extraction took 1.0 mins'

'features normalization started ---------->'

'mfcc normalized'

'spectral_centroid normalized'

'features normalization took 0.0 mins'

CPU times: user 3min 34s, sys: 2min 38s, total: 6min 13s
Wall time: 3min 54s


In [None]:
# one label per extracted frame (still string species codes at this point)
train_y = features.train_y

display(train_y.shape)
display(train_y[:5])

(4555,)

array(['barswa', 'barswa', 'barswa', 'barswa', 'barswa'], dtype='<U7')

In [None]:
# dict: feature name -> array with one entry per training frame
train_features = features.train_features

# show the name, overall shape and first frame of each extracted feature
for key in train_features:
  display(key, train_features[key].shape, train_features[key][0])

'mfcc'

(4555, 2001, 20)

array([[0.00338614, 0.8352443 , 0.46172464, ..., 0.6186624 , 0.70663685,
        0.59682107],
       [0.00619125, 0.7901093 , 0.5427949 , ..., 0.5511806 , 0.6487802 ,
        0.5252016 ],
       [0.0090096 , 0.7787183 , 0.48450667, ..., 0.56436545, 0.6488531 ,
        0.5753213 ],
       ...,
       [0.5033841 , 0.44364876, 0.4574382 , ..., 0.48450795, 0.23099616,
        0.5440367 ],
       [0.4898051 , 0.4010282 , 0.45603973, ..., 0.392355  , 0.5465311 ,
        0.48506096],
       [0.46907258, 0.4166221 , 0.3919567 , ..., 0.34186682, 0.59587497,
        0.5051658 ]], dtype=float32)

'spectral_centroid'

(4555, 2001, 1)

array([[0.61137424],
       [0.52825984],
       [0.52315837],
       ...,
       [0.35439899],
       [0.44519478],
       [0.40538586]])

In [None]:
# one label per extracted validation frame (still string species codes at this point)
val_y = features.val_y
# extract_features() already returns an ndarray, so this only makes a copy
val_y = np.array(val_y)

display(val_y.shape)
display(val_y[:5])

(1785,)

array(['comsan', 'comsan', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [None]:
# dict: feature name -> array with one entry per validation frame
val_features = features.val_features

# show the name, overall shape and first frame of each extracted feature
for key in val_features:
  display(key, val_features[key].shape, val_features[key][0])

'mfcc'

(1785, 2001, 20)

array([[0.28553534, 0.82879555, 0.4685034 , ..., 0.6132169 , 0.7109858 ,
        0.60240287],
       [0.2767973 , 0.7748526 , 0.52716374, ..., 0.56047726, 0.65523845,
        0.53791463],
       [0.288247  , 0.7732297 , 0.51268685, ..., 0.5508988 , 0.6520843 ,
        0.5922797 ],
       ...,
       [0.38764858, 0.67271477, 0.5722601 , ..., 0.5596724 , 0.44758523,
        0.6908587 ],
       [0.38421202, 0.67129374, 0.55717045, ..., 0.58740664, 0.5287953 ,
        0.76876664],
       [0.41384888, 0.5997746 , 0.70168513, ..., 0.65828586, 0.64106286,
        0.66367024]], dtype=float32)

'spectral_centroid'

(1785, 2001, 1)

array([[0.52498882],
       [0.54336891],
       [0.56493021],
       ...,
       [0.57962745],
       [0.62304436],
       [0.74273091]])

# Encode classes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit on the string labels held by `features` (not on the `train_y`/`val_y` names this
# cell rebinds), so re-running the cell gives the same encoding instead of fitting on
# already-encoded integers.
label_encoder = LabelEncoder().fit(features.train_y)
train_y = label_encoder.transform(features.train_y)
val_y = label_encoder.transform(features.val_y)

# classes_[i] is the species code encoded as integer i; works for any number of classes
classes = list(label_encoder.classes_)
classes

['barswa', 'comsan', 'eaywag1']

In [None]:
# sanity check: sizes and first encoded labels of both splits, plus the class names
print(len(train_y))
print(train_y[:5])
print(len(val_y))
print(val_y[:5])
print(classes)

4555
[0 0 0 0 0]
1785
[1 1 1 1 1]
['barswa', 'comsan', 'eaywag1']


In [None]:
# remove names that are no longer needed; the label names stay available in `classes`
del LabelEncoder
del label_encoder

In [None]:
%who

Audio	 Extraction	 Framed	 MinMaxScaler	 classes	 classification_report	 df	 drive	 features	 
features_list	 framed	 key	 librosa	 np	 os	 pd	 plt	 sns	 
tf	 time	 train_features	 train_y	 val_features	 val_y	 


# Shuffle the data before feeding it into the model

In [None]:
# Draw one seeded permutation and apply it to the labels and to every feature array,
# so frames and labels stay aligned.
# NOTE: re-running this cell permutes the already-shuffled data again.
train_len = len(train_y)
np.random.seed(1234)
train_indices = np.random.permutation(train_len)
# NumPy fancy indexing reorders along axis 0 without building a Python list of frames
train_features = {key: train_features[key][train_indices] for key in train_features}
train_y = train_y[train_indices]

display(train_y.shape)
display(train_y[:5])

for key in train_features.keys():
  display(key)
  display(train_features[key].shape)
  display(train_features[key][0])

(4555,)

array([0, 0, 1, 2, 1])

'mfcc'

(4555, 2001, 20)

array([[0.3553276 , 0.64529294, 0.28979385, ..., 0.8388775 , 0.5595019 ,
        0.61987257],
       [0.3672787 , 0.6013518 , 0.286879  , ..., 0.43387967, 0.71006435,
        0.609848  ],
       [0.37307036, 0.59147096, 0.37521827, ..., 0.6228888 , 0.65028226,
        0.8110755 ],
       ...,
       [0.49818277, 0.2488049 , 0.7156229 , ..., 0.9378917 , 0.37893093,
        0.6738539 ],
       [0.47667885, 0.23692179, 0.7962152 , ..., 0.8186811 , 0.5097628 ,
        0.65689635],
       [0.63455796, 0.24975747, 0.6249111 , ..., 0.5330628 , 0.4508566 ,
        0.5310422 ]], dtype=float32)

'spectral_centroid'

(4555, 2001, 1)

array([[0.55942881],
       [0.46440581],
       [0.55651992],
       ...,
       [0.86191297],
       [0.86035616],
       [0.78218592]])

In [None]:
# Draw one seeded permutation and apply it to the labels and to every feature array,
# so frames and labels stay aligned.
# NOTE: re-running this cell permutes the already-shuffled data again.
val_len = len(val_y)
np.random.seed(1234)
val_indices = np.random.permutation(val_len)
# NumPy fancy indexing reorders along axis 0 without building a Python list of frames
val_features = {key: val_features[key][val_indices] for key in val_features}
val_y = val_y[val_indices]

display(val_y.shape)
display(val_y[:5])

for key in val_features.keys():
  display(key)
  display(val_features[key].shape)
  display(val_features[key][0])

(1785,)

array([1, 0, 2, 2, 2])

'mfcc'

(1785, 2001, 20)

array([[0.28581583, 0.82879555, 0.4685034 , ..., 0.6132169 , 0.7109858 ,
        0.60240287],
       [0.27707088, 0.7748526 , 0.52716374, ..., 0.56047726, 0.65523845,
        0.53791463],
       [0.28853047, 0.7732297 , 0.51268685, ..., 0.5508988 , 0.6520843 ,
        0.5922797 ],
       ...,
       [0.34863627, 0.85480016, 0.634158  , ..., 0.46598727, 0.41815892,
        0.38599724],
       [0.33774936, 0.8664254 , 0.6333518 , ..., 0.47383314, 0.54734963,
        0.34620386],
       [0.44557786, 0.85717875, 0.48014978, ..., 0.5249997 , 0.552079  ,
        0.48980725]], dtype=float32)

'spectral_centroid'

(1785, 2001, 1)

array([[0.60096885],
       [0.49099804],
       [0.44816606],
       ...,
       [0.00609501],
       [0.02398301],
       [0.19629661]])

# Define LSTM functions

In [None]:
# accuracy per experiment, keyed by feature-set name; filled in by the evaluation cells below
train_results = {}
val_results = {}

In [None]:
def build_model(input_features, learning_rate=0.0001, n_classes=3):
  """
  Build and compile an LSTM classifier

  Architecture: LSTM(64) -> Dense(64) -> Dense(32) -> Dense(16) (ReLU, L2-regularized)
  -> Dropout(0.3) -> Dense(n_classes, softmax).

  Parameters:
    input_features (np.ndarray): array of shape (n_samples, n_time, n_features);
      only shape[1] and shape[2] are used, to size the LSTM input
    learning_rate (float): learning rate of the Adam optimizer. Default = 0.0001
    n_classes (int): number of output classes. Default = 3

  Returns:
    tf.keras.Model: model compiled with sparse categorical cross-entropy (integer labels)
      and the 'accuracy' metric
  """

  # reset Keras global state so repeated calls start from fresh layer names
  tf.keras.backend.clear_session()

  n_time, n_channels = input_features.shape[1], input_features.shape[2]

  model = tf.keras.models.Sequential()

  # add input layer
  model.add(tf.keras.layers.LSTM(64, input_shape=(n_time, n_channels), name='Input'))

  # hidden layers
  for units in (64, 32, 16):
    model.add(tf.keras.layers.Dense(units=units,
                                    activation='relu',
                                    kernel_regularizer=tf.keras.regularizers.l2(0.01)))

  # dropout layer
  model.add(tf.keras.layers.Dropout(rate=0.3, name='Dropout'))

  # output layer
  model.add(tf.keras.layers.Dense(units=n_classes, activation='softmax', name='Output'))

  # compile model
  model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                metrics=['accuracy'])

  return model

In [None]:
def visualize(model_history, best_epoch):
  """
  Plot training vs. validation loss and accuracy up to and including best_epoch

  Parameters:
    model_history: object whose .history dict holds the 'loss', 'val_loss',
      'accuracy' and 'val_accuracy' series (as returned by model.fit)
    best_epoch (int): zero-based index of the last epoch to plot
  """
  stop = best_epoch + 1
  series = model_history.history
  fig, (loss_ax, acc_ax) = plt.subplots(nrows = 1, ncols = 2, figsize=(15,5))

  # (axis, train key, val key, train legend, val legend, y label, title)
  panels = (
    (loss_ax, 'loss', 'val_loss', 'losses', 'val_losses', 'Loss', 'Loss Progression'),
    (acc_ax, 'accuracy', 'val_accuracy', 'accuracy', 'val_accuracy', 'Accuracy', 'Accuracy Progression'),
  )

  for axis, train_key, val_key, train_label, val_label, y_label, title in panels:
    axis.plot(series[train_key][:stop], color='purple', label=train_label)
    axis.plot(series[val_key][:stop], color='orange', label=val_label)
    axis.set_xlabel('Epoch')
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.legend()

  plt.show()

# Build an LSTM model using all extracted features (MFCC + spectral centroid)

In [None]:
# stack MFCC and spectral centroid along the last (feature) axis so each time step carries both
training_features = np.concatenate((train_features['mfcc'], train_features['spectral_centroid']), axis=-1)
training_features

array([[[0.35532761, 0.64529294, 0.28979385, ..., 0.55950189,
         0.61987257, 0.55942881],
        [0.3672787 , 0.6013518 , 0.286879  , ..., 0.71006435,
         0.60984802, 0.46440581],
        [0.37307036, 0.59147096, 0.37521827, ..., 0.65028226,
         0.81107551, 0.55651992],
        ...,
        [0.49818277, 0.2488049 , 0.7156229 , ..., 0.37893093,
         0.67385387, 0.86191297],
        [0.47667885, 0.23692179, 0.79621518, ..., 0.50976282,
         0.65689635, 0.86035616],
        [0.63455796, 0.24975747, 0.62491113, ..., 0.4508566 ,
         0.53104222, 0.78218592]],

       [[0.11940956, 0.82879555, 0.46850339, ..., 0.71098578,
         0.60240287, 0.56455545],
        [0.11491239, 0.77492076, 0.52655798, ..., 0.65428823,
         0.53701419, 0.57616006],
        [0.12037778, 0.77322972, 0.51268685, ..., 0.65208429,
         0.59227967, 0.60270639],
        ...,
        [0.38870358, 0.46339208, 0.39338219, ..., 0.4867807 ,
         0.33494741, 0.49045411],
        [0.4

In [None]:
training_features.shape

(4555, 2001, 21)

In [None]:
# stack MFCC and spectral centroid along the last (feature) axis, same layout as the training set
validation_features = np.concatenate((val_features['mfcc'], val_features['spectral_centroid']), axis=-1)
validation_features

array([[[0.28581583, 0.82879555, 0.46850339, ..., 0.71098578,
         0.60240287, 0.60096885],
        [0.27707088, 0.77485257, 0.52716374, ..., 0.65523845,
         0.53791463, 0.49099804],
        [0.28853047, 0.77322972, 0.51268685, ..., 0.65208429,
         0.59227967, 0.44816606],
        ...,
        [0.34863627, 0.85480016, 0.63415802, ..., 0.41815892,
         0.38599724, 0.00609501],
        [0.33774936, 0.8664254 , 0.6333518 , ..., 0.54734963,
         0.34620386, 0.02398301],
        [0.44557786, 0.85717875, 0.48014978, ..., 0.55207902,
         0.48980725, 0.19629661]],

       [[0.12070894, 0.82879555, 0.46850339, ..., 0.71098578,
         0.60240287, 0.59282535],
        [0.11604333, 0.77485257, 0.52716374, ..., 0.65523845,
         0.53791463, 0.57428279],
        [0.12183535, 0.7734887 , 0.51250196, ..., 0.65028304,
         0.59174019, 0.55518701],
        ...,
        [0.55499864, 0.35420373, 0.21736497, ..., 0.32488856,
         0.60312176, 0.39367425],
        [0.5

In [None]:
validation_features.shape

(1785, 2001, 21)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialization, dropout and batch
# shuffling are repeatable across runs of this cell
tf.keras.utils.set_random_seed(1234)

model = build_model(training_features, learning_rate=0.00002)

# stop once val_loss has not improved for 50 epochs (monitoring starts after epoch 50)
# and roll the model back to the weights of the best epoch
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=50,
    restore_best_weights=True,
    start_from_epoch=50,
)

history = model.fit(
   x=training_features,
   y=train_y,
   epochs=300,
   validation_data=(validation_features, val_y),
   batch_size=32,
   callbacks=[callback],
   verbose=0)

# zero-based index of the epoch with the lowest validation loss
best_epoch = history.history['val_loss'].index(min(history.history['val_loss']))

display('best_epoch:', best_epoch)

visualize(history, best_epoch)

## Evaluate the results

`On training set`

In [None]:
# share of each encoded class (0, 1, 2) among the training frames
for class_id in (0, 1, 2):
  display((train_y == class_id).sum()/len(train_y))

In [None]:
# class probabilities for every training frame, then the most likely class per frame
train_yhat = model.predict(training_features)
train_yhat_result = train_yhat.argmax(axis=-1)

In [None]:
# the model is compiled with metrics=['accuracy'], so evaluate() returns [loss, accuracy];
# keep the last entry (accuracy) under the 'all_features' key
train_results['all_features'] = model.evaluate(training_features,
                                                train_y)[-1]

In [None]:
# per-class precision / recall / F1 on the training frames, labelled with the species codes
print('\nTraining Classification Report\n')
print(classification_report(train_y, train_yhat_result, target_names=classes))

In [None]:
# confusion matrix of true vs. predicted labels on the training set
train_cm = tf.math.confusion_matrix(train_y, train_yhat_result)

# draw it as an annotated heatmap with the species codes on both axes
ax = sns.heatmap(
    train_cm,
    annot=True, fmt='.0f',
    cmap='Blues', cbar=False,
    xticklabels=classes, yticklabels=classes,
)

# label the axes and title the figure
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('train confusion matrix')
plt.show()

`On validation set`

In [None]:
# share of each encoded class (0, 1, 2) among the validation frames
for class_id in (0, 1, 2):
  display((val_y == class_id).sum()/len(val_y))

In [None]:
# class probabilities for every validation frame, then the most likely class per frame
val_yhat = model.predict(validation_features)
val_yhat_result = val_yhat.argmax(axis=-1)

In [None]:
# the model is compiled with metrics=['accuracy'], so evaluate() returns [loss, accuracy];
# keep the last entry (accuracy) under the 'all_features' key
val_results['all_features'] = model.evaluate(validation_features,
                                            val_y)[-1]

In [None]:
# per-class precision / recall / F1 on the validation frames, labelled with the species codes
print('\nValidation Classification Report\n')
print(classification_report(val_y, val_yhat_result, target_names=classes))

In [None]:
# confusion matrix of true vs. predicted labels on the validation set
val_cm = tf.math.confusion_matrix(val_y, val_yhat_result)

# draw it as an annotated heatmap with the species codes on both axes
ax = sns.heatmap(
    val_cm,
    annot=True, fmt='.0f',
    cmap='Blues', cbar=False,
    xticklabels=classes, yticklabels=classes,
)

# label the axes and title the figure
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('validation confusion matrix')
plt.show()

# Summary of results

In [None]:
# one (feature set, accuracy) row per experiment, rounded to two decimals
train_results_df = pd.DataFrame(
    [(name, accuracy) for name, accuracy in train_results.items()],
    columns=['Features', 'Train_Accuracy'],
).round(2)
val_results_df = pd.DataFrame(
    [(name, accuracy) for name, accuracy in val_results.items()],
    columns=['Features', 'Val_Accuracy'],
).round(2)

# join train and validation accuracy per feature set, ordered by validation accuracy (ascending)
result_df = (
    train_results_df
    .merge(val_results_df, on='Features')
    .sort_values('Val_Accuracy')
)
result_df