<a href="https://colab.research.google.com/github/rachlllg/Project_Bird-Song-Classifier-with-Machine-Learning/blob/main/5_sec_audio_features_avgpooled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
# for drive access
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Self Defined Class Methods For Feature Extraction

In [None]:
class Extraction:
  """
  Extract per-frame labels and features from framed train and val DataFrames.

  Every element of a row's 'framed' column becomes one sample. The requested features are
  extracted for each sample and the numeric (audio) features are then optionally
  normalized and/or average pooled over the time axis.

  After instantiation the results are available as:
    train_y, val_y (np.ndarray): one label per frame.
    train_features, val_features (dict): feature name -> np.ndarray with one entry per frame.
  """

  # metadata features are copied straight from the DataFrame row and are never normalized or pooled
  _metadata_features = ('continent', 'rating', 'type')

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=('mfcc',), #immutable default; a list is equally accepted
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames.

    The extract_features() method is automatically called for both DataFrames, followed by
    process_features() to normalize and/or average pool the numeric features.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
      val_df (pd.DataFrame): DataFrame containing validation data.
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      do_normalize (bool): Whether to min-max normalize the numeric features.
      do_avgpool (bool): Whether to average pool the numeric features over the time axis.

    Raises:
      AssertionError: If no feature is given or a feature is not one of the accepted features.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."
    # copy so the caller's list is never shared, and drop duplicates (keeping order): a duplicated
    # audio feature would otherwise be appended twice per frame and no longer line up with the labels
    self.features = list(dict.fromkeys(features))

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    for each in self.features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accpeted features."

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # process the features by applying normalization or average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)


  ########################################
  # progress messages use IPython's display() inside a notebook and fall back to print() elsewhere
  ########################################
  @staticmethod
  def _show(message):
    """Show a progress message with display() when it exists, otherwise with print()."""
    try:
      display(message)
    except NameError:
      print(message)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    """Return the MFCCs of one audio frame, transposed to (n_time, n_mfccs)."""
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    """Return the chromagram of one audio frame, transposed to (n_time, n_chroma)."""
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    """Return the RMS energy of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.rms(y=np.array(each)))

  def extract_spectral_centroid(self, each):
    """Return the spectral centroid of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    """Return the mel spectrogram of one audio frame in dB, transposed to (n_time, n_mels)."""
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Min-max normalize the train and val features to the range (0, 1)

    The scaler is fit on the training feature only and then applied to both sets, so the
    validation feature may fall slightly outside (0, 1).

    Parameters:
      train_X (npy): Training feature of shape (n_samples, n_time, n_features)
      val_X (npy): Validation feature of shape (n_samples, n_time, n_features)
      n_time (int): Size of the time axis
      n_features (int): Size of the feature axis

    Returns:
      train_X (npy): Normalized training feature, same shape as the input
      val_X (npy): Normalized validation feature, same shape as the input
    """
    # flatten each sample to one row, i.e. a 2D array of shape (n_samples, n_time * n_features),
    # because MinMaxScaler only accepts 2D input
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    val_X_reshape = val_X.reshape(-1, n_time * n_features)

    # use minmaxscaler to normalize the train and val features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    val_X_scaled = scaler.transform(val_X_reshape)

    # reshape the features to the original shape (n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    val_X = val_X_scaled.reshape(-1, n_time, n_features)

    return train_X, val_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features over the time axis

    The mean is computed directly with NumPy. This gives the same result as a
    GlobalAveragePooling1D layer without building a model or clearing the global Keras session.

    Parameters:
      train_X (npy): Training feature of shape (n_samples, n_time, n_features)
      val_X (npy): Validation feature of shape (n_samples, n_time, n_features)
      n_time (int): Size of the time axis
      n_features (int): Size of the feature axis

    Returns:
      train_X (npy): Avgpooled training feature of shape (n_samples, n_features), dtype float32
      val_X (npy): Avgpooled validation feature of shape (n_samples, n_features), dtype float32

    Raises:
      ValueError: If an input is not of shape (n_samples, n_time, n_features).
    """
    def pool(X):
      X = np.asarray(X)
      if X.ndim != 3 or X.shape[1:] != (n_time, n_features):
        raise ValueError(f"expected an array of shape (n_samples, {n_time}, {n_features}), got {X.shape}")
      # accumulate in float64 for accuracy, return float32 like the keras layer did
      return X.mean(axis=1, dtype=np.float64).astype(np.float32)

    return pool(train_X), pool(val_X)


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, val_features_dict):
    """
    Apply normalization or average pooling to train and val features

    The dictionaries are updated in place and also returned. 'continent', 'rating' and
    'type' are left untouched. When both options are enabled, normalization is applied
    before average pooling.

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      val_features_dict (dict): Dictionary containing the validation features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and validation features dictionary

    Raises:
      ValueError: If a key is neither a metadata feature nor a known numeric feature.
    """
    start_time = time.time()
    self._show('features processing started ---------->')

    # size of the feature axis for every numeric feature
    feature_sizes = {
      'mfcc': self.n_mfccs,
      'chroma': self.n_chroma,
      'rms': 1,
      'spectral_centroid': 1,
      'melspectrogram': self.n_mels,
    }

    for each in train_features_dict.keys():
      # 'continent', 'rating', and 'type' do not need to be processed
      if each in self._metadata_features:
        continue

      # fail loudly instead of reusing the feature size of a previous iteration
      if each not in feature_sizes:
        raise ValueError(f"cannot process unknown feature '{each}'")
      n_features = feature_sizes[each]

      # index the dictionary to find the value of the feature based on the feature name (key)
      train_feature = train_features_dict[each]
      val_feature = val_features_dict[each]
      n_time = train_feature.shape[1]

      # normalization (if requested) always runs before average pooling (if requested)
      if self.do_normalize:
        train_feature, val_feature = self.normalization(train_feature, val_feature, n_time=n_time, n_features=n_features)
      if self.do_avgpool:
        train_feature, val_feature = self.avgpooling(train_feature, val_feature, n_time=n_time, n_features=n_features)

      train_features_dict[each], val_features_dict[each] = train_feature, val_feature

      self._show(f'{each} processed')

    end_time = time.time()
    self._show(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Each element of a row's 'framed' value yields one sample that carries the row's
    'primary_label' and, when requested, the row's 'continent', 'rating' and 'type'.

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information; must have the
        'primary_label' and 'framed' columns plus a column for every requested metadata feature

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    self._show(f'feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # resolve once, outside the per-frame loop, which metadata columns to copy
    # and which extract_x methods to call
    metadata_columns = [name for name in self._metadata_features if name in self.features]
    extractors = []
    for feature in self.features:
      func = getattr(self, f"extract_{feature}", None)
      if callable(func):
        extractors.append((feature, func))

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        # metadata is identical for every frame cut from the same recording
        for column in metadata_columns:
          features_dict[column].append(row[column])

        # dynamically call the extract_x function to extract the listed features
        for feature, func in extractors:
          features_dict[feature].append(func(each))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    self._show(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and validation framed audio data

In [None]:
# read the pickled training DataFrame from Drive and preview its first rows
train_pkl_path = '/content/drive/MyDrive/project/train_val_csv_pkl/train_df_5_sec.pkl'
with open(train_pkl_path, 'rb') as pkl_file:
  train_df = pickle.load(pkl_file)

train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
240,eaywag1,call,42.965,9.4512,5.0,eaywag1/XC471811.ogg,17.8155,FR,EUROPE,eaywag1/XC471811.npy,train,"((tf.Tensor(7.637085e-06, shape=(), dtype=floa..."
542,barswa,song,42.806,13.8335,4.0,barswa/XC371853.ogg,50.834313,IT,EUROPE,barswa/XC371853.npy,train,"((tf.Tensor(-1.867149e-06, shape=(), dtype=flo..."
214,comsan,call,60.2357,25.0058,1.0,comsan/XC554068.ogg,25.6,FI,EUROPE,comsan/XC554068.npy,train,"((tf.Tensor(8.780201e-06, shape=(), dtype=floa..."
492,barswa,song,53.9299,-2.9833,2.5,barswa/XC690496.ogg,15.098812,GB,EUROPE,barswa/XC690496.npy,train,"((tf.Tensor(4.4646572e-07, shape=(), dtype=flo..."
190,comsan,call,51.5579,17.509,4.0,comsan/XC492893.ogg,11.755,PL,EUROPE,comsan/XC492893.npy,train,"((tf.Tensor(-3.6929268e-06, shape=(), dtype=fl..."


In [None]:
# read the pickled validation DataFrame from Drive and preview its first rows
val_pkl_path = '/content/drive/MyDrive/project/train_val_csv_pkl/val_df_5_sec.pkl'
with open(val_pkl_path, 'rb') as pkl_file:
  val_df = pickle.load(pkl_file)

val_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
969,comsan,call,41.8074,-8.8626,4.0,comsan/XC670080.ogg,11.206,PT,EUROPE,comsan/XC670080.npy,val,"((tf.Tensor(9.328758e-06, shape=(), dtype=floa..."
851,comsan,blank,51.6578,19.3281,5.0,comsan/XC738993.ogg,11.616,PL,EUROPE,comsan/XC738993.npy,val,"((tf.Tensor(6.5908675e-06, shape=(), dtype=flo..."
998,barswa,song,48.2131,-3.0137,5.0,barswa/XC643586.ogg,49.536,FR,EUROPE,barswa/XC643586.npy,val,"((tf.Tensor(1.9546133e-06, shape=(), dtype=flo..."
771,comsan,call,56.0779,47.9129,5.0,comsan/XC371997.ogg,12.355937,RU,EUROPE,comsan/XC371997.npy,val,"((tf.Tensor(5.00679e-06, shape=(), dtype=float..."
988,eaywag1,call,52.2003,-6.4349,2.0,eaywag1/XC687527.ogg,15.464,IE,EUROPE,eaywag1/XC687527.npy,val,"((tf.Tensor(2.4605542e-06, shape=(), dtype=flo..."


# Extract features

In [None]:
# audio features to extract for every frame
features_list = [
  'mfcc',
  'chroma',
  'rms',
  'spectral_centroid',
  'melspectrogram',
]

In [None]:
%%time

# extract, normalize and average pool the listed features for both the train and val sets
features = Extraction(
  train_df,
  val_df,
  features=features_list,
  do_normalize=True,
  do_avgpool=True,
)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 11.7 mins'

'feature extraction started ---------->'

'features extraction took 6.2 mins'

'features processing started ---------->'



'mfcc processed'



'chroma processed'



'rms processed'



'spectral_centroid processed'



'melspectrogram processed'

'features processing took 0.1 mins'

CPU times: user 16min 31s, sys: 11min 42s, total: 28min 14s
Wall time: 18min 1s


In [None]:
# one label per extracted training frame
train_y = features.train_y

for preview in (train_y.shape, train_y[:5]):
  display(preview)

(8236,)

array(['eaywag1', 'eaywag1', 'eaywag1', 'eaywag1', 'eaywag1'], dtype='<U7')

In [None]:
train_features = features.train_features

# show the name, overall shape and first sample of every training feature
for name, values in train_features.items():
  display(name)
  display(values.shape)
  display(values[0])

'mfcc'

(8236, 20)

array([0.3391694 , 0.40924874, 0.5358972 , 0.5670248 , 0.48861542,
       0.49416122, 0.53483653, 0.46956772, 0.5273787 , 0.42815846,
       0.4858381 , 0.55727714, 0.48681408, 0.54893106, 0.5173583 ,
       0.47444746, 0.49698934, 0.54475796, 0.50564116, 0.46894422],
      dtype=float32)

'chroma'

(8236, 12)

array([0.3414871 , 0.35261095, 0.38855013, 0.2892622 , 0.27807856,
       0.28740257, 0.3437926 , 0.38803992, 0.43501127, 0.37608668,
       0.36091387, 0.37655085], dtype=float32)

'rms'

(8236, 1)

array([0.01620677], dtype=float32)

'spectral_centroid'

(8236, 1)

array([0.5986368], dtype=float32)

'melspectrogram'

(8236, 20)

array([0.26168185, 0.22435948, 0.23347075, 0.2257035 , 0.23461936,
       0.26539275, 0.28566006, 0.2849377 , 0.28847882, 0.3145214 ,
       0.3290944 , 0.3545447 , 0.34914643, 0.366206  , 0.400232  ,
       0.41124317, 0.41550747, 0.3776026 , 0.3694587 , 0.32380182],
      dtype=float32)

In [None]:
# one label per extracted validation frame (np.array makes a copy of the stored labels)
val_y = np.array(features.val_y)

for preview in (val_y.shape, val_y[:5]):
  display(preview)

(4236,)

array(['comsan', 'comsan', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [None]:
val_features = features.val_features

# show the name, overall shape and first sample of every validation feature
for name, values in val_features.items():
  display(name)
  display(values.shape)
  display(values[0])

'mfcc'

(4236, 20)

array([0.6117639 , 0.45735338, 0.32603553, 0.31278074, 0.41163567,
       0.33088478, 0.45899308, 0.3755749 , 0.48617023, 0.36413723,
       0.5081278 , 0.46959385, 0.49548495, 0.47380266, 0.35587388,
       0.46973756, 0.47127017, 0.4715686 , 0.45227003, 0.46532288],
      dtype=float32)

'chroma'

(4236, 12)

array([0.5587291 , 0.6693529 , 0.7382125 , 0.569045  , 0.37887034,
       0.30884057, 0.33714068, 0.38599586, 0.45166773, 0.47486994,
       0.48054758, 0.4789536 ], dtype=float32)

'rms'

(4236, 1)

array([0.01516566], dtype=float32)

'spectral_centroid'

(4236, 1)

array([0.4760819], dtype=float32)

'melspectrogram'

(4236, 20)

array([0.35488096, 0.3936562 , 0.4730693 , 0.54626846, 0.5947946 ,
       0.63286024, 0.669766  , 0.6793414 , 0.66020733, 0.66840255,
       0.65349996, 0.6252907 , 0.5924856 , 0.5647165 , 0.5370612 ,
       0.55427355, 0.57385606, 0.53045094, 0.49662367, 0.50499874],
      dtype=float32)

# Encode Classes

In [None]:
# fit the encoder on the training labels only, then apply the same mapping to both sets
label_encoder = LabelEncoder().fit(train_y)
train_y = label_encoder.transform(train_y)
val_y = label_encoder.transform(val_y)

# classes_ lists the original labels ordered by their encoded integer, so this works for
# any number of classes instead of assuming exactly three
classes = list(label_encoder.classes_)
classes

['barswa', 'comsan', 'eaywag1']

In [None]:
# show the number of encoded labels and the first ten of them, train first and then val
for encoded in (train_y, val_y):
  display(len(encoded))
  display(encoded[:10])

8236

array([2, 2, 2, 2, 2, 2, 0, 0, 0, 0])

4236

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0])

# Put the labels and features into one large dictionary

In [None]:
# store the encoded training labels next to the features under the 'label' key
train_features.update(label=train_y)
train_features

{'mfcc': array([[0.3391694 , 0.40924874, 0.5358972 , ..., 0.54475796, 0.50564116,
         0.46894422],
        [0.33568618, 0.41750646, 0.53681815, ..., 0.542808  , 0.5288323 ,
         0.45956683],
        [0.36065662, 0.41422567, 0.5833725 , ..., 0.52050847, 0.55034566,
         0.44331655],
        ...,
        [0.59729165, 0.4938564 , 0.38847885, ..., 0.4174044 , 0.4665348 ,
         0.39914683],
        [0.6012123 , 0.51702017, 0.3863889 , ..., 0.42538506, 0.46877104,
         0.41223076],
        [0.6082435 , 0.5434134 , 0.37211478, ..., 0.4219473 , 0.46813804,
         0.40352035]], dtype=float32),
 'chroma': array([[0.3414871 , 0.35261095, 0.38855013, ..., 0.37608668, 0.36091387,
         0.37655085],
        [0.39570332, 0.35266703, 0.3601897 , ..., 0.3406976 , 0.36317727,
         0.4071223 ],
        [0.44144076, 0.37290844, 0.3611199 , ..., 0.27120224, 0.36760825,
         0.40015313],
        ...,
        [0.5226286 , 0.5496563 , 0.5463212 , ..., 0.5537217 , 0.5842862 ,
 

In [None]:
# store the encoded validation labels next to the features under the 'label' key
val_features.update(label=val_y)
val_features

{'mfcc': array([[0.6117639 , 0.45735338, 0.32603553, ..., 0.4715686 , 0.45227003,
         0.46532288],
        [0.6099412 , 0.47829324, 0.30729547, ..., 0.4468575 , 0.45525885,
         0.4413513 ],
        [0.57958215, 0.4876538 , 0.28921005, ..., 0.42907423, 0.43618467,
         0.43834025],
        ...,
        [0.46997085, 0.6986984 , 0.6882918 , ..., 0.4631213 , 0.49106282,
         0.45474696],
        [0.47383025, 0.69971573, 0.68370336, ..., 0.4704221 , 0.4982638 ,
         0.46479496],
        [0.47414112, 0.7015769 , 0.6778016 , ..., 0.4751861 , 0.4956097 ,
         0.47461113]], dtype=float32),
 'chroma': array([[0.5587291 , 0.6693529 , 0.7382125 , ..., 0.47486994, 0.48054758,
         0.4789536 ],
        [0.59523904, 0.7246345 , 0.7710419 , ..., 0.5111967 , 0.5072777 ,
         0.5540884 ],
        [0.7001934 , 0.7302617 , 0.77242446, ..., 0.5601306 , 0.5455484 ,
         0.5827179 ],
        ...,
        [0.68066764, 0.53032273, 0.42924815, ..., 0.69034696, 0.6195136 ,
 

In [None]:
# group the train and val dictionaries under one top-level dictionary
merged_dict = dict(train=train_features, val=val_features)

merged_dict

{'train': {'mfcc': array([[0.3391694 , 0.40924874, 0.5358972 , ..., 0.54475796, 0.50564116,
          0.46894422],
         [0.33568618, 0.41750646, 0.53681815, ..., 0.542808  , 0.5288323 ,
          0.45956683],
         [0.36065662, 0.41422567, 0.5833725 , ..., 0.52050847, 0.55034566,
          0.44331655],
         ...,
         [0.59729165, 0.4938564 , 0.38847885, ..., 0.4174044 , 0.4665348 ,
          0.39914683],
         [0.6012123 , 0.51702017, 0.3863889 , ..., 0.42538506, 0.46877104,
          0.41223076],
         [0.6082435 , 0.5434134 , 0.37211478, ..., 0.4219473 , 0.46813804,
          0.40352035]], dtype=float32),
  'chroma': array([[0.3414871 , 0.35261095, 0.38855013, ..., 0.37608668, 0.36091387,
          0.37655085],
         [0.39570332, 0.35266703, 0.3601897 , ..., 0.3406976 , 0.36317727,
          0.4071223 ],
         [0.44144076, 0.37290844, 0.3611199 , ..., 0.27120224, 0.36760825,
          0.40015313],
         ...,
         [0.5226286 , 0.5496563 , 0.5463212 , 

# Save the merged dict with labels and features to pkl

In [None]:
# write the merged train/val dictionary to Drive as a pickle
output_pkl_path = '/content/drive/MyDrive/project/train_val_features_pkl/train_val_5_sec_audio_features_avgpooled.pkl'
with open(output_pkl_path, 'wb') as pkl_file:
  pickle.dump(merged_dict, pkl_file)

In [None]:
# read the pickle back from Drive; note that merged_df is a dictionary, not a DataFrame
saved_pkl_path = '/content/drive/MyDrive/project/train_val_features_pkl/train_val_5_sec_audio_features_avgpooled.pkl'
with open(saved_pkl_path, 'rb') as pkl_file:
  merged_df = pickle.load(pkl_file)

In [None]:
# confirm the reloaded pickle matches the in-memory dictionary: same keys, shapes and values
assert merged_dict.keys() == merged_df.keys()

for split_name, expected_split in merged_dict.items():
  reloaded_split = merged_df[split_name]
  assert expected_split.keys() == reloaded_split.keys()

  for feature_name, expected_array in expected_split.items():
    reloaded_array = reloaded_split[feature_name]
    assert expected_array.shape == reloaded_array.shape
    assert np.array_equal(expected_array, reloaded_array)