<a href="https://colab.research.google.com/github/rachlllg/Project_Bird-Song-Classifier-with-Machine-Learning/blob/main/3.model_prep/d.extract_features_labels/8_sec_audio_features_avgpooled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
# for drive access: mount Google Drive at /content/drive so the project's pickle files can be read and written
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Mounted at /content/drive


# Self Defined Class Methods For Feature Extraction

In [2]:
class Extraction:
  """
  Extract labels and features from framed train and val DataFrames.

  Instantiating the class runs the whole pipeline: labels and features are extracted
  for every frame of every row, then the numeric (audio) features are optionally
  min-max normalized and/or average pooled. The results are exposed as
  train_y, train_features, val_y and val_features.
  """

  # features copied straight from DataFrame columns; they are never normalized or pooled
  _METADATA_FEATURES = ('continent', 'rating', 'type')

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=None,
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames.

    The extract_features() method is automatically called for both DataFrames, followed by
    process_features() to normalize and/or average pool the numeric features.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
      val_df (pd.DataFrame): DataFrame containing validation data.
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract. Defaults to ['mfcc'] when None.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      do_normalize (bool): Whether to min-max normalize the numeric features.
      do_avgpool (bool): Whether to average pool the numeric features over the time axis.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # None is the sentinel for "use the default" so that no list object is shared between calls
    if features is None:
      features = ['mfcc']

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    for each in features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accpeted features."

    # keep a private, order-preserving, de-duplicated copy: a repeated name would otherwise
    # append every frame twice to the same list and desynchronize features from labels
    self.features = list(dict.fromkeys(features))

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # process the features by applying normalization or average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)


  ########################################
  # progress messages go through IPython's display() when it exists (notebook) and print() otherwise
  ########################################
  @staticmethod
  def _show(message):
    """Show a progress message with display() when available, falling back to print()."""
    try:
      show = display
    except NameError:
      show = print
    show(message)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    """Return the MFCCs of one audio frame, transposed to (n_time, n_mfccs)."""
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    """Return the chromagram of one audio frame, transposed to (n_time, n_chroma)."""
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    """Return the RMS energy of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.rms(y=np.array(each)))

  def extract_spectral_centroid(self, each):
    """Return the spectral centroid of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    """Return the mel spectrogram of one audio frame in dB, transposed to (n_time, n_mels)."""
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Normalize the train and val features

    The scaler is fit on the training data only and then applied to both splits,
    so validation values may fall outside [0, 1].

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Number of time steps per sample
      n_features (int): Number of feature values per time step

    Returns:
      train_X (npy): Normalized training feature
      val_X (npy): Normalized validation feature
    """
    # flatten each sample to one row: (n_samples, n_time * n_features), as MinMaxScaler expects 2D input
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    val_X_reshape = val_X.reshape(-1, n_time * n_features)

    # use minmaxscaler to normalize the train and val features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    val_X_scaled = scaler.transform(val_X_reshape)

    # reshape the features to the original shape (n_samples, n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    val_X = val_X_scaled.reshape(-1, n_time, n_features)

    return train_X, val_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Number of time steps per sample
      n_features (int): Number of feature values per time step

    Returns:
      train_X (npy): Avgpooled training feature
      val_X (npy): Avgpooled validation feature
    """
    # instantiate the keras layers and model.
    # the model only performs average pooling of inputs
    tf.keras.backend.clear_session()
    inputs = tf.keras.layers.Input(shape=(n_time, n_features))
    pooled = tf.keras.layers.GlobalAveragePooling1D()(inputs)
    pooling_model = tf.keras.models.Model(inputs=inputs, outputs=pooled)

    # use the instantiated model to avgpool the train and val features
    train_X = pooling_model.predict(train_X)
    val_X = pooling_model.predict(val_X)

    return train_X, val_X


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, val_features_dict):
    """
    Apply normalization or average pooling to train and val features

    The dictionaries are updated in place and also returned.
    When both options are enabled, normalization runs before average pooling.

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      val_features_dict (dict): Dictionary containing the validation features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and validation features dictionary

    Raises:
      ValueError: If a key is neither a metadata feature nor a known numeric feature.
    """
    start_time = time.time()
    self._show('features processing started ---------->')

    # number of values per time step for every numeric feature
    feature_widths = {
      'mfcc': self.n_mfccs,
      'chroma': self.n_chroma,
      'rms': 1,
      'spectral_centroid': 1,
      'melspectrogram': self.n_mels,
    }

    for each in train_features_dict:
      # 'continent', 'rating', and 'type' do not need to be processed
      if each in self._METADATA_FEATURES:
        continue

      # fail loudly instead of silently reusing the width of the previous feature
      if each not in feature_widths:
        raise ValueError(f"cannot process unknown feature '{each}'")
      n_features = feature_widths[each]

      # index the dictionary to find the value of the feature based on the feature name (key)
      train_feature = train_features_dict[each]
      val_feature = val_features_dict[each]
      n_time = train_feature.shape[1]

      # call normalization and/or avgpooling function, in that order
      if self.do_normalize:
        train_feature, val_feature = self.normalization(train_feature, val_feature, n_time=n_time, n_features=n_features)
      if self.do_avgpool:
        train_feature, val_feature = self.avgpooling(train_feature, val_feature, n_time=n_time, n_features=n_features)

      train_features_dict[each] = train_feature
      val_features_dict[each] = val_feature

      self._show(f'{each} processed')

    end_time = time.time()
    self._show(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Every frame in a row's 'framed' column yields one label (the row's 'primary_label')
    and one entry per requested feature.

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    self._show(f'feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # resolve once, outside the per-frame loop, which features are plain columns
    # and which have a matching extract_x method
    metadata_names = [name for name in self.features if name in self._METADATA_FEATURES]
    extractors = {}
    for name in self.features:
      func = getattr(self, f"extract_{name}", None)
      if callable(func):
        extractors[name] = func

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']

      for frame in row['framed']:
        y.append(label)

        # metadata is repeated for every frame of the recording
        for name in metadata_names:
          features_dict[name].append(row[name])

        # dynamically call the extract_x function to extract the listed features
        for name, func in extractors.items():
          features_dict[name].append(func(frame))

    # cast lists to np arrays
    for each in features_dict:
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    self._show(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and val framed audio data

In [3]:
# Load the pickled training DataFrame (it carries the framed audio in its 'framed' column).
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you trust.
train_pkl_path = '/content/drive/MyDrive/project/train_val_csv_pkl/train_df_8_sec.pkl'
with open(train_pkl_path, mode='rb') as pkl_file:
  train_df = pickle.load(pkl_file)

# preview the first rows
train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
668,barswa,blank,-33.2377,26.9574,5.0,barswa/XC713037.ogg,58.331,ZA,AFRICA,barswa/XC713037.npy,train,"((tf.Tensor(-1.3069126e-05, shape=(), dtype=fl..."
247,eaywag1,call,54.1191,13.376,3.5,eaywag1/XC658750.ogg,49.68,DE,EUROPE,eaywag1/XC658750.npy,train,"((tf.Tensor(-1.1132361e-06, shape=(), dtype=fl..."
620,barswa,song,35.0573,34.0009,5.0,barswa/XC405617.ogg,114.207375,CY,ASIA,barswa/XC405617.npy,train,"((tf.Tensor(3.5154517e-06, shape=(), dtype=flo..."
86,comsan,call,43.5352,-1.4475,4.5,comsan/XC580687.ogg,59.82,FR,EUROPE,comsan/XC580687.npy,train,"((tf.Tensor(-7.335485e-06, shape=(), dtype=flo..."
450,eaywag1,song,43.5118,3.8367,3.0,eaywag1/XC567939.ogg,10.031,FR,EUROPE,eaywag1/XC567939.npy,train,"((tf.Tensor(7.8262474e-07, shape=(), dtype=flo..."


In [4]:
# Load the pickled validation DataFrame (it carries the framed audio in its 'framed' column).
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you trust.
val_pkl_path = '/content/drive/MyDrive/project/train_val_csv_pkl/val_df_8_sec.pkl'
with open(val_pkl_path, mode='rb') as pkl_file:
  val_df = pickle.load(pkl_file)

# preview the first rows
val_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
991,barswa,song,39.4709,-8.0205,3.5,barswa/XC624409.ogg,10.008,PT,EUROPE,barswa/XC624409.npy,val,"((tf.Tensor(3.148432e-08, shape=(), dtype=floa..."
760,barswa,call,51.4522,-9.8189,5.0,barswa/XC666943.ogg,11.885,IE,EUROPE,barswa/XC666943.npy,val,"((tf.Tensor(-8.617062e-07, shape=(), dtype=flo..."
876,comsan,call,50.7347,3.2143,4.0,comsan/XC578171.ogg,21.707,BE,EUROPE,comsan/XC578171.npy,val,"((tf.Tensor(-3.9801816e-06, shape=(), dtype=fl..."
927,eaywag1,song,46.0192,61.846,5.0,eaywag1/XC184434.ogg,63.921625,KZ,ASIA,eaywag1/XC184434.npy,val,"((tf.Tensor(-4.7415142e-07, shape=(), dtype=fl..."
880,comsan,call,42.9037,13.9077,4.0,comsan/XC433334.ogg,24.006,IT,EUROPE,comsan/XC433334.npy,val,"((tf.Tensor(-1.1874385e-05, shape=(), dtype=fl..."


# Extract features

In [5]:
# audio features to extract for every frame (no metadata features are requested here)
features_list = [
  'mfcc',
  'chroma',
  'rms',
  'spectral_centroid',
  'melspectrogram',
]

In [6]:
%%time

# run extraction and processing for both splits in one go;
# the numeric features are min-max normalized and then average pooled
features = Extraction(
  train_df=train_df,
  val_df=val_df,
  features=features_list,
  do_normalize=True,
  do_avgpool=True,
)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 8.6 mins'

'feature extraction started ---------->'

'features extraction took 4.4 mins'

'features processing started ---------->'



'mfcc processed'



'chroma processed'



'rms processed'



'spectral_centroid processed'



'melspectrogram processed'

'features processing took 0.1 mins'

CPU times: user 12min 29s, sys: 8min 39s, total: 21min 9s
Wall time: 13min 8s


In [7]:
# training labels: one entry per extracted frame
train_y = features.train_y

# sanity check: shape first, then the first five labels
for preview in (train_y.shape, train_y[:5]):
  display(preview)

(4763,)

array(['barswa', 'barswa', 'barswa', 'barswa', 'barswa'], dtype='<U7')

In [8]:
# processed training features, keyed by feature name
train_features = features.train_features

# for every feature show its name, array shape and the first sample
for key, values in train_features.items():
  display(key)
  display(values.shape)
  display(values[0])

'mfcc'

(4763, 20)

array([0.54529715, 0.4793393 , 0.5349132 , 0.6477655 , 0.33921012,
       0.50720143, 0.5774945 , 0.33674356, 0.5244012 , 0.4527355 ,
       0.46150163, 0.5167904 , 0.42641604, 0.5308325 , 0.44796544,
       0.41205934, 0.47362772, 0.4413777 , 0.48703906, 0.4391229 ],
      dtype=float32)

'chroma'

(4763, 12)

array([0.33323073, 0.30428657, 0.2593368 , 0.21065742, 0.2063782 ,
       0.23007749, 0.28214905, 0.34974068, 0.4035503 , 0.5534823 ,
       0.5671482 , 0.39478505], dtype=float32)

'rms'

(4763, 1)

array([0.04363674], dtype=float32)

'spectral_centroid'

(4763, 1)

array([0.4894502], dtype=float32)

'melspectrogram'

(4763, 20)

array([0.4942571 , 0.46630353, 0.46723923, 0.484303  , 0.47135404,
       0.48354036, 0.4830873 , 0.4776831 , 0.47777864, 0.47133243,
       0.46449012, 0.5017648 , 0.5535639 , 0.60845214, 0.64287436,
       0.62849855, 0.5571691 , 0.47733763, 0.43253782, 0.3855449 ],
      dtype=float32)

In [9]:
# validation labels: one entry per extracted frame.
# extract_features() already returns a numpy array, so no extra np.array() copy is needed
# (this mirrors how train_y is read).
val_y = features.val_y

# sanity check: shape and the first five labels
display(val_y.shape)
display(val_y[:5])

(2430,)

array(['barswa', 'barswa', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [10]:
# processed validation features, keyed by feature name
val_features = features.val_features

# for every feature show its name, array shape and the first sample
for key, values in val_features.items():
  display(key)
  display(values.shape)
  display(values[0])

'mfcc'

(2430, 20)

array([0.53712696, 0.4974271 , 0.42230383, 0.42471886, 0.4594302 ,
       0.33270204, 0.45457438, 0.38093314, 0.3756868 , 0.3556811 ,
       0.40496013, 0.4339821 , 0.34050068, 0.38577297, 0.3604864 ,
       0.36709645, 0.39496648, 0.42753544, 0.43672395, 0.4028031 ],
      dtype=float32)

'chroma'

(2430, 12)

array([0.6062603 , 0.6611424 , 0.72486585, 0.6485676 , 0.5837077 ,
       0.51898646, 0.49712995, 0.510781  , 0.5144139 , 0.5058629 ,
       0.5512895 , 0.6173802 ], dtype=float32)

'rms'

(2430, 1)

array([0.0056784], dtype=float32)

'spectral_centroid'

(2430, 1)

array([0.49066466], dtype=float32)

'melspectrogram'

(2430, 20)

array([0.4302389 , 0.44570604, 0.4562991 , 0.48507658, 0.5083174 ,
       0.5265989 , 0.55344784, 0.5576655 , 0.5555785 , 0.5426227 ,
       0.54502976, 0.5531634 , 0.5272248 , 0.49583733, 0.4798763 ,
       0.4707806 , 0.46853146, 0.4510343 , 0.44375154, 0.41643912],
      dtype=float32)

# Encode Classes

In [11]:
# fit the encoder on the training labels only, then apply the same mapping to both splits
label_encoder = LabelEncoder().fit(train_y)
train_y = label_encoder.transform(train_y)
val_y = label_encoder.transform(val_y)

# class names in encoded order (index i is the name behind encoded label i);
# read from the fitted encoder so the cell works for any number of classes
classes = list(label_encoder.classes_)
classes

['barswa', 'comsan', 'eaywag1']

In [12]:
# sanity check the encoded labels: count and first ten values, train first and then val
for encoded in (train_y, val_y):
  display(len(encoded))
  display(encoded[:10])

4763

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

2430

array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

# Put the labels and features into one large dictionary

In [13]:
# store the encoded training labels next to the features under the 'label' key
train_features.update(label=train_y)
train_features

{'mfcc': array([[0.54529715, 0.4793393 , 0.5349132 , ..., 0.4413777 , 0.48703906,
         0.4391229 ],
        [0.5793472 , 0.4298672 , 0.53157544, ..., 0.44011986, 0.4861899 ,
         0.45821744],
        [0.58690274, 0.4212866 , 0.5418751 , ..., 0.44207054, 0.48264065,
         0.45636433],
        ...,
        [0.64385235, 0.6829456 , 0.53566563, ..., 0.5652174 , 0.54604053,
         0.5220579 ],
        [0.63340336, 0.6876703 , 0.5409084 , ..., 0.54519475, 0.53867674,
         0.49238336],
        [0.62090445, 0.68252176, 0.5312934 , ..., 0.5375894 , 0.5312795 ,
         0.48669696]], dtype=float32),
 'chroma': array([[0.33323073, 0.30428657, 0.2593368 , ..., 0.5534823 , 0.5671482 ,
         0.39478505],
        [0.3348951 , 0.2847971 , 0.28591642, ..., 0.48760402, 0.5730938 ,
         0.4367055 ],
        [0.30589613, 0.32092458, 0.35909373, ..., 0.51112914, 0.47443873,
         0.3541228 ],
        ...,
        [0.63341504, 0.69855356, 0.7788    , ..., 0.61769176, 0.59956795,
 

In [14]:
# store the encoded validation labels next to the features under the 'label' key
val_features.update(label=val_y)
val_features

{'mfcc': array([[0.53712696, 0.4974271 , 0.42230383, ..., 0.42753544, 0.43672395,
         0.4028031 ],
        [0.6763556 , 0.7499724 , 0.5798445 , ..., 0.59830695, 0.5712111 ,
         0.5139304 ],
        [0.6544008 , 0.7117479 , 0.63287103, ..., 0.5231731 , 0.53924084,
         0.48692483],
        ...,
        [0.6871535 , 0.6927643 , 0.5035141 , ..., 0.54538333, 0.51867616,
         0.5010022 ],
        [0.5348202 , 0.71103656, 0.482931  , ..., 0.48366803, 0.47977835,
         0.43783596],
        [0.5662565 , 0.8127476 , 0.67853034, ..., 0.50593054, 0.5672664 ,
         0.4755229 ]], dtype=float32),
 'chroma': array([[0.6062603 , 0.6611424 , 0.72486585, ..., 0.5058629 , 0.5512895 ,
         0.6173802 ],
        [0.7335697 , 0.64398766, 0.62081   , ..., 0.7005903 , 0.7300671 ,
         0.7513304 ],
        [0.37384117, 0.40621468, 0.5603675 , ..., 0.5704591 , 0.55914676,
         0.45755625],
        ...,
        [0.7064273 , 0.7694555 , 0.73633957, ..., 0.6428705 , 0.65043974,
 

In [15]:
# bundle both splits into a single two-level dict: split name -> feature/label name -> array
merged_dict = dict(train=train_features, val=val_features)

merged_dict

{'train': {'mfcc': array([[0.54529715, 0.4793393 , 0.5349132 , ..., 0.4413777 , 0.48703906,
          0.4391229 ],
         [0.5793472 , 0.4298672 , 0.53157544, ..., 0.44011986, 0.4861899 ,
          0.45821744],
         [0.58690274, 0.4212866 , 0.5418751 , ..., 0.44207054, 0.48264065,
          0.45636433],
         ...,
         [0.64385235, 0.6829456 , 0.53566563, ..., 0.5652174 , 0.54604053,
          0.5220579 ],
         [0.63340336, 0.6876703 , 0.5409084 , ..., 0.54519475, 0.53867674,
          0.49238336],
         [0.62090445, 0.68252176, 0.5312934 , ..., 0.5375894 , 0.5312795 ,
          0.48669696]], dtype=float32),
  'chroma': array([[0.33323073, 0.30428657, 0.2593368 , ..., 0.5534823 , 0.5671482 ,
          0.39478505],
         [0.3348951 , 0.2847971 , 0.28591642, ..., 0.48760402, 0.5730938 ,
          0.4367055 ],
         [0.30589613, 0.32092458, 0.35909373, ..., 0.51112914, 0.47443873,
          0.3541228 ],
         ...,
         [0.63341504, 0.69855356, 0.7788    , 

# Save the merged dict with labels and features to pkl and verify the round trip

In [16]:
# write the merged train/val dict to Drive as a pickle file
features_pkl_path = '/content/drive/MyDrive/project/train_val_features_pkl/train_val_8_sec_audio_features_avgpooled.pkl'
with open(features_pkl_path, mode='wb') as pkl_file:
  pickle.dump(merged_dict, pkl_file)

In [17]:
# read the pickle back from Drive; despite its name, merged_df holds the reloaded dict
features_pkl_path = '/content/drive/MyDrive/project/train_val_features_pkl/train_val_8_sec_audio_features_avgpooled.pkl'
with open(features_pkl_path, mode='rb') as pkl_file:
  merged_df = pickle.load(pkl_file)

In [18]:
# verify the pickle round trip: same splits, same entries per split, identical arrays
assert merged_dict.keys() == merged_df.keys()

for split_name, saved in merged_dict.items():
  reloaded = merged_df[split_name]
  assert saved.keys() == reloaded.keys()

  for entry_name, saved_array in saved.items():
    reloaded_array = reloaded[entry_name]
    assert saved_array.shape == reloaded_array.shape
    assert np.array_equal(saved_array, reloaded_array)