<a href="https://colab.research.google.com/github/rachlllg/Project_Bird-Song-Classifier-with-Machine-Learning/blob/main/3.model_prep/d.extract_features_labels/5_sec_augmented_audio_features_avgpooled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
# for drive access
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Mounted at /content/drive


# Self Defined Class Methods For Feature Extraction

In [2]:
class Extraction:

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=['mfcc'],
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames.

    The extract_features() method is automatically called to extract the labels and features from the given DataFrame.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
      val_df (pd.DataFrame): DataFrame containing validation data.
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      normalize (bool): Whether to normalize the features.
      maxpool (bool): Whether to maxpool the features.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."
    self.features = features

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    for each in self.features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accpeted features."

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # process the features by applying normalization or average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    return np.transpose(librosa.feature.rms(y=np.array(each)))

  def extract_spectral_centroid(self, each):
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Normalize the train and val features

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Normalized training feature
      val_X (npy): Normalized validation feature
    """
    # flatten the np arrays to 1D
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    val_X_reshape = val_X.reshape(-1, n_time * n_features)

    # use minmaxscaler to normalize the train and val features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    val_X_scaled = scaler.transform(val_X_reshape)

    # reshape the features to the original shape (n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    val_X = val_X_scaled.reshape(-1, n_time, n_features)

    return train_X, val_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Avgpooled training feature
      val_X (npy): Avgpooled validation feature
    """
    # instantiate the keras layers and model.
    # the model only performs average pooling of inputs
    tf.keras.backend.clear_session()
    input = tf.keras.layers.Input(shape=(n_time, n_features))
    global_avg_pooling = tf.keras.layers.GlobalAveragePooling1D()
    pooled = global_avg_pooling(input)
    pooling_model = tf.keras.models.Model(inputs=input, outputs=pooled)

    # use the instantiated model to avgpool the train and val features
    train_X = pooling_model.predict(train_X)
    val_X = pooling_model.predict(val_X)

    return train_X, val_X


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, val_features_dict):
    """
    Apply normalization or average pooling to train and val features

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      val_features_dict (dict): Dictionary containing the validation features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and validation features dictionary
    """
    start_time = time.time()
    display('features processing started ---------->')

    for each in train_features_dict.keys():
      # 'continent', 'rating', and 'type' do not need to be processed
      if each == 'continent' or each == 'rating' or each == 'type':
        pass
      # process numeric features
      else:
        if each == 'mfcc':
          n_features=self.n_mfccs
        elif each == 'chroma':
          n_features=self.n_chroma
        elif each == 'rms' or each == 'spectral_centroid':
          n_features=1
        elif each == 'melspectrogram':
          n_features=self.n_mels

        # index the dictionary to find the value of the feature based on the feature name (key)
        train_feature = train_features_dict[each]
        val_feature = val_features_dict[each]

        # call normalization or avgpooling function to apply normalization or average pooling to features
        if self.do_normalize and self.do_avgpool:
          train_features_dict[each], val_features_dict[each] = self.normalization(train_feature, val_feature, n_time=train_feature.shape[1], n_features=n_features)
          train_features_dict[each], val_features_dict[each] = self.avgpooling(train_features_dict[each], val_features_dict[each], n_time=train_feature.shape[1], n_features=n_features)
        elif self.do_normalize:
          train_features_dict[each], val_features_dict[each] = self.normalization(train_feature, val_feature, n_time=train_feature.shape[1], n_features=n_features)
        elif self.do_avgpool:
          train_features_dict[each], val_features_dict[each] = self.avgpooling(train_feature, val_feature, n_time=train_feature.shape[1], n_features=n_features)
        else:
          train_features_dict[each], val_features_dict[each] = train_features_dict[each], val_features_dict[each]

        display(f'{each} processed')

    end_time = time.time()
    display(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    display(f'feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        if 'continent' in self.features:
          features_dict['continent'].append(row['continent'])
        if 'rating' in self.features:
          features_dict['rating'].append(row['rating'])
        if 'type' in self.features:
          features_dict['type'].append(row['type'])

        # dynatically call the extract_x function to extract the listed features
        for feature in self.features:
          extract = f"extract_{feature}"
          if hasattr(self, extract) and callable(func := getattr(self, extract)):
            features_dict[feature].append(func(each))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    display(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and test framed audio data

In [3]:
with open('/content/drive/MyDrive/project/train_val_csv_pkl/train_df_5_sec_augmented.pkl', 'rb') as file:
  train_df = pickle.load(file)

train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
240,eaywag1,call,42.965,9.4512,5.0,eaywag1/XC471811.ogg,17.8155,FR,EUROPE,eaywag1/XC471811.npy,train,"((tf.Tensor(8.750593e-06, shape=(), dtype=floa..."
542,barswa,song,42.806,13.8335,4.0,barswa/XC371853.ogg,50.834313,IT,EUROPE,barswa/XC371853.npy,train,"((tf.Tensor(-0.00011792055242018368, shape=(),..."
214,comsan,call,60.2357,25.0058,1.0,comsan/XC554068.ogg,25.6,FI,EUROPE,comsan/XC554068.npy,train,"((tf.Tensor(6.721793e-06, shape=(), dtype=floa..."
492,barswa,song,53.9299,-2.9833,2.5,barswa/XC690496.ogg,15.098812,GB,EUROPE,barswa/XC690496.npy,train,"((tf.Tensor(4.4646572e-07, shape=(), dtype=flo..."
190,comsan,call,51.5579,17.509,4.0,comsan/XC492893.ogg,11.755,PL,EUROPE,comsan/XC492893.npy,train,"((tf.Tensor(-0.005616761, shape=(), dtype=floa..."


In [4]:
with open('/content/drive/MyDrive/project/train_val_csv_pkl/val_df_5_sec.pkl', 'rb') as file:
  val_df = pickle.load(file)

val_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
969,comsan,call,41.8074,-8.8626,4.0,comsan/XC670080.ogg,11.206,PT,EUROPE,comsan/XC670080.npy,val,"((tf.Tensor(9.328758e-06, shape=(), dtype=floa..."
851,comsan,blank,51.6578,19.3281,5.0,comsan/XC738993.ogg,11.616,PL,EUROPE,comsan/XC738993.npy,val,"((tf.Tensor(6.5908675e-06, shape=(), dtype=flo..."
998,barswa,song,48.2131,-3.0137,5.0,barswa/XC643586.ogg,49.536,FR,EUROPE,barswa/XC643586.npy,val,"((tf.Tensor(1.9546133e-06, shape=(), dtype=flo..."
771,comsan,call,56.0779,47.9129,5.0,comsan/XC371997.ogg,12.355937,RU,EUROPE,comsan/XC371997.npy,val,"((tf.Tensor(5.00679e-06, shape=(), dtype=float..."
988,eaywag1,call,52.2003,-6.4349,2.0,eaywag1/XC687527.ogg,15.464,IE,EUROPE,eaywag1/XC687527.npy,val,"((tf.Tensor(2.4605542e-06, shape=(), dtype=flo..."


# Extract features

In [5]:
features_list = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram']

In [6]:
%%time

features = Extraction(train_df,
                      val_df,
                      features=features_list,
                      do_normalize=True,
                      do_avgpool=True)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 15.3 mins'

'feature extraction started ---------->'

'features extraction took 6.1 mins'

'features processing started ---------->'



'mfcc processed'



'chroma processed'



'rms processed'



'spectral_centroid processed'



'melspectrogram processed'

'features processing took 0.2 mins'

CPU times: user 20min 28s, sys: 14min 11s, total: 34min 40s
Wall time: 21min 37s


In [7]:
train_y = features.train_y

display(train_y.shape)
display(train_y[:5])

(10261,)

array(['eaywag1', 'eaywag1', 'eaywag1', 'eaywag1', 'eaywag1'], dtype='<U7')

In [8]:
train_features = features.train_features

for key in train_features.keys():
  display(key)
  display(train_features[key].shape)
  display(train_features[key][0])

'mfcc'

(10261, 20)

array([0.3182792 , 0.42068544, 0.54547703, 0.5918969 , 0.51184756,
       0.5023576 , 0.5279983 , 0.47080007, 0.51137924, 0.4292506 ,
       0.47610453, 0.5554994 , 0.46370822, 0.58553004, 0.4942194 ,
       0.50509644, 0.4921472 , 0.5677892 , 0.48849332, 0.4660263 ],
      dtype=float32)

'chroma'

(10261, 12)

array([0.3773521 , 0.34899873, 0.39250457, 0.34685364, 0.27569845,
       0.2959079 , 0.3255963 , 0.3667992 , 0.39577588, 0.3856901 ,
       0.34652126, 0.38717416], dtype=float32)

'rms'

(10261, 1)

array([0.01533054], dtype=float32)

'spectral_centroid'

(10261, 1)

array([0.61564493], dtype=float32)

'melspectrogram'

(10261, 20)

array([0.23523037, 0.22539774, 0.23439766, 0.2273884 , 0.23939332,
       0.2687144 , 0.2679984 , 0.28686044, 0.286103  , 0.30369306,
       0.32076204, 0.33419812, 0.35532755, 0.35558248, 0.39116687,
       0.40989885, 0.41736504, 0.3892507 , 0.3782649 , 0.33180547],
      dtype=float32)

In [9]:
val_y = features.val_y
val_y = np.array(val_y)

display(val_y.shape)
display(val_y[:5])

(4236,)

array(['comsan', 'comsan', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [10]:
val_features = features.val_features

for key in val_features.keys():
  display(key)
  display(val_features[key].shape)
  display(val_features[key][0])

'mfcc'

(4236, 20)

array([0.6087543 , 0.48751956, 0.32761732, 0.33255047, 0.440848  ,
       0.32937652, 0.45838502, 0.37031353, 0.48405024, 0.36023545,
       0.49369672, 0.4713621 , 0.48795366, 0.50057304, 0.34260538,
       0.49101645, 0.4461726 , 0.4923069 , 0.42989695, 0.45091388],
      dtype=float32)

'chroma'

(4236, 12)

array([0.55872846, 0.6693503 , 0.73820925, 0.56903225, 0.37883523,
       0.30883962, 0.3371404 , 0.38599554, 0.45166692, 0.47486886,
       0.48054674, 0.4789526 ], dtype=float32)

'rms'

(4236, 1)

array([0.01604724], dtype=float32)

'spectral_centroid'

(4236, 1)

array([0.48146713], dtype=float32)

'melspectrogram'

(4236, 20)

array([0.35998622, 0.4138753 , 0.49311396, 0.56462425, 0.6112864 ,
       0.648454  , 0.6449974 , 0.69178176, 0.6657548 , 0.6739032 ,
       0.65808237, 0.6094307 , 0.611345  , 0.57557636, 0.54362684,
       0.56370366, 0.5845648 , 0.5442493 , 0.5083579 , 0.5160122 ],
      dtype=float32)

# Encode Classes

In [11]:
label_encoder = LabelEncoder().fit(train_y)
train_y = label_encoder.transform(train_y)
val_y = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0,1,2]))
classes

['barswa', 'comsan', 'eaywag1']

In [12]:
display(len(train_y))
display(train_y[:10])

display(len(val_y))
display(val_y[:10])

10261

array([2, 2, 2, 2, 2, 2, 0, 0, 0, 0])

4236

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0])

# Put the labels and features to one large dataframe

In [13]:
train_features['label'] = train_y
train_features

{'mfcc': array([[0.3182792 , 0.42068544, 0.54547703, ..., 0.5677892 , 0.48849332,
         0.4660263 ],
        [0.30763105, 0.42694822, 0.54081   , ..., 0.5679904 , 0.51369065,
         0.46872035],
        [0.32936803, 0.41591012, 0.5914174 , ..., 0.5493734 , 0.52185374,
         0.45490903],
        ...,
        [0.5944429 , 0.528247  , 0.39337492, ..., 0.43771422, 0.44471854,
         0.3832799 ],
        [0.5997119 , 0.549461  , 0.3893529 , ..., 0.44500926, 0.45456818,
         0.39065084],
        [0.6032661 , 0.5725346 , 0.36859348, ..., 0.43966922, 0.44821528,
         0.38659805]], dtype=float32),
 'chroma': array([[0.3773521 , 0.34899873, 0.39250457, ..., 0.3856901 , 0.34652126,
         0.38717416],
        [0.41277367, 0.3876313 , 0.36973938, ..., 0.331545  , 0.3389837 ,
         0.38958785],
        [0.4120148 , 0.34952646, 0.36930162, ..., 0.2567423 , 0.36262432,
         0.3680476 ],
        ...,
        [0.5223055 , 0.54672045, 0.5459075 , ..., 0.5536309 , 0.5785832 ,
 

In [14]:
val_features['label'] = val_y
val_features

{'mfcc': array([[0.6087543 , 0.48751956, 0.32761732, ..., 0.4923069 , 0.42989695,
         0.45091388],
        [0.6066605 , 0.50880086, 0.308606  , ..., 0.46752214, 0.433274  ,
         0.42629564],
        [0.5773918 , 0.51840866, 0.29060236, ..., 0.4498327 , 0.41214767,
         0.42276117],
        ...,
        [0.47111034, 0.73432565, 0.6919102 , ..., 0.48349854, 0.47198492,
         0.44021854],
        [0.4747861 , 0.73555815, 0.68741053, ..., 0.49052885, 0.48083943,
         0.45026383],
        [0.47517386, 0.7375982 , 0.6815216 , ..., 0.49597192, 0.4783622 ,
         0.4603559 ]], dtype=float32),
 'chroma': array([[0.55872846, 0.6693503 , 0.73820925, ..., 0.47486886, 0.48054674,
         0.4789526 ],
        [0.59523857, 0.7246323 , 0.77103895, ..., 0.51119524, 0.5072764 ,
         0.5540872 ],
        [0.7001928 , 0.7302599 , 0.7724216 , ..., 0.56012976, 0.54554707,
         0.58271676],
        ...,
        [0.6806674 , 0.53031987, 0.42924216, ..., 0.6903464 , 0.61951315,
 

In [15]:
merged_dict = {'train': train_features, 'val': val_features}

merged_dict

{'train': {'mfcc': array([[0.3182792 , 0.42068544, 0.54547703, ..., 0.5677892 , 0.48849332,
          0.4660263 ],
         [0.30763105, 0.42694822, 0.54081   , ..., 0.5679904 , 0.51369065,
          0.46872035],
         [0.32936803, 0.41591012, 0.5914174 , ..., 0.5493734 , 0.52185374,
          0.45490903],
         ...,
         [0.5944429 , 0.528247  , 0.39337492, ..., 0.43771422, 0.44471854,
          0.3832799 ],
         [0.5997119 , 0.549461  , 0.3893529 , ..., 0.44500926, 0.45456818,
          0.39065084],
         [0.6032661 , 0.5725346 , 0.36859348, ..., 0.43966922, 0.44821528,
          0.38659805]], dtype=float32),
  'chroma': array([[0.3773521 , 0.34899873, 0.39250457, ..., 0.3856901 , 0.34652126,
          0.38717416],
         [0.41277367, 0.3876313 , 0.36973938, ..., 0.331545  , 0.3389837 ,
          0.38958785],
         [0.4120148 , 0.34952646, 0.36930162, ..., 0.2567423 , 0.36262432,
          0.3680476 ],
         ...,
         [0.5223055 , 0.54672045, 0.5459075 , 

# save the merged dict with labels and features to pkl

In [16]:
with open('/content/drive/MyDrive/project/train_val_features_pkl/train_val_5_sec_augmented_audio_features_avgpooled.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

In [17]:
with open('/content/drive/MyDrive/project/train_val_features_pkl/train_val_5_sec_augmented_audio_features_avgpooled.pkl', 'rb') as file:
  merged_df = pickle.load(file)

In [18]:
assert merged_dict.keys() == merged_df.keys()

for top_level_key in merged_dict.keys():
  assert merged_dict[top_level_key].keys() == merged_df[top_level_key].keys()

  for bottom_level_key in merged_dict[top_level_key].keys():
    assert merged_dict[top_level_key][bottom_level_key].shape == merged_df[top_level_key][bottom_level_key].shape
    assert np.array_equal(merged_dict[top_level_key][bottom_level_key], merged_df[top_level_key][bottom_level_key])