# Import Libraries

In [1]:
# for drive access
# mounts Google Drive at /content/drive; the pickle files read and written
# later in this notebook all live under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Mounted at /content/drive


# Self-Defined Class for Feature Extraction

In [2]:
class Extraction:
  """
  Extract labels and features from framed-audio DataFrames and post-process them.

  On instantiation the class
    1. validates the requested feature names,
    2. calls extract_features() on the train and val DataFrames, and
    3. calls process_features() to optionally normalize and/or average pool the numeric features.

  The results are exposed as the attributes train_y, train_features, val_y and val_features.
  """

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=None,
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames.

    The extract_features() method is automatically called to extract the labels and features from the given DataFrame.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
        Must provide the 'primary_label' and 'framed' columns, plus any metadata column requested in features.
      val_df (pd.DataFrame): DataFrame containing validation data (same columns as train_df).
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract. Defaults to ['mfcc'] when omitted.
        A single feature name given as a string is treated as a one-element list.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      do_normalize (bool): Whether to min-max normalize the numeric features.
      do_avgpool (bool): Whether to average pool the numeric features over the time axis.

    Raises:
      AssertionError: If no feature is given or a feature name is not accepted.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # None stands in for the default so no mutable list is shared between instances
    if features is None:
      features = ['mfcc']
    # a bare string would otherwise be iterated character by character
    elif isinstance(features, str):
      features = [features]

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."
    # keep a private copy so later changes to the caller's list cannot affect this instance
    self.features = list(features)

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    for each in self.features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accpeted features."

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # process the features by applying normalization or average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    """Return the MFCCs of one audio frame, transposed to (n_time, n_mfccs)."""
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    """Return the STFT chromagram of one audio frame, transposed to (n_time, n_chroma)."""
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    """Return the RMS energy of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.rms(y=np.array(each)))

  def extract_spectral_centroid(self, each):
    """Return the spectral centroid of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    """Return the dB-scaled mel spectrogram of one audio frame, transposed to (n_time, n_mels)."""
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Normalize the train and val features

    The scaler is fitted on the training data only and then applied to both sets,
    so validation values can fall outside the (0, 1) range.

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Normalized training feature
      val_X (npy): Normalized validation feature
    """
    # flatten each sample to one row of n_time * n_features columns
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    val_X_reshape = val_X.reshape(-1, n_time * n_features)

    # use minmaxscaler to normalize the train and val features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    val_X_scaled = scaler.transform(val_X_reshape)

    # reshape the features to the original shape (n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    val_X = val_X_scaled.reshape(-1, n_time, n_features)

    return train_X, val_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features

    The mean is taken over the time axis with numpy. This gives the same result as a
    GlobalAveragePooling1D layer without building a model and without clearing the
    global Keras session, which could invalidate models created elsewhere.

    Parameters:
      train_X (npy): Training feature of shape (n_samples, n_time, n_features)
      val_X (npy): Validation feature of shape (n_samples, n_time, n_features)
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Avgpooled training feature of shape (n_samples, n_features), dtype float32
      val_X (npy): Avgpooled validation feature of shape (n_samples, n_features), dtype float32

    Raises:
      ValueError: If an input does not have shape (n_samples, n_time, n_features).
    """
    expected_shape = (n_time, n_features)
    pooled = []

    for name, X in (('train_X', train_X), ('val_X', val_X)):
      X = np.asarray(X)
      # fail loudly on a shape mismatch instead of silently averaging over the wrong axis
      if X.shape[1:] != expected_shape:
        raise ValueError(f"{name} has shape {X.shape}, expected (n_samples, {n_time}, {n_features})")
      # float32 output keeps the dtype the features had when they were pooled through Keras
      pooled.append(X.mean(axis=1).astype(np.float32))

    return pooled[0], pooled[1]


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, val_features_dict):
    """
    Apply normalization or average pooling to train and val features

    The dictionaries are updated in place and also returned. Keys that are not
    numeric audio features (e.g. 'continent', 'rating', 'type') are left untouched.

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      val_features_dict (dict): Dictionary containing the validation features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and validation features dictionary
    """
    start_time = time.time()
    display('features processing started ---------->')

    # size of the feature axis for every numeric feature; anything else is not processed
    n_features_lookup = {
        'mfcc': self.n_mfccs,
        'chroma': self.n_chroma,
        'rms': 1,
        'spectral_centroid': 1,
        'melspectrogram': self.n_mels,
    }

    # iterate over a snapshot of the keys because the dict values are replaced inside the loop
    for each in list(train_features_dict.keys()):
      # 'continent', 'rating', 'type' and any other non-numeric key do not need to be processed
      if each not in n_features_lookup:
        continue

      n_features = n_features_lookup[each]

      # index the dictionary to find the value of the feature based on the feature name (key)
      train_feature = train_features_dict[each]
      val_feature = val_features_dict[each]
      n_time = train_feature.shape[1]

      # normalization (if requested) always runs before average pooling (if requested)
      if self.do_normalize:
        train_feature, val_feature = self.normalization(train_feature, val_feature, n_time=n_time, n_features=n_features)
      if self.do_avgpool:
        train_feature, val_feature = self.avgpooling(train_feature, val_feature, n_time=n_time, n_features=n_features)

      train_features_dict[each], val_features_dict[each] = train_feature, val_feature

      display(f'{each} processed')

    end_time = time.time()
    display(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Every element of a row's 'framed' value yields one sample: the row's
    'primary_label' becomes its label, the requested metadata columns are copied
    from the row, and the requested audio features are computed from the frame.

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    display(f'feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # metadata features are copied straight from the dataframe row
    metadata_features = [item for item in self.features if item in ('continent', 'rating', 'type')]

    # dynamically resolve the extract_x function of each listed feature once, outside the loops
    extractors = {}
    for feature in self.features:
      func = getattr(self, f"extract_{feature}", None)
      if callable(func):
        extractors[feature] = func

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        for feature in metadata_features:
          features_dict[feature].append(row[feature])

        for feature, func in extractors.items():
          features_dict[feature].append(func(each))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    display(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and validation framed audio data

In [3]:
# read the pickled training DataFrame from Drive
# NOTE: pickle.load executes arbitrary code, so only load files you created yourself
with open('/content/drive/MyDrive/project/train_val_csv_pkl/train_df_8_sec_augmented.pkl', mode='rb') as file:
  train_df = pickle.load(file)

# peek at the first rows
train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
668,barswa,blank,-33.2377,26.9574,5.0,barswa/XC713037.ogg,58.331,ZA,AFRICA,barswa/XC713037.npy,train,"((tf.Tensor(-1.0659789e-05, shape=(), dtype=fl..."
247,eaywag1,call,54.1191,13.376,3.5,eaywag1/XC658750.ogg,49.68,DE,EUROPE,eaywag1/XC658750.npy,train,"((tf.Tensor(-1.1132361e-06, shape=(), dtype=fl..."
620,barswa,song,35.0573,34.0009,5.0,barswa/XC405617.ogg,114.207375,CY,ASIA,barswa/XC405617.npy,train,"((tf.Tensor(3.9268853e-06, shape=(), dtype=flo..."
86,comsan,call,43.5352,-1.4475,4.5,comsan/XC580687.ogg,59.82,FR,EUROPE,comsan/XC580687.npy,train,"((tf.Tensor(-8.466335e-06, shape=(), dtype=flo..."
450,eaywag1,song,43.5118,3.8367,3.0,eaywag1/XC567939.ogg,10.031,FR,EUROPE,eaywag1/XC567939.npy,train,"((tf.Tensor(-0.00037298112779295535, shape=(),..."


In [4]:
# read the pickled validation DataFrame from Drive
# NOTE: pickle.load executes arbitrary code, so only load files you created yourself
with open('/content/drive/MyDrive/project/train_val_csv_pkl/val_df_8_sec.pkl', mode='rb') as file:
  val_df = pickle.load(file)

# peek at the first rows
val_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
991,barswa,song,39.4709,-8.0205,3.5,barswa/XC624409.ogg,10.008,PT,EUROPE,barswa/XC624409.npy,val,"((tf.Tensor(3.148432e-08, shape=(), dtype=floa..."
760,barswa,call,51.4522,-9.8189,5.0,barswa/XC666943.ogg,11.885,IE,EUROPE,barswa/XC666943.npy,val,"((tf.Tensor(-8.617062e-07, shape=(), dtype=flo..."
876,comsan,call,50.7347,3.2143,4.0,comsan/XC578171.ogg,21.707,BE,EUROPE,comsan/XC578171.npy,val,"((tf.Tensor(-3.9801816e-06, shape=(), dtype=fl..."
927,eaywag1,song,46.0192,61.846,5.0,eaywag1/XC184434.ogg,63.921625,KZ,ASIA,eaywag1/XC184434.npy,val,"((tf.Tensor(-4.7415142e-07, shape=(), dtype=fl..."
880,comsan,call,42.9037,13.9077,4.0,comsan/XC433334.ogg,24.006,IT,EUROPE,comsan/XC433334.npy,val,"((tf.Tensor(-1.1874385e-05, shape=(), dtype=fl..."


# Extract features

In [5]:
# audio features to extract for every frame
features_list = [
    'mfcc',
    'chroma',
    'rms',
    'spectral_centroid',
    'melspectrogram',
]

In [6]:
%%time

features = Extraction(train_df,
                      val_df,
                      features=features_list,
                      do_normalize=True,
                      do_avgpool=True)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 5.7 mins'

'feature extraction started ---------->'

'features extraction took 2.7 mins'

'features processing started ---------->'



'mfcc processed'



'chroma processed'



'rms processed'



'spectral_centroid processed'



'melspectrogram processed'

'features processing took 0.1 mins'

CPU times: user 8min 52s, sys: 5min 37s, total: 14min 30s
Wall time: 8min 31s


In [7]:
# training labels: one entry per extracted frame
train_y = features.train_y

display(train_y.shape, train_y[:5])

(4763,)

array(['barswa', 'barswa', 'barswa', 'barswa', 'barswa'], dtype='<U7')

In [8]:
train_features = features.train_features

# for every feature show its name, array shape and first sample
for name, values in train_features.items():
  display(name, values.shape, values[0])

'mfcc'

(4763, 20)

array([0.44627464, 0.45426083, 0.5378976 , 0.6415272 , 0.3273386 ,
       0.537812  , 0.5507873 , 0.31623414, 0.5448066 , 0.42926756,
       0.4554109 , 0.52395195, 0.42408144, 0.5540474 , 0.44779915,
       0.44673815, 0.48709244, 0.41270763, 0.49669155, 0.44323295],
      dtype=float32)

'chroma'

(4763, 12)

array([0.37101182, 0.32926235, 0.27844787, 0.21576864, 0.17506872,
       0.17882459, 0.19580089, 0.2657043 , 0.34516746, 0.45568615,
       0.62949955, 0.51872045], dtype=float32)

'rms'

(4763, 1)

array([0.03238721], dtype=float32)

'spectral_centroid'

(4763, 1)

array([0.5100422], dtype=float32)

'melspectrogram'

(4763, 20)

array([0.46083716, 0.4404941 , 0.4459735 , 0.4552247 , 0.44280383,
       0.45544213, 0.45687798, 0.45849824, 0.45889068, 0.45322755,
       0.4440301 , 0.46989176, 0.52670664, 0.55969405, 0.6080172 ,
       0.60682744, 0.5416601 , 0.45356917, 0.40902466, 0.36045012],
      dtype=float32)

In [9]:
# validation labels: one entry per extracted frame (copied into a fresh array)
val_y = np.array(features.val_y)

display(val_y.shape, val_y[:5])

(2430,)

array(['barswa', 'barswa', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [10]:
val_features = features.val_features

# for every feature show its name, array shape and first sample
for name, values in val_features.items():
  display(name, values.shape, values[0])

'mfcc'

(2430, 20)

array([0.4727356 , 0.49102217, 0.416187  , 0.4110909 , 0.46178582,
       0.331717  , 0.45005453, 0.36411384, 0.37412563, 0.347604  ,
       0.39516807, 0.43693858, 0.33319825, 0.38965693, 0.36040395,
       0.37758887, 0.39124897, 0.3882255 , 0.43174356, 0.39325842],
      dtype=float32)

'chroma'

(2430, 12)

array([0.6062606 , 0.6611376 , 0.7248586 , 0.64855564, 0.58370745,
       0.5189866 , 0.4971299 , 0.5107803 , 0.51441324, 0.50586236,
       0.55129015, 0.617381  ], dtype=float32)

'rms'

(2430, 1)

array([0.00597911], dtype=float32)

'spectral_centroid'

(2430, 1)

array([0.4952323], dtype=float32)

'melspectrogram'

(2430, 20)

array([0.4270522 , 0.44763607, 0.4630777 , 0.48716363, 0.5089495 ,
       0.52835125, 0.55888444, 0.5702827 , 0.5705755 , 0.55323595,
       0.55394274, 0.55681396, 0.5328459 , 0.47464818, 0.4582111 ,
       0.4517086 , 0.45180812, 0.43495947, 0.43842584, 0.41111934],
      dtype=float32)

# Encode Classes

In [11]:
# Fit and transform from the raw string labels held by `features` rather than
# from `train_y` / `val_y`: those names are overwritten with integer codes below,
# so reading them back would make a second run of this cell fail on unseen labels.
label_encoder = LabelEncoder().fit(features.train_y)
train_y = label_encoder.transform(features.train_y)
val_y = label_encoder.transform(features.val_y)

# classes_ lists every fitted class in encoded order, however many classes there are
classes = list(label_encoder.classes_)
classes

['barswa', 'comsan', 'eaywag1']

In [12]:
# sample count and first ten encoded labels for the training set
display(len(train_y), train_y[:10])

# same for the validation set
display(len(val_y), val_y[:10])

4763

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

2430

array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

# Add the labels to the feature dictionaries and merge them into one dictionary

In [13]:
# build a new dict instead of writing into features.train_features, so the
# Extraction object keeps holding features only
train_features = {**features.train_features, 'label': train_y}
train_features

{'mfcc': array([[0.44627464, 0.45426083, 0.5378976 , ..., 0.41270763, 0.49669155,
         0.44323295],
        [0.4870945 , 0.3999269 , 0.53268594, ..., 0.38443726, 0.49417073,
         0.46917754],
        [0.48996815, 0.39159116, 0.5386251 , ..., 0.38565   , 0.4793156 ,
         0.46398744],
        ...,
        [0.5851718 , 0.67896426, 0.5308942 , ..., 0.53540635, 0.5450823 ,
         0.5205621 ],
        [0.57395256, 0.68371713, 0.53624344, ..., 0.5139698 , 0.5373012 ,
         0.48929185],
        [0.55954564, 0.6784133 , 0.52643305, ..., 0.5064577 , 0.5295971 ,
         0.48355782]], dtype=float32),
 'chroma': array([[0.37101182, 0.32926235, 0.27844787, ..., 0.45568615, 0.62949955,
         0.51872045],
        [0.35346994, 0.29351434, 0.27224222, ..., 0.4687016 , 0.5994445 ,
         0.5020444 ],
        [0.312296  , 0.30661365, 0.33116865, ..., 0.5022151 , 0.53005433,
         0.38759488],
        ...,
        [0.6334156 , 0.69854987, 0.7787965 , ..., 0.6176913 , 0.5995683 ,
 

In [14]:
# build a new dict instead of writing into features.val_features, so the
# Extraction object keeps holding features only
val_features = {**features.val_features, 'label': val_y}
val_features

{'mfcc': array([[0.4727356 , 0.49102217, 0.416187  , ..., 0.3882255 , 0.43174356,
         0.39325842],
        [0.6182434 , 0.74673927, 0.5758725 , ..., 0.57078695, 0.57131976,
         0.5115228 ],
        [0.5959441 , 0.70817715, 0.6292639 , ..., 0.4901472 , 0.53842074,
         0.48296666],
        ...,
        [0.6305262 , 0.68887514, 0.498226  , ..., 0.5140355 , 0.51679754,
         0.4968501 ],
        [0.46860984, 0.70737803, 0.4775378 , ..., 0.44839352, 0.47631183,
         0.4315598 ],
        [0.50246507, 0.81071496, 0.6752052 , ..., 0.47143868, 0.56774634,
         0.47035185]], dtype=float32),
 'chroma': array([[0.6062606 , 0.6611376 , 0.7248586 , ..., 0.50586236, 0.55129015,
         0.617381  ],
        [0.7335699 , 0.64398235, 0.6208009 , ..., 0.70059   , 0.7300674 ,
         0.7513307 ],
        [0.37384203, 0.406207  , 0.5603575 , ..., 0.5704588 , 0.5591472 ,
         0.45755687],
        ...,
        [0.7064277 , 0.7694523 , 0.7363335 , ..., 0.64287   , 0.65043986,
 

In [15]:
# one dictionary holding both splits, keyed by split name
merged_dict = dict(train=train_features, val=val_features)

merged_dict

{'train': {'mfcc': array([[0.44627464, 0.45426083, 0.5378976 , ..., 0.41270763, 0.49669155,
          0.44323295],
         [0.4870945 , 0.3999269 , 0.53268594, ..., 0.38443726, 0.49417073,
          0.46917754],
         [0.48996815, 0.39159116, 0.5386251 , ..., 0.38565   , 0.4793156 ,
          0.46398744],
         ...,
         [0.5851718 , 0.67896426, 0.5308942 , ..., 0.53540635, 0.5450823 ,
          0.5205621 ],
         [0.57395256, 0.68371713, 0.53624344, ..., 0.5139698 , 0.5373012 ,
          0.48929185],
         [0.55954564, 0.6784133 , 0.52643305, ..., 0.5064577 , 0.5295971 ,
          0.48355782]], dtype=float32),
  'chroma': array([[0.37101182, 0.32926235, 0.27844787, ..., 0.45568615, 0.62949955,
          0.51872045],
         [0.35346994, 0.29351434, 0.27224222, ..., 0.4687016 , 0.5994445 ,
          0.5020444 ],
         [0.312296  , 0.30661365, 0.33116865, ..., 0.5022151 , 0.53005433,
          0.38759488],
         ...,
         [0.6334156 , 0.69854987, 0.7787965 , 

# Save the merged dictionary of labels and features to a pickle file

In [16]:
# write the merged train/val dictionary to Drive
with open('/content/drive/MyDrive/project/train_val_features_pkl/train_val_8_sec_augmented_audio_features_avgpooled.pkl', mode='wb') as file:
  pickle.dump(merged_dict, file)

In [17]:
# read the file straight back so the saved content can be checked against merged_dict
# (despite its name, merged_df holds the unpickled dictionary, not a DataFrame)
with open('/content/drive/MyDrive/project/train_val_features_pkl/train_val_8_sec_augmented_audio_features_avgpooled.pkl', mode='rb') as file:
  merged_df = pickle.load(file)

In [18]:
# the reloaded dictionary must contain the same splits ...
assert merged_dict.keys() == merged_df.keys()

for split_name, saved_split in merged_dict.items():
  loaded_split = merged_df[split_name]

  # ... the same features per split ...
  assert saved_split.keys() == loaded_split.keys()

  # ... and identical arrays for every feature
  for feature_name, saved_values in saved_split.items():
    loaded_values = loaded_split[feature_name]
    assert saved_values.shape == loaded_values.shape
    assert np.array_equal(saved_values, loaded_values)