<a href="https://colab.research.google.com/github/rachlllg/Project_Bird-Song-Classifier-with-Machine-Learning/blob/main/3.model_prep/d.extract_features_labels/5_sec_augmented_audio_features_not_avgpooled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
# for drive access: mount Google Drive (Colab only) so the pickle files under /content/drive/MyDrive can be read and written
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Mounted at /content/drive


# Self Defined Class Methods For Feature Extraction

In [2]:
class Extraction:
  """
  Extract labels and features from framed audio stored in train and val DataFrames.

  On instantiation the requested features are extracted for both DataFrames and the numeric
  features are then optionally normalized and/or average pooled. Results are exposed as:
    train_y, val_y: np arrays of labels, one entry per frame.
    train_features, val_features: dicts mapping feature name -> np array.

  Each DataFrame must provide the 'primary_label' and 'framed' columns, plus the
  'continent' / 'rating' / 'type' columns when those features are requested.
  """

  # metadata features copied straight from the DataFrame column of the same name;
  # they have no extract_x() method and are never normalized or average pooled
  CATEGORICAL_FEATURES = ('continent', 'rating', 'type')

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=['mfcc'],
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames.

    The extract_features() method is automatically called to extract the labels and features from the given DataFrames,
    followed by process_features() to normalize and/or average pool the numeric features.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
      val_df (pd.DataFrame): DataFrame containing validation data.
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      do_normalize (bool): Whether to min-max normalize the numeric features (scaler is fit on train only).
      do_avgpool (bool): Whether to average pool the numeric features over the time axis.

    Raises:
      AssertionError: If features is empty or contains a name that is not an accepted feature.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."
    # keep a private copy so later changes to the caller's list (or to the shared default list)
    # cannot silently change what this instance believes it extracted
    self.features = list(features)

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    for each in self.features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accpeted features."

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # process the features by applying normalization or average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    """Return the MFCCs of one audio frame, transposed to (n_time, n_mfccs)."""
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    """Return the chromagram of one audio frame, transposed to (n_time, n_chroma)."""
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    """Return the RMS energy of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.rms(y=np.array(each)))

  def extract_spectral_centroid(self, each):
    """Return the spectral centroid of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    """Return the mel spectrogram of one audio frame converted from power to dB, transposed to (n_time, n_mels)."""
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Normalize the train and val features to the (0, 1) range of the training data

    The scaler is fit on the training feature only and then applied to both train and val,
    so validation values may fall outside (0, 1).

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Normalized training feature
      val_X (npy): Normalized validation feature
    """
    # flatten every sample to one row: (n_samples, n_time * n_features)
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    val_X_reshape = val_X.reshape(-1, n_time * n_features)

    # use minmaxscaler to normalize the train and val features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    val_X_scaled = scaler.transform(val_X_reshape)

    # reshape the features to the original shape (n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    val_X = val_X_scaled.reshape(-1, n_time, n_features)

    return train_X, val_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Avgpooled training feature
      val_X (npy): Avgpooled validation feature
    """
    # instantiate the keras layers and model.
    # the model only performs average pooling of inputs
    tf.keras.backend.clear_session()
    # named `inputs` (not `input`) so the Python builtin is not shadowed
    inputs = tf.keras.layers.Input(shape=(n_time, n_features))
    pooled = tf.keras.layers.GlobalAveragePooling1D()(inputs)
    pooling_model = tf.keras.models.Model(inputs=inputs, outputs=pooled)

    # use the instantiated model to avgpool the train and val features
    train_X = pooling_model.predict(train_X)
    val_X = pooling_model.predict(val_X)

    return train_X, val_X


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, val_features_dict):
    """
    Apply normalization or average pooling to train and val features

    'continent', 'rating', and 'type' are left untouched. The dictionaries are updated in place
    and also returned.

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      val_features_dict (dict): Dictionary containing the validation features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and validation features dictionary

    Raises:
      KeyError: If a key is neither a categorical feature nor a known numeric feature.
    """
    start_time = time.time()
    display('features processing started ---------->')

    # size of the feature axis for every numeric feature; an explicit table means an unexpected
    # key fails loudly instead of reusing the value left over from the previous feature
    n_features_by_name = {
      'mfcc': self.n_mfccs,
      'chroma': self.n_chroma,
      'rms': 1,
      'spectral_centroid': 1,
      'melspectrogram': self.n_mels,
    }

    for each in train_features_dict.keys():
      # 'continent', 'rating', and 'type' do not need to be processed
      if each in self.CATEGORICAL_FEATURES:
        continue

      n_features = n_features_by_name[each]

      # index the dictionary to find the value of the feature based on the feature name (key)
      train_feature = train_features_dict[each]
      val_feature = val_features_dict[each]

      if self.do_normalize or self.do_avgpool:
        # the time axis length is taken from the training feature and reused for validation
        n_time = train_feature.shape[1]

        # normalization (if requested) always runs before average pooling (if requested)
        if self.do_normalize:
          train_feature, val_feature = self.normalization(train_feature, val_feature, n_time=n_time, n_features=n_features)
        if self.do_avgpool:
          train_feature, val_feature = self.avgpooling(train_feature, val_feature, n_time=n_time, n_features=n_features)

      train_features_dict[each], val_features_dict[each] = train_feature, val_feature

      display(f'{each} processed')

    end_time = time.time()
    display(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Every element of a row's 'framed' value is treated as one sample: it receives the row's
    'primary_label' and the row's categorical values, and is passed to each extract_x() method.

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    display('feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # resolve once, outside the per-frame loop, which requested features are plain column copies...
    categorical = [name for name in self.CATEGORICAL_FEATURES if name in self.features]

    # ...and which ones have a matching extract_x() method to call dynamically
    extractors = []
    for name in self.features:
      func = getattr(self, f"extract_{name}", None)
      if callable(func):
        extractors.append((name, func))

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        for name in categorical:
          features_dict[name].append(row[name])

        for name, func in extractors:
          features_dict[name].append(func(each))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    display(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and val framed audio data

In [3]:
# read the pickled training DataFrame (augmented, 5 second frames)
train_pkl_path = '/content/drive/MyDrive/project/train_val_csv_pkl/train_df_5_sec_augmented.pkl'
with open(train_pkl_path, 'rb') as file:
  train_df = pickle.load(file)

train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
240,eaywag1,call,42.965,9.4512,5.0,eaywag1/XC471811.ogg,17.8155,FR,EUROPE,eaywag1/XC471811.npy,train,"((tf.Tensor(8.750593e-06, shape=(), dtype=floa..."
542,barswa,song,42.806,13.8335,4.0,barswa/XC371853.ogg,50.834313,IT,EUROPE,barswa/XC371853.npy,train,"((tf.Tensor(-0.00011792055242018368, shape=(),..."
214,comsan,call,60.2357,25.0058,1.0,comsan/XC554068.ogg,25.6,FI,EUROPE,comsan/XC554068.npy,train,"((tf.Tensor(6.721793e-06, shape=(), dtype=floa..."
492,barswa,song,53.9299,-2.9833,2.5,barswa/XC690496.ogg,15.098812,GB,EUROPE,barswa/XC690496.npy,train,"((tf.Tensor(4.4646572e-07, shape=(), dtype=flo..."
190,comsan,call,51.5579,17.509,4.0,comsan/XC492893.ogg,11.755,PL,EUROPE,comsan/XC492893.npy,train,"((tf.Tensor(-0.005616761, shape=(), dtype=floa..."


In [4]:
# read the pickled validation DataFrame (5 second frames)
val_pkl_path = '/content/drive/MyDrive/project/train_val_csv_pkl/val_df_5_sec.pkl'
with open(val_pkl_path, 'rb') as file:
  val_df = pickle.load(file)

val_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
969,comsan,call,41.8074,-8.8626,4.0,comsan/XC670080.ogg,11.206,PT,EUROPE,comsan/XC670080.npy,val,"((tf.Tensor(9.328758e-06, shape=(), dtype=floa..."
851,comsan,blank,51.6578,19.3281,5.0,comsan/XC738993.ogg,11.616,PL,EUROPE,comsan/XC738993.npy,val,"((tf.Tensor(6.5908675e-06, shape=(), dtype=flo..."
998,barswa,song,48.2131,-3.0137,5.0,barswa/XC643586.ogg,49.536,FR,EUROPE,barswa/XC643586.npy,val,"((tf.Tensor(1.9546133e-06, shape=(), dtype=flo..."
771,comsan,call,56.0779,47.9129,5.0,comsan/XC371997.ogg,12.355937,RU,EUROPE,comsan/XC371997.npy,val,"((tf.Tensor(5.00679e-06, shape=(), dtype=float..."
988,eaywag1,call,52.2003,-6.4349,2.0,eaywag1/XC687527.ogg,15.464,IE,EUROPE,eaywag1/XC687527.npy,val,"((tf.Tensor(2.4605542e-06, shape=(), dtype=flo..."


# Extract features

In [5]:
# audio features to extract; the metadata features ('continent', 'rating', 'type') are not requested here
features_list = [
  'mfcc',
  'chroma',
  'rms',
  'spectral_centroid',
  'melspectrogram',
]

In [6]:
%%time

features = Extraction(train_df,
                      val_df,
                      features=features_list,
                      do_normalize=True,
                      do_avgpool=False)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 13.7 mins'

'feature extraction started ---------->'

'features extraction took 5.3 mins'

'features processing started ---------->'

'mfcc processed'

'chroma processed'

'rms processed'

'spectral_centroid processed'

'melspectrogram processed'

'features processing took 0.0 mins'

CPU times: user 17min 58s, sys: 12min 31s, total: 30min 30s
Wall time: 19min 1s


In [7]:
train_y = features.train_y

# number of training labels and a peek at the first few
display(train_y.shape, train_y[:5])

(10261,)

array(['eaywag1', 'eaywag1', 'eaywag1', 'eaywag1', 'eaywag1'], dtype='<U7')

In [8]:
train_features = features.train_features

# for every extracted feature show its name, the array shape, and the first sample
for feature_name, feature_values in train_features.items():
  display(feature_name, feature_values.shape, feature_values[0])

'mfcc'

(10261, 157, 20)

array([[0.2148199 , 0.53295   , 0.58787652, ..., 0.49206123, 0.58662359,
        0.48072821],
       [0.23063787, 0.46433535, 0.56860237, ..., 0.53792659, 0.53372919,
        0.42353284],
       [0.28633877, 0.46663784, 0.5219466 , ..., 0.63144989, 0.51128766,
        0.40030859],
       ...,
       [0.33404294, 0.39252724, 0.54296704, ..., 0.66216793, 0.55985361,
        0.52580219],
       [0.34164979, 0.35850773, 0.55630248, ..., 0.52268431, 0.66016425,
        0.57782299],
       [0.34397934, 0.31435593, 0.5866993 , ..., 0.57935434, 0.6338881 ,
        0.46534603]])

'chroma'

(10261, 157, 12)

array([[0.72125012, 1.        , 0.70153726, ..., 0.51665933, 0.45164933,
        0.63683513],
       [0.38441563, 1.        , 0.51115559, ..., 0.1087205 , 0.17886741,
        0.28090129],
       [0.43155014, 1.        , 0.47017047, ..., 0.10447118, 0.14263515,
        0.21189574],
       ...,
       [0.15419191, 0.1466925 , 0.14042889, ..., 0.08234019, 0.04668454,
        0.11499088],
       [0.3178629 , 0.23551468, 0.2605378 , ..., 0.15343353, 0.13763483,
        0.57083627],
       [0.25173206, 0.18468852, 0.32232395, ..., 0.11787812, 0.17486725,
        0.65504684]])

'rms'

(10261, 157, 1)

array([[0.00037804],
       [0.00041586],
       [0.00037903],
       [0.00039831],
       [0.0004111 ],
       [0.00040243],
       [0.00043961],
       [0.000447  ],
       [0.00044419],
       [0.00053418],
       [0.00062343],
       [0.00076148],
       [0.0008278 ],
       [0.00204906],
       [0.00974285],
       [0.0314123 ],
       [0.06397449],
       [0.05991661],
       [0.05304547],
       [0.07168494],
       [0.10974371],
       [0.09875963],
       [0.0874064 ],
       [0.0657624 ],
       [0.02514094],
       [0.00653468],
       [0.00113259],
       [0.0007272 ],
       [0.0007297 ],
       [0.00051097],
       [0.00042027],
       [0.00045023],
       [0.00035952],
       [0.00037248],
       [0.00066104],
       [0.00066729],
       [0.00062794],
       [0.00061145],
       [0.00093647],
       [0.00119244],
       [0.0011318 ],
       [0.00131359],
       [0.00120825],
       [0.00126773],
       [0.00159484],
       [0.00126015],
       [0.00143015],
       [0.001

'spectral_centroid'

(10261, 157, 1)

array([[0.54273776],
       [0.56740379],
       [0.5665856 ],
       [0.58318569],
       [0.64406038],
       [0.68626509],
       [0.65365436],
       [0.60352928],
       [0.56666022],
       [0.59694112],
       [0.66747673],
       [0.71711086],
       [0.73021011],
       [0.62729001],
       [0.60250969],
       [0.61733502],
       [0.61425865],
       [0.63937974],
       [0.69357536],
       [0.76493017],
       [0.81638725],
       [0.76750463],
       [0.72368081],
       [0.73834644],
       [0.74598282],
       [0.73348843],
       [0.76216708],
       [0.73204428],
       [0.66927311],
       [0.65426888],
       [0.66950373],
       [0.58985512],
       [0.57540032],
       [0.59061851],
       [0.5594645 ],
       [0.55005387],
       [0.55519007],
       [0.54804415],
       [0.55354576],
       [0.61492985],
       [0.68072139],
       [0.66294331],
       [0.57931541],
       [0.54162267],
       [0.58757813],
       [0.66863965],
       [0.70115432],
       [0.620

'melspectrogram'

(10261, 157, 20)

array([[0.239329  , 0.20169071, 0.16591564, ..., 0.14137913, 0.13657354,
        0.15578518],
       [0.2271158 , 0.15795866, 0.15189533, ..., 0.25037369, 0.20394648,
        0.18483622],
       [0.22946148, 0.18727828, 0.20572799, ..., 0.29802781, 0.26195605,
        0.25842147],
       ...,
       [0.23673225, 0.23974862, 0.23700115, ..., 0.36293041, 0.47362068,
        0.43916982],
       [0.23433242, 0.23680281, 0.20873355, ..., 0.41108792, 0.47706506,
        0.45529814],
       [0.23628782, 0.2446731 , 0.20642802, ..., 0.43552963, 0.47305871,
        0.44487357]])

In [9]:
val_y = np.array(features.val_y)

# number of validation labels and a peek at the first few
display(val_y.shape, val_y[:5])

(4236,)

array(['comsan', 'comsan', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [10]:
val_features = features.val_features

# for every extracted feature show its name, the array shape, and the first sample
for feature_name, feature_values in val_features.items():
  display(feature_name, feature_values.shape, feature_values[0])

'mfcc'

(4236, 157, 20)

array([[0.12919118, 0.5332147 , 0.58448285, ..., 0.5001087 , 0.5968588 ,
        0.47302923],
       [0.21850848, 0.5172807 , 0.39729723, ..., 0.46162906, 0.6031759 ,
        0.5405065 ],
       [0.35402006, 0.5029663 , 0.37418777, ..., 0.5108615 , 0.5117495 ,
        0.50900865],
       ...,
       [0.61141056, 0.50481147, 0.2987647 , ..., 0.45481598, 0.31998146,
        0.43665218],
       [0.6103744 , 0.4917607 , 0.25696057, ..., 0.33913997, 0.4614352 ,
        0.5174127 ],
       [0.6112005 , 0.46322918, 0.22175561, ..., 0.36599386, 0.4360496 ,
        0.5096961 ]], dtype=float32)

'chroma'

(4236, 157, 12)

array([[0.638266  , 0.6402416 , 0.63484526, ..., 0.6785249 , 0.817814  ,
        0.6841474 ],
       [0.8057531 , 0.5431085 , 0.75786465, ..., 0.5369535 , 0.7538716 ,
        0.5710767 ],
       [0.5362104 , 0.46964097, 1.        , ..., 0.8512755 , 0.84020585,
        0.49520084],
       ...,
       [0.56576216, 0.57835037, 0.65007913, ..., 1.        , 0.8544148 ,
        0.3876615 ],
       [0.5738198 , 0.99999994, 0.9984492 , ..., 0.6838548 , 0.76467305,
        0.4722059 ],
       [0.49070495, 1.        , 0.625481  , ..., 0.6397636 , 0.62838703,
        0.57072383]], dtype=float32)

'rms'

(4236, 157, 1)

array([[0.0001651 ],
       [0.00066634],
       [0.00103816],
       [0.00188056],
       [0.00356781],
       [0.00475699],
       [0.00641029],
       [0.00822204],
       [0.00903485],
       [0.00986424],
       [0.01108038],
       [0.01650712],
       [0.01853311],
       [0.01965705],
       [0.01977885],
       [0.01709053],
       [0.01816905],
       [0.01477351],
       [0.01188663],
       [0.01120201],
       [0.01313992],
       [0.01149288],
       [0.01081272],
       [0.01124688],
       [0.01209087],
       [0.0133867 ],
       [0.01322389],
       [0.01319507],
       [0.01723755],
       [0.01464454],
       [0.01327809],
       [0.01511353],
       [0.01258245],
       [0.01269867],
       [0.01516492],
       [0.01404867],
       [0.01235007],
       [0.0116561 ],
       [0.01220266],
       [0.01325289],
       [0.01244891],
       [0.0132968 ],
       [0.01524711],
       [0.01828633],
       [0.01611781],
       [0.01356855],
       [0.0122597 ],
       [0.011

'spectral_centroid'

(4236, 157, 1)

array([[0.50887922],
       [0.45509232],
       [0.46766778],
       [0.50050337],
       [0.55332382],
       [0.60002475],
       [0.59861743],
       [0.56810603],
       [0.50715739],
       [0.495247  ],
       [0.48419236],
       [0.48841549],
       [0.51748484],
       [0.51068457],
       [0.49989721],
       [0.49557683],
       [0.48635545],
       [0.47204283],
       [0.48732144],
       [0.45858538],
       [0.45005775],
       [0.44768736],
       [0.46857971],
       [0.55031287],
       [0.5896404 ],
       [0.53854183],
       [0.51155234],
       [0.52306345],
       [0.52170887],
       [0.54437956],
       [0.54627369],
       [0.48078273],
       [0.50183687],
       [0.51989531],
       [0.51308203],
       [0.52065805],
       [0.50926324],
       [0.4851767 ],
       [0.45821809],
       [0.4916122 ],
       [0.52433294],
       [0.5426299 ],
       [0.58260609],
       [0.61217144],
       [0.60500523],
       [0.56005137],
       [0.56129915],
       [0.530

'melspectrogram'

(4236, 157, 20)

array([[0.12509389, 0.07402582, 0.07103718, ..., 0.05830182, 0.03726496,
        0.03640219],
       [0.13999654, 0.09017412, 0.11917368, ..., 0.16569921, 0.15206379,
        0.17554478],
       [0.21755145, 0.19466285, 0.22259887, ..., 0.2865749 , 0.30129182,
        0.3170755 ],
       ...,
       [0.37416503, 0.41479376, 0.5050743 , ..., 0.47857845, 0.5075372 ,
        0.49894968],
       [0.3718377 , 0.4133409 , 0.4916974 , ..., 0.48150223, 0.5009154 ,
        0.5188349 ],
       [0.3744141 , 0.42663068, 0.48142073, ..., 0.48007607, 0.4991594 ,
        0.51347286]], dtype=float32)

# Encode Classes

In [11]:
# fit the encoder on the training labels only, then apply the same mapping to both splits.
# the raw string labels are read from `features` (not from train_y / val_y) so that re-running
# this cell never tries to re-encode labels that were already turned into integers
label_encoder = LabelEncoder().fit(features.train_y)
train_y = label_encoder.transform(features.train_y)
val_y = label_encoder.transform(features.val_y)

# classes_ holds every label in encoded order (index i is the label encoded as i),
# so this works for any number of classes rather than only indices 0, 1, 2
classes = list(label_encoder.classes_)
classes

['barswa', 'comsan', 'eaywag1']

In [12]:
# sanity check the encoded labels: count and first ten values for each split
display(len(train_y), train_y[:10])

display(len(val_y), val_y[:10])

10261

array([2, 2, 2, 2, 2, 2, 0, 0, 0, 0])

4236

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0])

# Put the labels and features into one large dictionary

In [13]:
# build a new dict instead of adding the key in place: `train_features` is the very dict held by
# `features.train_features`, and the Extraction instance should keep holding features only
train_features = {**train_features, 'label': train_y}
train_features

{'mfcc': array([[[0.2148199 , 0.53295   , 0.58787652, ..., 0.49206123,
          0.58662359, 0.48072821],
         [0.23063787, 0.46433535, 0.56860237, ..., 0.53792659,
          0.53372919, 0.42353284],
         [0.28633877, 0.46663784, 0.5219466 , ..., 0.63144989,
          0.51128766, 0.40030859],
         ...,
         [0.33404294, 0.39252724, 0.54296704, ..., 0.66216793,
          0.55985361, 0.52580219],
         [0.34164979, 0.35850773, 0.55630248, ..., 0.52268431,
          0.66016425, 0.57782299],
         [0.34397934, 0.31435593, 0.5866993 , ..., 0.57935434,
          0.6338881 , 0.46534603]],
 
        [[0.33366811, 0.36054192, 0.57536646, ..., 0.54097245,
          0.56656605, 0.49701663],
         [0.32277581, 0.39225805, 0.55406741, ..., 0.48997935,
          0.57137973, 0.49731252],
         [0.32785961, 0.44343203, 0.52121772, ..., 0.60260673,
          0.50713302, 0.45093677],
         ...,
         [0.35726484, 0.34551004, 0.55016513, ..., 0.64942049,
          0.5531

In [14]:
# build a new dict instead of adding the key in place: `val_features` is the very dict held by
# `features.val_features`, and the Extraction instance should keep holding features only
val_features = {**val_features, 'label': val_y}
val_features

{'mfcc': array([[[0.12919118, 0.5332147 , 0.58448285, ..., 0.5001087 ,
          0.5968588 , 0.47302923],
         [0.21850848, 0.5172807 , 0.39729723, ..., 0.46162906,
          0.6031759 , 0.5405065 ],
         [0.35402006, 0.5029663 , 0.37418777, ..., 0.5108615 ,
          0.5117495 , 0.50900865],
         ...,
         [0.61141056, 0.50481147, 0.2987647 , ..., 0.45481598,
          0.31998146, 0.43665218],
         [0.6103744 , 0.4917607 , 0.25696057, ..., 0.33913997,
          0.4614352 , 0.5174127 ],
         [0.6112005 , 0.46322918, 0.22175561, ..., 0.36599386,
          0.4360496 , 0.5096961 ]],
 
        [[0.6335632 , 0.4697707 , 0.29694235, ..., 0.5798572 ,
          0.5047418 , 0.4549193 ],
         [0.63138515, 0.45596153, 0.28812978, ..., 0.56055886,
          0.44908574, 0.51534736],
         [0.62443143, 0.48763642, 0.3296271 , ..., 0.6253878 ,
          0.4033895 , 0.51337624],
         ...,
         [0.6030011 , 0.51813394, 0.29572427, ..., 0.51720375,
          0.3583

In [15]:
# one top-level entry per split, each holding that split's features and labels
merged_dict = dict(train=train_features, val=val_features)

merged_dict

{'train': {'mfcc': array([[[0.2148199 , 0.53295   , 0.58787652, ..., 0.49206123,
           0.58662359, 0.48072821],
          [0.23063787, 0.46433535, 0.56860237, ..., 0.53792659,
           0.53372919, 0.42353284],
          [0.28633877, 0.46663784, 0.5219466 , ..., 0.63144989,
           0.51128766, 0.40030859],
          ...,
          [0.33404294, 0.39252724, 0.54296704, ..., 0.66216793,
           0.55985361, 0.52580219],
          [0.34164979, 0.35850773, 0.55630248, ..., 0.52268431,
           0.66016425, 0.57782299],
          [0.34397934, 0.31435593, 0.5866993 , ..., 0.57935434,
           0.6338881 , 0.46534603]],
  
         [[0.33366811, 0.36054192, 0.57536646, ..., 0.54097245,
           0.56656605, 0.49701663],
          [0.32277581, 0.39225805, 0.55406741, ..., 0.48997935,
           0.57137973, 0.49731252],
          [0.32785961, 0.44343203, 0.52121772, ..., 0.60260673,
           0.50713302, 0.45093677],
          ...,
          [0.35726484, 0.34551004, 0.55016513, ..

# Save the merged dict with labels and features to pkl

In [16]:
# write the merged train/val dictionary to Drive
features_pkl_path = '/content/drive/MyDrive/project/train_val_features_pkl/train_val_5_sec_augmented_audio_features_not_avgpooled.pkl'
with open(features_pkl_path, 'wb') as file:
  pickle.dump(merged_dict, file)

In [17]:
# read the file back so the saved content can be compared with the in-memory dictionary
features_pkl_path = '/content/drive/MyDrive/project/train_val_features_pkl/train_val_5_sec_augmented_audio_features_not_avgpooled.pkl'
with open(features_pkl_path, 'rb') as file:
  merged_df = pickle.load(file)

In [18]:
# the reloaded pickle must mirror the in-memory dict: same splits, same keys, same array shapes and contents
assert merged_dict.keys() == merged_df.keys()

for split_name, saved_split in merged_dict.items():
  loaded_split = merged_df[split_name]
  assert saved_split.keys() == loaded_split.keys()

  for feature_name, saved_array in saved_split.items():
    loaded_array = loaded_split[feature_name]
    assert saved_array.shape == loaded_array.shape
    assert np.array_equal(saved_array, loaded_array)