<a href="https://colab.research.google.com/github/rachlllg/Project_Bird-Song-Classifier-with-Machine-Learning/blob/main/3.model_prep/d.extract_features_labels/8_sec_augmented_audio_features_not_avgpooled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
# for drive access
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Mounted at /content/drive


# Self Defined Class Methods For Feature Extraction

In [2]:
class Extraction:

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=['mfcc'],
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames.

    The extract_features() method is automatically called to extract the labels and features from the given DataFrame.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
      val_df (pd.DataFrame): DataFrame containing validation data.
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      normalize (bool): Whether to normalize the features.
      maxpool (bool): Whether to maxpool the features.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."
    self.features = features

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    for each in self.features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accpeted features."

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # process the features by applying normalization or average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    return np.transpose(librosa.feature.rms(y=np.array(each)))

  def extract_spectral_centroid(self, each):
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Normalize the train and val features

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Normalized training feature
      val_X (npy): Normalized validation feature
    """
    # flatten the np arrays to 1D
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    val_X_reshape = val_X.reshape(-1, n_time * n_features)

    # use minmaxscaler to normalize the train and val features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    val_X_scaled = scaler.transform(val_X_reshape)

    # reshape the features to the original shape (n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    val_X = val_X_scaled.reshape(-1, n_time, n_features)

    return train_X, val_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Avgpooled training feature
      val_X (npy): Avgpooled validation feature
    """
    # instantiate the keras layers and model.
    # the model only performs average pooling of inputs
    tf.keras.backend.clear_session()
    input = tf.keras.layers.Input(shape=(n_time, n_features))
    global_avg_pooling = tf.keras.layers.GlobalAveragePooling1D()
    pooled = global_avg_pooling(input)
    pooling_model = tf.keras.models.Model(inputs=input, outputs=pooled)

    # use the instantiated model to avgpool the train and val features
    train_X = pooling_model.predict(train_X)
    val_X = pooling_model.predict(val_X)

    return train_X, val_X


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, val_features_dict):
    """
    Apply normalization or average pooling to train and val features

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      val_features_dict (dict): Dictionary containing the validation features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and validation features dictionary
    """
    start_time = time.time()
    display('features processing started ---------->')

    for each in train_features_dict.keys():
      # 'continent', 'rating', and 'type' do not need to be processed
      if each == 'continent' or each == 'rating' or each == 'type':
        pass
      # process numeric features
      else:
        if each == 'mfcc':
          n_features=self.n_mfccs
        elif each == 'chroma':
          n_features=self.n_chroma
        elif each == 'rms' or each == 'spectral_centroid':
          n_features=1
        elif each == 'melspectrogram':
          n_features=self.n_mels

        # index the dictionary to find the value of the feature based on the feature name (key)
        train_feature = train_features_dict[each]
        val_feature = val_features_dict[each]

        # call normalization or avgpooling function to apply normalization or average pooling to features
        if self.do_normalize and self.do_avgpool:
          train_features_dict[each], val_features_dict[each] = self.normalization(train_feature, val_feature, n_time=train_feature.shape[1], n_features=n_features)
          train_features_dict[each], val_features_dict[each] = self.avgpooling(train_features_dict[each], val_features_dict[each], n_time=train_feature.shape[1], n_features=n_features)
        elif self.do_normalize:
          train_features_dict[each], val_features_dict[each] = self.normalization(train_feature, val_feature, n_time=train_feature.shape[1], n_features=n_features)
        elif self.do_avgpool:
          train_features_dict[each], val_features_dict[each] = self.avgpooling(train_feature, val_feature, n_time=train_feature.shape[1], n_features=n_features)
        else:
          train_features_dict[each], val_features_dict[each] = train_features_dict[each], val_features_dict[each]

        display(f'{each} processed')

    end_time = time.time()
    display(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    display(f'feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        if 'continent' in self.features:
          features_dict['continent'].append(row['continent'])
        if 'rating' in self.features:
          features_dict['rating'].append(row['rating'])
        if 'type' in self.features:
          features_dict['type'].append(row['type'])

        # dynatically call the extract_x function to extract the listed features
        for feature in self.features:
          extract = f"extract_{feature}"
          if hasattr(self, extract) and callable(func := getattr(self, extract)):
            features_dict[feature].append(func(each))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    display(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and test framed audio data

In [3]:
with open('/content/drive/MyDrive/project/train_val_csv_pkl/train_df_8_sec_augmented.pkl', 'rb') as file:
  train_df = pickle.load(file)

train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
668,barswa,blank,-33.2377,26.9574,5.0,barswa/XC713037.ogg,58.331,ZA,AFRICA,barswa/XC713037.npy,train,"((tf.Tensor(-1.0659789e-05, shape=(), dtype=fl..."
247,eaywag1,call,54.1191,13.376,3.5,eaywag1/XC658750.ogg,49.68,DE,EUROPE,eaywag1/XC658750.npy,train,"((tf.Tensor(-1.1132361e-06, shape=(), dtype=fl..."
620,barswa,song,35.0573,34.0009,5.0,barswa/XC405617.ogg,114.207375,CY,ASIA,barswa/XC405617.npy,train,"((tf.Tensor(3.9268853e-06, shape=(), dtype=flo..."
86,comsan,call,43.5352,-1.4475,4.5,comsan/XC580687.ogg,59.82,FR,EUROPE,comsan/XC580687.npy,train,"((tf.Tensor(-8.466335e-06, shape=(), dtype=flo..."
450,eaywag1,song,43.5118,3.8367,3.0,eaywag1/XC567939.ogg,10.031,FR,EUROPE,eaywag1/XC567939.npy,train,"((tf.Tensor(-0.00037298112779295535, shape=(),..."


In [4]:
with open('/content/drive/MyDrive/project/train_val_csv_pkl/val_df_8_sec.pkl', 'rb') as file:
  val_df = pickle.load(file)

val_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
991,barswa,song,39.4709,-8.0205,3.5,barswa/XC624409.ogg,10.008,PT,EUROPE,barswa/XC624409.npy,val,"((tf.Tensor(3.148432e-08, shape=(), dtype=floa..."
760,barswa,call,51.4522,-9.8189,5.0,barswa/XC666943.ogg,11.885,IE,EUROPE,barswa/XC666943.npy,val,"((tf.Tensor(-8.617062e-07, shape=(), dtype=flo..."
876,comsan,call,50.7347,3.2143,4.0,comsan/XC578171.ogg,21.707,BE,EUROPE,comsan/XC578171.npy,val,"((tf.Tensor(-3.9801816e-06, shape=(), dtype=fl..."
927,eaywag1,song,46.0192,61.846,5.0,eaywag1/XC184434.ogg,63.921625,KZ,ASIA,eaywag1/XC184434.npy,val,"((tf.Tensor(-4.7415142e-07, shape=(), dtype=fl..."
880,comsan,call,42.9037,13.9077,4.0,comsan/XC433334.ogg,24.006,IT,EUROPE,comsan/XC433334.npy,val,"((tf.Tensor(-1.1874385e-05, shape=(), dtype=fl..."


# Extract features

In [5]:
features_list = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram']

In [6]:
%%time

features = Extraction(train_df,
                      val_df,
                      features=features_list,
                      do_normalize=True,
                      do_avgpool=False)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 10.5 mins'

'feature extraction started ---------->'

'features extraction took 4.9 mins'

'features processing started ---------->'

'mfcc processed'

'chroma processed'

'rms processed'

'spectral_centroid processed'

'melspectrogram processed'

'features processing took 0.0 mins'

CPU times: user 14min 38s, sys: 10min 21s, total: 24min 59s
Wall time: 15min 27s


In [7]:
train_y = features.train_y

display(train_y.shape)
display(train_y[:5])

(4763,)

array(['barswa', 'barswa', 'barswa', 'barswa', 'barswa'], dtype='<U7')

In [8]:
train_features = features.train_features

for key in train_features.keys():
  display(key)
  display(train_features[key].shape)
  display(train_features[key][0])

'mfcc'

(4763, 251, 20)

array([[0.22347913, 0.51619039, 0.58610979, ..., 0.46299123, 0.51647623,
        0.40249775],
       [0.3336898 , 0.61088184, 0.52389999, ..., 0.376493  , 0.48077972,
        0.2811073 ],
       [0.40371527, 0.62153083, 0.50498171, ..., 0.34907111, 0.530994  ,
        0.28530052],
       ...,
       [0.60532159, 0.3209368 , 0.46069814, ..., 0.6343445 , 0.43218797,
        0.27568527],
       [0.64946865, 0.23773321, 0.44509425, ..., 0.60717631, 0.43851996,
        0.31840855],
       [0.68213725, 0.21014449, 0.54208053, ..., 0.54995673, 0.58843607,
        0.36812686]])

'chroma'

(4763, 251, 12)

array([[0.52294409, 0.71596001, 0.62643951, ..., 0.63101716, 0.68047392,
        0.54690992],
       [0.56654324, 1.        , 0.91968237, ..., 0.47876214, 0.40841445,
        0.3996152 ],
       [0.51606201, 0.9960639 , 0.93038569, ..., 0.78231568, 0.64017407,
        0.3961056 ],
       ...,
       [0.77519797, 0.63038016, 0.35968553, ..., 1.        , 0.2167529 ,
        0.29014085],
       [0.99494271, 0.72887541, 0.40348705, ..., 0.35124522, 0.2296062 ,
        0.41988482],
       [1.        , 0.81430905, 0.34676393, ..., 0.21572884, 0.26762123,
        0.42807868]])

'rms'

(4763, 251, 1)

array([[0.0006407 ],
       [0.00101887],
       [0.00140997],
       [0.00168901],
       [0.00244486],
       [0.00337473],
       [0.00470947],
       [0.00426743],
       [0.0037418 ],
       [0.00363956],
       [0.00314239],
       [0.00267778],
       [0.00231609],
       [0.00184686],
       [0.00166689],
       [0.00170864],
       [0.00171606],
       [0.0017985 ],
       [0.00187735],
       [0.00201207],
       [0.00223573],
       [0.00206306],
       [0.00145832],
       [0.00123229],
       [0.00129359],
       [0.00168839],
       [0.00292144],
       [0.0052836 ],
       [0.00458286],
       [0.00430993],
       [0.004082  ],
       [0.00298816],
       [0.00181833],
       [0.00156705],
       [0.00163407],
       [0.00139085],
       [0.00128362],
       [0.00123364],
       [0.00119439],
       [0.00138771],
       [0.00216407],
       [0.00387386],
       [0.0057174 ],
       [0.00740583],
       [0.00701333],
       [0.00682821],
       [0.0074862 ],
       [0.006

'spectral_centroid'

(4763, 251, 1)

array([[0.41560237],
       [0.3488841 ],
       [0.35178072],
       [0.3631572 ],
       [0.36763955],
       [0.3819891 ],
       [0.40506985],
       [0.42488024],
       [0.42696371],
       [0.41429555],
       [0.43300418],
       [0.46170061],
       [0.42433029],
       [0.36734657],
       [0.34285891],
       [0.37031847],
       [0.36792572],
       [0.34114203],
       [0.36461973],
       [0.38835786],
       [0.40888544],
       [0.3718865 ],
       [0.34842917],
       [0.35899508],
       [0.34641406],
       [0.36303838],
       [0.40368297],
       [0.46339964],
       [0.51292783],
       [0.54217445],
       [0.49664771],
       [0.41966382],
       [0.40368933],
       [0.42672119],
       [0.45385277],
       [0.43532831],
       [0.41340241],
       [0.39668201],
       [0.39235935],
       [0.40248649],
       [0.41445876],
       [0.44730245],
       [0.47236127],
       [0.49149581],
       [0.46954922],
       [0.46734924],
       [0.4684658 ],
       [0.480

'melspectrogram'

(4763, 251, 20)

array([[0.2364647 , 0.16941354, 0.21313662, ..., 0.11870626, 0.10624752,
        0.11600042],
       [0.40559487, 0.35839159, 0.38869732, ..., 0.23959077, 0.18122774,
        0.15082713],
       [0.44530502, 0.42399529, 0.46755615, ..., 0.29707899, 0.24825851,
        0.2118392 ],
       ...,
       [0.44474467, 0.4106836 , 0.39602755, ..., 0.63550767, 0.6014603 ,
        0.51341518],
       [0.42926503, 0.43442473, 0.40993853, ..., 0.66610135, 0.65376523,
        0.58152049],
       [0.48418387, 0.48706702, 0.45762766, ..., 0.68074861, 0.67409912,
        0.62962232]])

In [9]:
val_y = features.val_y
val_y = np.array(val_y)

display(val_y.shape)
display(val_y[:5])

(2430,)

array(['barswa', 'barswa', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [10]:
val_features = features.val_features

for key in val_features.keys():
  display(key)
  display(val_features[key].shape)
  display(val_features[key][0])

'mfcc'

(2430, 251, 20)

array([[0.02102988, 0.49546376, 0.5634463 , ..., 0.54980135, 0.5692743 ,
        0.36346972],
       [0.13841833, 0.5677602 , 0.39713162, ..., 0.4689208 , 0.43671373,
        0.4426377 ],
       [0.26611635, 0.5491729 , 0.36295113, ..., 0.37081182, 0.57460433,
        0.41903657],
       ...,
       [0.5380419 , 0.50546974, 0.3585075 , ..., 0.3457247 , 0.41309163,
        0.3303963 ],
       [0.54682106, 0.4728677 , 0.33624262, ..., 0.43985772, 0.38597786,
        0.38060948],
       [0.54967153, 0.46743158, 0.37205654, ..., 0.49031943, 0.40380237,
        0.36328316]], dtype=float32)

'chroma'

(2430, 251, 12)

array([[0.7646641 , 0.6593769 , 1.        , ..., 0.7147243 , 0.7373059 ,
        0.577032  ],
       [0.99999994, 0.9776191 , 0.72994137, ..., 0.2051627 , 0.25378546,
        0.47793117],
       [1.        , 0.95967495, 0.82782406, ..., 0.5209369 , 0.5438997 ,
        0.79366875],
       ...,
       [0.16291185, 0.1436385 , 0.1746708 , ..., 0.20659254, 0.23780039,
        0.16433871],
       [0.20227225, 0.16570723, 0.19001192, ..., 0.13033651, 0.15900525,
        0.17467377],
       [0.14167623, 0.14604002, 0.20582493, ..., 0.14085199, 0.10830344,
        0.16476049]], dtype=float32)

'rms'

(2430, 251, 1)

array([[6.83520193e-05],
       [2.35141866e-04],
       [4.93349507e-04],
       [8.20668822e-04],
       [1.44608179e-03],
       [1.86402572e-03],
       [2.41919397e-03],
       [2.37728003e-03],
       [2.40357849e-03],
       [2.67681363e-03],
       [2.96152546e-03],
       [3.10781086e-03],
       [3.31475982e-03],
       [3.53769539e-03],
       [4.12380928e-03],
       [4.79059434e-03],
       [4.90209414e-03],
       [5.37926611e-03],
       [5.66170877e-03],
       [6.00947905e-03],
       [7.29669677e-03],
       [7.10990280e-03],
       [5.03015472e-03],
       [4.14387183e-03],
       [3.75449890e-03],
       [3.87072191e-03],
       [3.96701228e-03],
       [4.81483899e-03],
       [3.52336420e-03],
       [3.14991991e-03],
       [3.65278241e-03],
       [1.60477366e-02],
       [1.73417255e-02],
       [1.91217829e-02],
       [2.63766553e-02],
       [1.58214401e-02],
       [1.56479180e-02],
       [1.90699417e-02],
       [1.50876287e-02],
       [1.66102834e-02],


'spectral_centroid'

(2430, 251, 1)

array([[0.53918067],
       [0.37127445],
       [0.40004177],
       [0.42715763],
       [0.40471353],
       [0.37641665],
       [0.39608602],
       [0.42663217],
       [0.42418532],
       [0.42157003],
       [0.42801118],
       [0.42451467],
       [0.41180508],
       [0.41901666],
       [0.42760214],
       [0.44360703],
       [0.44889982],
       [0.44368376],
       [0.46388564],
       [0.47408765],
       [0.50927229],
       [0.51997638],
       [0.49789525],
       [0.44221731],
       [0.4297834 ],
       [0.45360216],
       [0.47462421],
       [0.4824302 ],
       [0.45064444],
       [0.45340989],
       [0.43691567],
       [0.53882412],
       [0.74273656],
       [0.78356674],
       [0.73946018],
       [0.74121839],
       [0.71486457],
       [0.66442876],
       [0.69945561],
       [0.71448152],
       [0.66254842],
       [0.6379428 ],
       [0.69981303],
       [0.67002917],
       [0.46624209],
       [0.54703206],
       [0.59645239],
       [0.596

'melspectrogram'

(2430, 251, 20)

array([[0.06690995, 0.03329332, 0.04463517, ..., 0.01262233, 0.01312917,
        0.01779763],
       [0.15448193, 0.15125908, 0.18324402, ..., 0.06022155, 0.05240088,
        0.05764714],
       [0.22855826, 0.2564222 , 0.26973918, ..., 0.16696048, 0.1583665 ,
        0.16625282],
       ...,
       [0.43946368, 0.46047133, 0.4737169 , ..., 0.40462428, 0.40943423,
        0.38751614],
       [0.4163849 , 0.46501952, 0.4810025 , ..., 0.4137594 , 0.4102102 ,
        0.3908512 ],
       [0.4134815 , 0.46015227, 0.49349833, ..., 0.3973255 , 0.40692577,
        0.40018758]], dtype=float32)

# Encode Classes

In [11]:
label_encoder = LabelEncoder().fit(train_y)
train_y = label_encoder.transform(train_y)
val_y = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0,1,2]))
classes

['barswa', 'comsan', 'eaywag1']

In [12]:
display(len(train_y))
display(train_y[:10])

display(len(val_y))
display(val_y[:10])

4763

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

2430

array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

# Put the labels and features to one large dataframe

In [13]:
train_features['label'] = train_y
train_features

{'mfcc': array([[[0.22347913, 0.51619039, 0.58610979, ..., 0.46299123,
          0.51647623, 0.40249775],
         [0.3336898 , 0.61088184, 0.52389999, ..., 0.376493  ,
          0.48077972, 0.2811073 ],
         [0.40371527, 0.62153083, 0.50498171, ..., 0.34907111,
          0.530994  , 0.28530052],
         ...,
         [0.60532159, 0.3209368 , 0.46069814, ..., 0.6343445 ,
          0.43218797, 0.27568527],
         [0.64946865, 0.23773321, 0.44509425, ..., 0.60717631,
          0.43851996, 0.31840855],
         [0.68213725, 0.21014449, 0.54208053, ..., 0.54995673,
          0.58843607, 0.36812686]],
 
        [[0.47704638, 0.47512975, 0.44260051, ..., 0.22843119,
          0.63831925, 0.59547307],
         [0.4762708 , 0.48589278, 0.40713749, ..., 0.34864821,
          0.60366178, 0.58643531],
         [0.48173468, 0.49144079, 0.40342839, ..., 0.35512423,
          0.70732799, 0.43648864],
         ...,
         [0.61064922, 0.29490932, 0.54876544, ..., 0.27377168,
          0.5193

In [14]:
val_features['label'] = val_y
val_features

{'mfcc': array([[[0.02102988, 0.49546376, 0.5634463 , ..., 0.54980135,
          0.5692743 , 0.36346972],
         [0.13841833, 0.5677602 , 0.39713162, ..., 0.4689208 ,
          0.43671373, 0.4426377 ],
         [0.26611635, 0.5491729 , 0.36295113, ..., 0.37081182,
          0.57460433, 0.41903657],
         ...,
         [0.5380419 , 0.50546974, 0.3585075 , ..., 0.3457247 ,
          0.41309163, 0.3303963 ],
         [0.54682106, 0.4728677 , 0.33624262, ..., 0.43985772,
          0.38597786, 0.38060948],
         [0.54967153, 0.46743158, 0.37205654, ..., 0.49031943,
          0.40380237, 0.36328316]],
 
        [[0.36780798, 0.64059895, 0.74329525, ..., 0.42773783,
          0.5616721 , 0.4949691 ],
         [0.5558524 , 0.7846735 , 0.5718928 , ..., 0.55828017,
          0.5474922 , 0.7035029 ],
         [0.6395415 , 0.7731212 , 0.54436547, ..., 0.5974678 ,
          0.62592334, 0.5966517 ],
         ...,
         [0.7475659 , 0.63192075, 0.5725203 , ..., 0.63424027,
          0.5511

In [15]:
merged_dict = {'train': train_features, 'val': val_features}

merged_dict

{'train': {'mfcc': array([[[0.22347913, 0.51619039, 0.58610979, ..., 0.46299123,
           0.51647623, 0.40249775],
          [0.3336898 , 0.61088184, 0.52389999, ..., 0.376493  ,
           0.48077972, 0.2811073 ],
          [0.40371527, 0.62153083, 0.50498171, ..., 0.34907111,
           0.530994  , 0.28530052],
          ...,
          [0.60532159, 0.3209368 , 0.46069814, ..., 0.6343445 ,
           0.43218797, 0.27568527],
          [0.64946865, 0.23773321, 0.44509425, ..., 0.60717631,
           0.43851996, 0.31840855],
          [0.68213725, 0.21014449, 0.54208053, ..., 0.54995673,
           0.58843607, 0.36812686]],
  
         [[0.47704638, 0.47512975, 0.44260051, ..., 0.22843119,
           0.63831925, 0.59547307],
          [0.4762708 , 0.48589278, 0.40713749, ..., 0.34864821,
           0.60366178, 0.58643531],
          [0.48173468, 0.49144079, 0.40342839, ..., 0.35512423,
           0.70732799, 0.43648864],
          ...,
          [0.61064922, 0.29490932, 0.54876544, ..

# save the merged dict with labels and features to pkl

In [16]:
with open('/content/drive/MyDrive/project/train_val_features_pkl/train_val_8_sec_augmented_audio_features_not_avgpooled.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

In [17]:
with open('/content/drive/MyDrive/project/train_val_features_pkl/train_val_8_sec_augmented_audio_features_not_avgpooled.pkl', 'rb') as file:
  merged_df = pickle.load(file)

In [18]:
assert merged_dict.keys() == merged_df.keys()

for top_level_key in merged_dict.keys():
  assert merged_dict[top_level_key].keys() == merged_df[top_level_key].keys()

  for bottom_level_key in merged_dict[top_level_key].keys():
    assert merged_dict[top_level_key][bottom_level_key].shape == merged_df[top_level_key][bottom_level_key].shape
    assert np.array_equal(merged_dict[top_level_key][bottom_level_key], merged_df[top_level_key][bottom_level_key])