# Import Libraries

In [1]:
# for drive access
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Self Defined Class Methods For Feature Extraction

In [2]:
class Extraction:
  """
  Extract labels and features from framed train and test DataFrames.

  On instantiation the labels and the requested features are extracted from both
  DataFrames (extract_features) and the numeric audio features are then optionally
  normalized and/or average pooled (process_features). Results are exposed as
  train_y, train_features, test_y and test_features.
  """

  def __init__(self,
               train_df,
               test_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=('mfcc',), #immutable default so it can never be shared and mutated between instances
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and test DataFrames.

    The extract_features() method is automatically called on both DataFrames, followed by
    process_features() on the two resulting feature dictionaries.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
      test_df (pd.DataFrame): DataFrame containing test data.
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract. A single feature name given as a string is also accepted.
        Repeated names are ignored so every feature is extracted exactly once.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      do_normalize (bool): Whether to min-max normalize the numeric features (scaler is fitted on train only).
      do_avgpool (bool): Whether to average pool the numeric features over the time axis.

    Raises:
      AssertionError: If no feature is given or a feature name is not accepted.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # a bare string would otherwise be iterated character by character and rejected with a confusing message
    if isinstance(features, str):
      features = [features]

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."

    # store a private, order-preserving copy without duplicates;
    # a repeated audio feature would be appended twice per frame and no longer line up with the labels
    unique_features = []
    for name in features:
      if name not in unique_features:
        unique_features.append(name)
    self.features = unique_features

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    for each in self.features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accepted features."

    # extract train and test labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.test_y, self.test_features = self.extract_features(test_df)

    # process the features by applying normalization and/or average pooling
    self.train_features, self.test_features = self.process_features(self.train_features, self.test_features)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    """Return the MFCCs of one audio frame, transposed so time is the first axis."""
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    """Return the STFT chromagram of one audio frame, transposed so time is the first axis."""
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    """Return the RMS energy of one audio frame, transposed so time is the first axis."""
    return np.transpose(librosa.feature.rms(y=np.array(each)))

  def extract_spectral_centroid(self, each):
    """Return the spectral centroid of one audio frame, transposed so time is the first axis."""
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    """Return the mel spectrogram of one audio frame converted to dB, transposed so time is the first axis."""
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, test_X, n_time, n_features):
    """
    Min-max normalize the train and test features.

    The scaler is fitted on the training data only and then applied to both sets, so test
    values are not guaranteed to stay inside [0, 1]. Because the arrays are reshaped to
    (n_samples, n_time * n_features) before fitting, every (time step, feature) position
    is scaled with its own minimum and maximum.

    Parameters:
      train_X (npy): Training feature
      test_X (npy): Test feature
      n_time (int): Size of the time axis
      n_features (int): Size of the feature axis

    Returns:
      train_X (npy): Normalized training feature, shape (n_samples, n_time, n_features)
      test_X (npy): Normalized test feature, shape (n_samples, n_time, n_features)
    """
    # flatten every sample to one row: (n_samples, n_time * n_features)
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    test_X_reshape = test_X.reshape(-1, n_time * n_features)

    # use minmaxscaler (fitted on train only) to normalize the train and test features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    test_X_scaled = scaler.transform(test_X_reshape)

    # reshape the features back to (n_samples, n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    test_X = test_X_scaled.reshape(-1, n_time, n_features)

    return train_X, test_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, test_X, n_time, n_features):
    """
    Average pool the train and test features over the time axis.

    NOTE: this calls tf.keras.backend.clear_session(), which resets the global Keras state.

    Parameters:
      train_X (npy): Training feature
      test_X (npy): Test feature
      n_time (int): Size of the time axis
      n_features (int): Size of the feature axis

    Returns:
      train_X (npy): Avgpooled training feature
      test_X (npy): Avgpooled test feature
    """
    # instantiate the keras layers and model.
    # the model only performs average pooling of inputs
    tf.keras.backend.clear_session()
    inputs = tf.keras.layers.Input(shape=(n_time, n_features))
    pooled = tf.keras.layers.GlobalAveragePooling1D()(inputs)
    pooling_model = tf.keras.models.Model(inputs=inputs, outputs=pooled)

    # use the instantiated model to avgpool the train and test features
    train_X = pooling_model.predict(train_X)
    test_X = pooling_model.predict(test_X)

    return train_X, test_X


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, test_features_dict):
    """
    Apply normalization and/or average pooling to train and test features.

    The metadata features 'continent', 'rating' and 'type' are left untouched.
    Both dictionaries are updated in place and also returned.

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      test_features_dict (dict): Dictionary containing the test features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and test features dictionary

    Raises:
      ValueError: If a key is neither a metadata feature nor a known audio feature.
    """
    start_time = time.time()
    display('features processing started ---------->')

    # size of the feature axis for every numeric (audio) feature
    feature_widths = {
      'mfcc': self.n_mfccs,
      'chroma': self.n_chroma,
      'rms': 1,
      'spectral_centroid': 1,
      'melspectrogram': self.n_mels,
    }

    for each in list(train_features_dict.keys()):
      # 'continent', 'rating', and 'type' do not need to be processed
      if each in ('continent', 'rating', 'type'):
        continue

      # fail loudly instead of silently reusing the width of the previously processed feature
      if each not in feature_widths:
        raise ValueError(f"{each} is not a feature that can be processed.")
      n_features = feature_widths[each]

      # index the dictionary to find the value of the feature based on the feature name (key)
      train_feature = train_features_dict[each]
      test_feature = test_features_dict[each]

      # normalization always runs before average pooling when both are requested
      if self.do_normalize or self.do_avgpool:
        n_time = train_feature.shape[1]
        if self.do_normalize:
          train_feature, test_feature = self.normalization(train_feature, test_feature, n_time=n_time, n_features=n_features)
        if self.do_avgpool:
          train_feature, test_feature = self.avgpooling(train_feature, test_feature, n_time=n_time, n_features=n_features)
        train_features_dict[each], test_features_dict[each] = train_feature, test_feature

      display(f'{each} processed')

    end_time = time.time()
    display(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, test_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframe.

    Every element of a row's 'framed' column yields one sample: the row's 'primary_label'
    is repeated as its label, requested metadata columns are copied from the row, and each
    requested audio feature is computed by the matching extract_<feature>() method.

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels (np.ndarray) and a dictionary of extracted features (np.ndarray values)
    """
    start_time = time.time()
    display(f'feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # metadata features are copied straight from the row
    metadata_features = [name for name in ('continent', 'rating', 'type') if name in self.features]

    # dynamically resolve the extract_x function of every remaining feature once, outside of the row loop;
    # keying by feature name guarantees each feature is extracted once per frame
    extractors = {}
    for feature in self.features:
      if feature in metadata_features:
        continue
      func = getattr(self, f"extract_{feature}", None)
      if callable(func):
        extractors[feature] = func

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        for name in metadata_features:
          features_dict[name].append(row[name])

        for feature, func in extractors.items():
          features_dict[feature].append(func(each))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    display(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and test framed audio data

In [4]:
# Load the pickled training DataFrame (8 second frames) from Drive.
# NOTE: pickle can execute arbitrary code on load; only open files you created yourself.
train_pkl_path = '/content/drive/MyDrive/Projects/train_val_csv_pkl/train_df_8_sec.pkl'
with open(train_pkl_path, 'rb') as pkl_file:
  train_df = pickle.load(pkl_file)

train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
668,barswa,blank,-33.2377,26.9574,5.0,barswa/XC713037.ogg,58.331,ZA,AFRICA,barswa/XC713037.npy,train,"((tf.Tensor(-1.3069126e-05, shape=(), dtype=fl..."
247,eaywag1,call,54.1191,13.376,3.5,eaywag1/XC658750.ogg,49.68,DE,EUROPE,eaywag1/XC658750.npy,train,"((tf.Tensor(-1.1132361e-06, shape=(), dtype=fl..."
620,barswa,song,35.0573,34.0009,5.0,barswa/XC405617.ogg,114.207375,CY,ASIA,barswa/XC405617.npy,train,"((tf.Tensor(3.5154517e-06, shape=(), dtype=flo..."
86,comsan,call,43.5352,-1.4475,4.5,comsan/XC580687.ogg,59.82,FR,EUROPE,comsan/XC580687.npy,train,"((tf.Tensor(-7.335485e-06, shape=(), dtype=flo..."
450,eaywag1,song,43.5118,3.8367,3.0,eaywag1/XC567939.ogg,10.031,FR,EUROPE,eaywag1/XC567939.npy,train,"((tf.Tensor(7.8262474e-07, shape=(), dtype=flo..."


In [5]:
# Load the pickled test DataFrame (8 second frames) from Drive.
# NOTE: pickle can execute arbitrary code on load; only open files you created yourself.
test_pkl_path = '/content/drive/MyDrive/Projects/test_csv_pkl/test_df_8_sec.pkl'
with open(test_pkl_path, 'rb') as pkl_file:
  test_df = pickle.load(pkl_file)

test_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,framed
0,barswa,blank,53.2509,5.598,4.0,barswa/XC721711.ogg,19.069375,NL,EUROPE,barswa/XC721711.npy,"((tf.Tensor(1.0793006e-05, shape=(), dtype=flo..."
1,comsan,call,48.8306,2.1999,4.0,comsan/XC496602.ogg,28.995938,FR,EUROPE,comsan/XC496602.npy,"((tf.Tensor(-1.02154445e-05, shape=(), dtype=f..."
3,eaywag1,call,37.1357,-7.6138,4.5,eaywag1/XC481360.ogg,151.944,PT,EUROPE,eaywag1/XC481360.npy,"((tf.Tensor(-1.7517013e-06, shape=(), dtype=fl..."
4,barswa,blank,19.3551,-99.0467,5.0,barswa/XC698512.ogg,18.703688,MX,AMERICAS,barswa/XC698512.npy,"((tf.Tensor(-2.801371e-06, shape=(), dtype=flo..."
5,comsan,call,56.0851,47.2602,5.0,comsan/XC492456.ogg,44.512687,RU,EUROPE,comsan/XC492456.npy,"((tf.Tensor(-1.2049219e-05, shape=(), dtype=fl..."


# Extract features

In [6]:
# Audio features to extract for every frame (metadata features are not requested here)
features_list = [
  'mfcc',
  'chroma',
  'rms',
  'spectral_centroid',
  'melspectrogram',
]

In [7]:
%%time

features = Extraction(train_df,
                      test_df,
                      features=features_list,
                      do_normalize=True,
                      do_avgpool=False)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 5.1 mins'

'feature extraction started ---------->'

'features extraction took 3.6 mins'

'features processing started ---------->'

'mfcc processed'

'chroma processed'

'rms processed'

'spectral_centroid processed'

'melspectrogram processed'

'features processing took 0.0 mins'

CPU times: user 9min 23s, sys: 6min 8s, total: 15min 32s
Wall time: 8min 43s


In [8]:
# Training labels: one entry per extracted frame
train_y = features.train_y

display(train_y.shape, train_y[:5])

(4763,)

array(['barswa', 'barswa', 'barswa', 'barswa', 'barswa'], dtype='<U7')

In [9]:
train_features = features.train_features

# Show every training feature's name, array shape and first sample
for feature_name, feature_values in train_features.items():
  display(feature_name, feature_values.shape, feature_values[0])

'mfcc'

(4763, 251, 20)

array([[0.2926833 , 0.5472208 , 0.51762646, ..., 0.42902297, 0.5246572 ,
        0.39747417],
       [0.38219857, 0.63193434, 0.4692648 , ..., 0.4789598 , 0.5143834 ,
        0.35231233],
       [0.4352324 , 0.62795496, 0.4937711 , ..., 0.53823024, 0.5205363 ,
        0.36665156],
       ...,
       [0.6295364 , 0.3740691 , 0.4684327 , ..., 0.74904454, 0.6659465 ,
        0.13212648],
       [0.6729512 , 0.37194872, 0.47004578, ..., 0.63276273, 0.6894711 ,
        0.25973636],
       [0.6984098 , 0.35604227, 0.50428545, ..., 0.49908003, 0.694181  ,
        0.40737295]], dtype=float32)

'chroma'

(4763, 251, 12)

array([[0.73383594, 0.69802487, 0.54096204, ..., 1.        , 0.608227  ,
        0.48766178],
       [1.        , 0.9552251 , 0.97629696, ..., 0.819532  , 0.46592206,
        0.4471012 ],
       [0.70929813, 0.856629  , 1.        , ..., 0.9266538 , 0.5471759 ,
        0.3217781 ],
       ...,
       [0.88756543, 0.4946972 , 0.0864549 , ..., 0.22002427, 0.10811816,
        0.53788805],
       [1.        , 0.20042695, 0.16474293, ..., 0.17202397, 0.26110023,
        0.83525467],
       [0.90538883, 0.38594005, 0.5073254 , ..., 0.5220612 , 0.7847499 ,
        1.        ]], dtype=float32)

'rms'

(4763, 251, 1)

array([[0.00066166],
       [0.00108187],
       [0.00132881],
       [0.00173146],
       [0.00243525],
       [0.0040241 ],
       [0.00404387],
       [0.00451472],
       [0.00460673],
       [0.00389649],
       [0.00389425],
       [0.00364176],
       [0.00360476],
       [0.00298573],
       [0.00220647],
       [0.00210538],
       [0.00219035],
       [0.00228781],
       [0.00237162],
       [0.00214912],
       [0.00185741],
       [0.00167973],
       [0.00148034],
       [0.00144359],
       [0.00138289],
       [0.0019026 ],
       [0.00349377],
       [0.00674044],
       [0.00596432],
       [0.0055888 ],
       [0.00537581],
       [0.00388524],
       [0.00228059],
       [0.00197732],
       [0.00216695],
       [0.00183421],
       [0.00163533],
       [0.00158946],
       [0.00150203],
       [0.00165542],
       [0.0019687 ],
       [0.00324267],
       [0.00541037],
       [0.00855162],
       [0.00869734],
       [0.00860727],
       [0.00906659],
       [0.007

'spectral_centroid'

(4763, 251, 1)

array([[0.43021879],
       [0.35183347],
       [0.34781837],
       [0.33187161],
       [0.34889517],
       [0.36813308],
       [0.41916818],
       [0.41816175],
       [0.41738839],
       [0.40180619],
       [0.42645345],
       [0.44920148],
       [0.42218698],
       [0.34067167],
       [0.34579915],
       [0.39024828],
       [0.36757338],
       [0.33677927],
       [0.35095777],
       [0.39534415],
       [0.41730669],
       [0.3524761 ],
       [0.33825169],
       [0.36009482],
       [0.3485585 ],
       [0.34525193],
       [0.37783899],
       [0.44370878],
       [0.496219  ],
       [0.5361965 ],
       [0.52634682],
       [0.41777292],
       [0.37638074],
       [0.41059608],
       [0.43082292],
       [0.43644633],
       [0.42599421],
       [0.3998278 ],
       [0.38076336],
       [0.39246149],
       [0.41569275],
       [0.42230446],
       [0.43493215],
       [0.4522872 ],
       [0.46056001],
       [0.44753668],
       [0.44646046],
       [0.463

'melspectrogram'

(4763, 251, 20)

array([[0.24356848, 0.19709486, 0.19181323, ..., 0.14215577, 0.14264905,
        0.16532534],
       [0.40376735, 0.3655113 , 0.42287964, ..., 0.23766714, 0.1897651 ,
        0.1521129 ],
       [0.45815992, 0.46074304, 0.4943219 , ..., 0.2976507 , 0.25157702,
        0.2118563 ],
       ...,
       [0.49117148, 0.43984422, 0.43819213, ..., 0.6310359 , 0.56908995,
        0.5044948 ],
       [0.46418417, 0.4439577 , 0.45830843, ..., 0.6673047 , 0.62528354,
        0.5652833 ],
       [0.4987093 , 0.456418  , 0.46511737, ..., 0.6581471 , 0.6399298 ,
        0.5732062 ]], dtype=float32)

In [10]:
# Test labels: one entry per extracted frame (copied into a fresh array)
test_y = np.array(features.test_y)

display(test_y.shape, test_y[:5])

(3402,)

array(['barswa', 'barswa', 'barswa', 'comsan', 'comsan'], dtype='<U7')

In [11]:
test_features = features.test_features

# Show every test feature's name, array shape and first sample
for feature_name, feature_values in test_features.items():
  display(feature_name, feature_values.shape, feature_values[0])

'mfcc'

(3402, 251, 20)

array([[0.12637472, 0.66154164, 0.51451486, ..., 0.41870528, 0.5113902 ,
        0.48631796],
       [0.321715  , 0.7411357 , 0.45207432, ..., 0.6530287 , 0.63088936,
        0.50485253],
       [0.38742942, 0.7113186 , 0.4827687 , ..., 0.6936252 , 0.6730265 ,
        0.56082416],
       ...,
       [0.41875988, 0.70825344, 0.48139268, ..., 0.56900203, 0.5537507 ,
        0.43906337],
       [0.42478788, 0.72792226, 0.47220683, ..., 0.5131804 , 0.5479504 ,
        0.48395485],
       [0.4225548 , 0.7265101 , 0.45931727, ..., 0.5552089 , 0.56114554,
        0.4701957 ]], dtype=float32)

'chroma'

(3402, 251, 12)

array([[0.52948296, 0.6134329 , 0.74272865, ..., 0.85264695, 0.99999994,
        0.69691455],
       [0.69769573, 0.70837045, 0.67038244, ..., 0.704675  , 0.8265218 ,
        0.8209121 ],
       [0.66979647, 0.6454641 , 0.76218367, ..., 0.7505319 , 0.8628132 ,
        0.8713193 ],
       ...,
       [0.39303148, 0.465551  , 0.7184209 , ..., 0.5434893 , 0.68287253,
        0.5313899 ],
       [0.6909108 , 0.74325573, 1.        , ..., 0.623076  , 0.75277185,
        0.893962  ],
       [0.5738938 , 0.70136404, 1.0000001 , ..., 0.734778  , 0.9603952 ,
        0.83558875]], dtype=float32)

'rms'

(3402, 251, 1)

array([[0.00151953],
       [0.00210652],
       [0.00210424],
       [0.00251653],
       [0.00261859],
       [0.00188212],
       [0.00161983],
       [0.00144908],
       [0.00138256],
       [0.00135395],
       [0.00138989],
       [0.00142041],
       [0.00157155],
       [0.00156389],
       [0.00159224],
       [0.00161361],
       [0.00162013],
       [0.00175908],
       [0.00183662],
       [0.00200717],
       [0.00176342],
       [0.00155806],
       [0.00141897],
       [0.00117192],
       [0.00115281],
       [0.00128784],
       [0.00139197],
       [0.00169259],
       [0.00115968],
       [0.00104613],
       [0.00113098],
       [0.00119672],
       [0.00129046],
       [0.0014369 ],
       [0.00158103],
       [0.00125639],
       [0.00115673],
       [0.00101575],
       [0.00108872],
       [0.00192184],
       [0.00234318],
       [0.00268936],
       [0.00382477],
       [0.00423042],
       [0.00360007],
       [0.00343099],
       [0.00205731],
       [0.001

'spectral_centroid'

(3402, 251, 1)

array([[0.32759658],
       [0.1763402 ],
       [0.1911739 ],
       [0.20247421],
       [0.21524033],
       [0.22246464],
       [0.24810807],
       [0.24042426],
       [0.25132414],
       [0.23677617],
       [0.2257512 ],
       [0.22598261],
       [0.22455441],
       [0.23232559],
       [0.22148172],
       [0.23121971],
       [0.22975312],
       [0.22757042],
       [0.23174907],
       [0.22108005],
       [0.20862932],
       [0.21274976],
       [0.22630565],
       [0.21701971],
       [0.21591482],
       [0.21423441],
       [0.21946645],
       [0.23038949],
       [0.23493988],
       [0.24283719],
       [0.23576425],
       [0.20690592],
       [0.21279964],
       [0.21625296],
       [0.21102255],
       [0.22675821],
       [0.24414448],
       [0.22710959],
       [0.2229067 ],
       [0.22023144],
       [0.18902988],
       [0.16707376],
       [0.16205933],
       [0.14850789],
       [0.17459478],
       [0.19989911],
       [0.1978003 ],
       [0.203

'melspectrogram'

(3402, 251, 20)

array([[0.31809407, 0.19190562, 0.134889  , ..., 0.03883749, 0.0312348 ,
        0.04001206],
       [0.49726686, 0.35696265, 0.33532172, ..., 0.14302975, 0.12461662,
        0.1253565 ],
       [0.5462077 , 0.43100378, 0.40487617, ..., 0.1934368 , 0.18453193,
        0.19068283],
       ...,
       [0.5217989 , 0.43670923, 0.4611032 , ..., 0.22723418, 0.1852122 ,
        0.17907518],
       [0.50518435, 0.4377943 , 0.4427187 , ..., 0.24801129, 0.1951111 ,
        0.17684048],
       [0.49736208, 0.40946913, 0.40826297, ..., 0.22673619, 0.20987487,
        0.18848014]], dtype=float32)

# Encode Classes

In [12]:
# Fit the encoder on the training labels only and apply the same mapping to the test labels.
# Encoding straight from the extracted string labels keeps this cell idempotent:
# re-running it no longer refits the encoder on labels that were already turned into integers.
label_encoder = LabelEncoder().fit(features.train_y)
train_y = label_encoder.transform(features.train_y)
test_y = label_encoder.transform(features.test_y)

# classes_ holds every fitted class in encoded order, so no class count is hard-coded
classes = list(label_encoder.classes_)
classes

['barswa', 'comsan', 'eaywag1']

In [13]:
# Sanity check the encoded labels: number of samples and the first ten codes per split
display(len(train_y), train_y[:10])

display(len(test_y), test_y[:10])

4763

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

3402

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2])

# Add the encoded labels to the train and test feature dictionaries

In [14]:
# Store the encoded training labels next to the features (updates the dictionary in place)
train_features.update({'label': train_y})
train_features

{'mfcc': array([[[0.2926833 , 0.5472208 , 0.51762646, ..., 0.42902297,
          0.5246572 , 0.39747417],
         [0.38219857, 0.63193434, 0.4692648 , ..., 0.4789598 ,
          0.5143834 , 0.35231233],
         [0.4352324 , 0.62795496, 0.4937711 , ..., 0.53823024,
          0.5205363 , 0.36665156],
         ...,
         [0.6295364 , 0.3740691 , 0.4684327 , ..., 0.74904454,
          0.6659465 , 0.13212648],
         [0.6729512 , 0.37194872, 0.47004578, ..., 0.63276273,
          0.6894711 , 0.25973636],
         [0.6984098 , 0.35604227, 0.50428545, ..., 0.49908003,
          0.694181  , 0.40737295]],
 
        [[0.571982  , 0.52721703, 0.35377806, ..., 0.31411108,
          0.45345846, 0.5325761 ],
         [0.57278824, 0.51283437, 0.33133256, ..., 0.44663262,
          0.59104645, 0.5540432 ],
         [0.5442881 , 0.50030595, 0.39622635, ..., 0.4177069 ,
          0.63780564, 0.5152073 ],
         ...,
         [0.65820706, 0.30996823, 0.5541669 , ..., 0.3272183 ,
          0.6759

In [15]:
# Store the encoded test labels next to the features (updates the dictionary in place)
test_features.update({'label': test_y})
test_features

{'mfcc': array([[[0.12637472, 0.66154164, 0.51451486, ..., 0.41870528,
          0.5113902 , 0.48631796],
         [0.321715  , 0.7411357 , 0.45207432, ..., 0.6530287 ,
          0.63088936, 0.50485253],
         [0.38742942, 0.7113186 , 0.4827687 , ..., 0.6936252 ,
          0.6730265 , 0.56082416],
         ...,
         [0.41875988, 0.70825344, 0.48139268, ..., 0.56900203,
          0.5537507 , 0.43906337],
         [0.42478788, 0.72792226, 0.47220683, ..., 0.5131804 ,
          0.5479504 , 0.48395485],
         [0.4225548 , 0.7265101 , 0.45931727, ..., 0.5552089 ,
          0.56114554, 0.4701957 ]],
 
        [[0.43888956, 0.74967396, 0.42380938, ..., 0.5428295 ,
          0.5410676 , 0.5198164 ],
         [0.42741203, 0.733481  , 0.41031593, ..., 0.58994764,
          0.59406835, 0.5892371 ],
         [0.4024588 , 0.7002074 , 0.45497358, ..., 0.6469444 ,
          0.5833043 , 0.5068715 ],
         ...,
         [0.5376694 , 0.7608158 , 0.37164068, ..., 0.59482586,
          0.6321

In [16]:
# Wrap the test features (including labels) under a top-level 'test' key
merged_dict = dict(test=test_features)

merged_dict

{'test': {'mfcc': array([[[0.12637472, 0.66154164, 0.51451486, ..., 0.41870528,
           0.5113902 , 0.48631796],
          [0.321715  , 0.7411357 , 0.45207432, ..., 0.6530287 ,
           0.63088936, 0.50485253],
          [0.38742942, 0.7113186 , 0.4827687 , ..., 0.6936252 ,
           0.6730265 , 0.56082416],
          ...,
          [0.41875988, 0.70825344, 0.48139268, ..., 0.56900203,
           0.5537507 , 0.43906337],
          [0.42478788, 0.72792226, 0.47220683, ..., 0.5131804 ,
           0.5479504 , 0.48395485],
          [0.4225548 , 0.7265101 , 0.45931727, ..., 0.5552089 ,
           0.56114554, 0.4701957 ]],
  
         [[0.43888956, 0.74967396, 0.42380938, ..., 0.5428295 ,
           0.5410676 , 0.5198164 ],
          [0.42741203, 0.733481  , 0.41031593, ..., 0.58994764,
           0.59406835, 0.5892371 ],
          [0.4024588 , 0.7002074 , 0.45497358, ..., 0.6469444 ,
           0.5833043 , 0.5068715 ],
          ...,
          [0.5376694 , 0.7608158 , 0.37164068, ...

# Save the dictionary of test labels and features to a pickle file

In [17]:
# Persist the dictionary of test labels and features to Drive
output_pkl_path = '/content/drive/MyDrive/Projects/test_features_pkl/test_8_sec_audio_features_not_avgpooled.pkl'
with open(output_pkl_path, 'wb') as pkl_file:
  pickle.dump(merged_dict, pkl_file)

In [18]:
# Read the file back so the round trip can be verified (the result is a dict despite the name)
saved_pkl_path = '/content/drive/MyDrive/Projects/test_features_pkl/test_8_sec_audio_features_not_avgpooled.pkl'
with open(saved_pkl_path, 'rb') as pkl_file:
  merged_df = pickle.load(pkl_file)

In [19]:
# Verify the reloaded dictionary matches what was saved: same keys, shapes and values
assert merged_dict.keys() == merged_df.keys()

for split_name, saved_split in merged_dict.items():
  loaded_split = merged_df[split_name]
  assert saved_split.keys() == loaded_split.keys()

  for feature_name, saved_array in saved_split.items():
    loaded_array = loaded_split[feature_name]
    assert saved_array.shape == loaded_array.shape
    assert np.array_equal(saved_array, loaded_array)