<a href="https://colab.research.google.com/github/rachlllg/Project_Bird-Song-Classifier-with-Machine-Learning/blob/main/5_sec_audio_features_not_avgpooled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
# for drive access
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Mounted at /content/drive


# Self-Defined Class and Methods for Feature Extraction

In [2]:
class Extraction:
  """
  Extract labels and features from framed train and val DataFrames.

  Instantiating the class runs the whole pipeline: labels and features are
  extracted for both DataFrames and the audio features are then optionally
  min-max normalized and/or average pooled. The results are exposed as
  train_y, train_features, val_y and val_features.
  """

  # features copied straight from the DataFrame columns; they are never normalized or pooled
  METADATA_FEATURES = ('continent', 'rating', 'type')

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=None, #None means ['mfcc']; None avoids a mutable default shared between calls
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames.

    The extract_features() method is automatically called to extract the labels and features from the given DataFrame,
    then process_features() is called to normalize and/or average pool the audio features.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
      val_df (pd.DataFrame): DataFrame containing validation data.
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract, defaults to ['mfcc'] when None.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      do_normalize (bool): Whether to min-max normalize the audio features.
      do_avgpool (bool): Whether to average pool the audio features over the time axis.

    Raises:
      AssertionError: If features is empty or contains a feature that is not accepted.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # confirm features have been specified
    if features is None:
      features = ['mfcc']
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."
    # keep a private copy so later changes to the caller's list cannot affect this instance
    self.features = list(features)

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    for each in self.features:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accpeted features."

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # process the features by applying normalization or average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)


  ########################################
  # progress messages use IPython's display() inside a notebook and fall back to print() elsewhere
  ########################################
  @staticmethod
  def _log(message):
    """Show a progress message without requiring an IPython session."""
    try:
      display(message)
    except NameError:
      # display() is only injected by IPython; a plain Python process does not have it
      print(message)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    """Return the MFCCs of one audio frame, transposed to (n_time, n_mfccs)."""
    return np.transpose(librosa.feature.mfcc(y=np.array(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    """Return the chromagram of one audio frame, transposed to (n_time, n_chroma)."""
    return np.transpose(librosa.feature.chroma_stft(y=np.array(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    """Return the RMS energy of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.rms(y=np.array(each)))

  def extract_spectral_centroid(self, each):
    """Return the spectral centroid of one audio frame, transposed to (n_time, 1)."""
    return np.transpose(librosa.feature.spectral_centroid(y=np.array(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    """Return the mel spectrogram of one audio frame in dB, transposed to (n_time, n_mels)."""
    mel = librosa.feature.melspectrogram(y=np.array(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Normalize the train and val features

    The scaler is fitted on the training data only and then applied to both
    sets, so no validation statistics leak into the scaling. Because the
    samples are flattened to (n_samples, n_time * n_features) before fitting,
    every (time step, feature) position gets its own min and max.

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Normalized training feature
      val_X (npy): Normalized validation feature
    """
    # flatten every sample to one row of length n_time * n_features
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    val_X_reshape = val_X.reshape(-1, n_time * n_features)

    # use minmaxscaler to normalize the train and val features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    val_X_scaled = scaler.transform(val_X_reshape)

    # reshape the features to the original shape (n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    val_X = val_X_scaled.reshape(-1, n_time, n_features)

    return train_X, val_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features

    NOTE: the global Keras session is cleared before the pooling model is built.

    Parameters:
      train_X (npy): Training feature
      val_X (npy): Validation feature
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Avgpooled training feature
      val_X (npy): Avgpooled validation feature
    """
    # instantiate the keras layers and model.
    # the model only performs average pooling of inputs
    tf.keras.backend.clear_session()
    inputs = tf.keras.layers.Input(shape=(n_time, n_features))  # named inputs to avoid shadowing the builtin input()
    pooled = tf.keras.layers.GlobalAveragePooling1D()(inputs)
    pooling_model = tf.keras.models.Model(inputs=inputs, outputs=pooled)

    # use the instantiated model to avgpool the train and val features
    train_X = pooling_model.predict(train_X)
    val_X = pooling_model.predict(val_X)

    return train_X, val_X


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, val_features_dict):
    """
    Apply normalization or average pooling to train and val features

    The dictionaries are updated in place and also returned. 'continent',
    'rating' and 'type' are left untouched. When both do_normalize and
    do_avgpool are set, normalization runs first and pooling second.

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      val_features_dict (dict): Dictionary containing the validation features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and validation features dictionary

    Raises:
      ValueError: If a key is neither a metadata feature nor a known audio feature.
    """
    start_time = time.time()
    self._log('features processing started ---------->')

    # size of the feature axis for every audio feature
    feature_widths = {
      'mfcc': self.n_mfccs,
      'chroma': self.n_chroma,
      'rms': 1,
      'spectral_centroid': 1,
      'melspectrogram': self.n_mels,
    }

    for each in train_features_dict.keys():
      # 'continent', 'rating', and 'type' do not need to be processed
      if each in self.METADATA_FEATURES:
        continue

      # fail loudly instead of silently reusing the width of the previous feature
      if each not in feature_widths:
        raise ValueError(f"cannot process unknown feature '{each}'")
      n_features = feature_widths[each]

      # index the dictionary to find the value of the feature based on the feature name (key)
      train_feature = train_features_dict[each]
      val_feature = val_features_dict[each]
      # the time axis of the training feature is used for both train and val
      n_time = train_feature.shape[1]

      # call normalization and/or avgpooling function; normalization always comes first
      if self.do_normalize:
        train_feature, val_feature = self.normalization(train_feature, val_feature, n_time=n_time, n_features=n_features)
      if self.do_avgpool:
        train_feature, val_feature = self.avgpooling(train_feature, val_feature, n_time=n_time, n_features=n_features)

      train_features_dict[each], val_features_dict[each] = train_feature, val_feature

      self._log(f'{each} processed')

    end_time = time.time()
    self._log(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Every element of a row's 'framed' column produces one sample: the row's
    'primary_label' is repeated for it, metadata features are copied from the
    row, and audio features are computed by the matching extract_x() method.

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    self._log('feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # resolve once, outside the loops, how each requested feature is obtained;
    # iterating the dict keys also guarantees each feature is appended once per frame
    metadata = [name for name in features_dict if name in self.METADATA_FEATURES]
    extractors = {}
    for name in features_dict:
      # dynamically look up the extract_x function for the listed feature
      func = getattr(self, f"extract_{name}", None)
      if callable(func):
        extractors[name] = func

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        # metadata is per recording, so it is repeated for every frame of the recording
        for name in metadata:
          features_dict[name].append(row[name])

        for name, func in extractors.items():
          features_dict[name].append(func(each))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    self._log(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and val framed audio data

In [3]:
# Load the pickled training DataFrame; its 'framed' column holds the audio frames of each recording.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you created yourself.
with open('/content/drive/MyDrive/project/train_val_csv_pkl/train_df_5_sec.pkl', 'rb') as file:
  train_df = pickle.load(file)

# preview the first rows
train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
240,eaywag1,call,42.965,9.4512,5.0,eaywag1/XC471811.ogg,17.8155,FR,EUROPE,eaywag1/XC471811.npy,train,"((tf.Tensor(7.637085e-06, shape=(), dtype=floa..."
542,barswa,song,42.806,13.8335,4.0,barswa/XC371853.ogg,50.834313,IT,EUROPE,barswa/XC371853.npy,train,"((tf.Tensor(-1.867149e-06, shape=(), dtype=flo..."
214,comsan,call,60.2357,25.0058,1.0,comsan/XC554068.ogg,25.6,FI,EUROPE,comsan/XC554068.npy,train,"((tf.Tensor(8.780201e-06, shape=(), dtype=floa..."
492,barswa,song,53.9299,-2.9833,2.5,barswa/XC690496.ogg,15.098812,GB,EUROPE,barswa/XC690496.npy,train,"((tf.Tensor(4.4646572e-07, shape=(), dtype=flo..."
190,comsan,call,51.5579,17.509,4.0,comsan/XC492893.ogg,11.755,PL,EUROPE,comsan/XC492893.npy,train,"((tf.Tensor(-3.6929268e-06, shape=(), dtype=fl..."


In [4]:
# Load the pickled validation DataFrame; its 'framed' column holds the audio frames of each recording.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you created yourself.
with open('/content/drive/MyDrive/project/train_val_csv_pkl/val_df_5_sec.pkl', 'rb') as file:
  val_df = pickle.load(file)

# preview the first rows
val_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
969,comsan,call,41.8074,-8.8626,4.0,comsan/XC670080.ogg,11.206,PT,EUROPE,comsan/XC670080.npy,val,"((tf.Tensor(9.328758e-06, shape=(), dtype=floa..."
851,comsan,blank,51.6578,19.3281,5.0,comsan/XC738993.ogg,11.616,PL,EUROPE,comsan/XC738993.npy,val,"((tf.Tensor(6.5908675e-06, shape=(), dtype=flo..."
998,barswa,song,48.2131,-3.0137,5.0,barswa/XC643586.ogg,49.536,FR,EUROPE,barswa/XC643586.npy,val,"((tf.Tensor(1.9546133e-06, shape=(), dtype=flo..."
771,comsan,call,56.0779,47.9129,5.0,comsan/XC371997.ogg,12.355937,RU,EUROPE,comsan/XC371997.npy,val,"((tf.Tensor(5.00679e-06, shape=(), dtype=float..."
988,eaywag1,call,52.2003,-6.4349,2.0,eaywag1/XC687527.ogg,15.464,IE,EUROPE,eaywag1/XC687527.npy,val,"((tf.Tensor(2.4605542e-06, shape=(), dtype=flo..."


# Extract features

In [5]:
# audio features to extract; every name must be one of the features accepted by Extraction
features_list = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram']

In [6]:
%%time

# Extraction and processing of both splits run inside the constructor.
# The audio features are min-max normalized but not average pooled.
features = Extraction(
  train_df=train_df,
  val_df=val_df,
  features=features_list,
  do_normalize=True,
  do_avgpool=False,
)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 11.9 mins'

'feature extraction started ---------->'

'features extraction took 6.3 mins'

'features processing started ---------->'

'mfcc processed'

'chroma processed'

'rms processed'

'spectral_centroid processed'

'melspectrogram processed'

'features processing took 0.0 mins'

CPU times: user 17min 16s, sys: 12min 15s, total: 29min 32s
Wall time: 18min 16s


In [7]:
# training labels, one entry per extracted audio frame
train_y = features.train_y

# sanity check: number of training frames and the first few labels
display(train_y.shape)
display(train_y[:5])

(8236,)

array(['eaywag1', 'eaywag1', 'eaywag1', 'eaywag1', 'eaywag1'], dtype='<U7')

In [8]:
train_features = features.train_features

# for every extracted feature show its name, the shape of the full array and the first sample
for key, value in train_features.items():
  display(key)
  display(value.shape)
  display(value[0])

'mfcc'

(8236, 157, 20)

array([[0.25524157, 0.49361926, 0.5395643 , ..., 0.5395047 , 0.5315723 ,
        0.4432514 ],
       [0.24719143, 0.44379455, 0.5591471 , ..., 0.6371831 , 0.5456409 ,
        0.41449252],
       [0.2950486 , 0.45913303, 0.54112226, ..., 0.6816203 , 0.60937226,
        0.3714976 ],
       ...,
       [0.35305548, 0.38608742, 0.5380844 , ..., 0.59507096, 0.5259153 ,
        0.46720576],
       [0.3647915 , 0.35044894, 0.5708952 , ..., 0.43645465, 0.59872067,
        0.49044472],
       [0.37175345, 0.32072663, 0.63013554, ..., 0.41715518, 0.6089424 ,
        0.418767  ]], dtype=float32)

'chroma'

(8236, 157, 12)

array([[0.82161397, 1.        , 0.53707725, ..., 0.37460336, 0.5418486 ,
        0.7557603 ],
       [0.631908  , 1.        , 0.3810592 , ..., 0.11571001, 0.22267492,
        0.27532226],
       [0.8004797 , 1.0000001 , 0.346143  , ..., 0.12254407, 0.186572  ,
        0.2575719 ],
       ...,
       [0.25490853, 0.39408174, 0.15544528, ..., 0.11202694, 0.07273073,
        0.12418766],
       [0.278215  , 0.24971205, 0.20988451, ..., 0.15236686, 0.4174427 ,
        0.6980743 ],
       [0.05270218, 0.10585765, 0.11305311, ..., 0.08448039, 0.30557925,
        0.23732732]], dtype=float32)

'rms'

(8236, 157, 1)

array([[2.6375562e-04],
       [4.5816271e-04],
       [5.0920702e-04],
       [5.8771687e-04],
       [5.5731949e-04],
       [5.3923117e-04],
       [5.8927707e-04],
       [5.8713631e-04],
       [5.4910523e-04],
       [7.0405670e-04],
       [9.4424462e-04],
       [1.1168404e-03],
       [1.0653572e-03],
       [8.6202123e-04],
       [1.2056167e-03],
       [3.3745259e-02],
       [6.6267177e-02],
       [7.1777239e-02],
       [6.4996779e-02],
       [1.0275630e-01],
       [1.5441386e-01],
       [1.3730411e-01],
       [1.2176923e-01],
       [8.4493555e-02],
       [1.0506801e-02],
       [1.2803611e-03],
       [1.2859026e-03],
       [1.2505248e-03],
       [7.6322630e-04],
       [5.5099576e-04],
       [5.7182857e-04],
       [5.5238471e-04],
       [4.6625044e-04],
       [4.4070816e-04],
       [9.3954551e-04],
       [8.2508556e-04],
       [7.8747107e-04],
       [7.7570544e-04],
       [8.9455448e-04],
       [1.5772196e-03],
       [1.4579098e-03],
       [1.751890

'spectral_centroid'

(8236, 157, 1)

array([[0.57337621],
       [0.53776801],
       [0.54457204],
       [0.57521795],
       [0.65339129],
       [0.69669815],
       [0.64056215],
       [0.54801452],
       [0.51817379],
       [0.56901767],
       [0.68322192],
       [0.74700854],
       [0.734039  ],
       [0.63728494],
       [0.57202249],
       [0.57696286],
       [0.59248134],
       [0.61513627],
       [0.64475904],
       [0.7944066 ],
       [0.82766711],
       [0.73966229],
       [0.69204237],
       [0.66083608],
       [0.72370764],
       [0.75462254],
       [0.74631373],
       [0.69247622],
       [0.65153089],
       [0.63996927],
       [0.62023941],
       [0.57519626],
       [0.58221766],
       [0.57971797],
       [0.52175561],
       [0.53946833],
       [0.55667616],
       [0.52752015],
       [0.48649065],
       [0.59845184],
       [0.6715698 ],
       [0.6603986 ],
       [0.52195188],
       [0.51057924],
       [0.50888086],
       [0.63416809],
       [0.65198951],
       [0.605

'melspectrogram'

(8236, 157, 20)

array([[0.26123977, 0.21945232, 0.19433105, ..., 0.16284543, 0.16774637,
        0.19318956],
       [0.25340915, 0.16392392, 0.19068044, ..., 0.22753304, 0.19299585,
        0.15846723],
       [0.25611585, 0.19613642, 0.25122166, ..., 0.267843  , 0.24718678,
        0.21001625],
       ...,
       [0.262945  , 0.24806285, 0.23691773, ..., 0.3431118 , 0.47619197,
        0.44431853],
       [0.26405883, 0.2368877 , 0.21810949, ..., 0.3940933 , 0.45707822,
        0.4567468 ],
       [0.26778537, 0.24493736, 0.19977653, ..., 0.45978346, 0.4989148 ,
        0.45877394]], dtype=float32)

In [9]:
# validation labels, one entry per extracted audio frame
val_y = features.val_y
# NOTE: extract_features() already returns a NumPy array, so this call only makes a copy
val_y = np.array(val_y)

# sanity check: number of validation frames and the first few labels
display(val_y.shape)
display(val_y[:5])

(4236,)

array(['comsan', 'comsan', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [10]:
val_features = features.val_features

# for every extracted feature show its name, the shape of the full array and the first sample
for key, value in val_features.items():
  display(key)
  display(value.shape)
  display(value[0])

'mfcc'

(4236, 157, 20)

array([[0.12257361, 0.49381563, 0.5363848 , ..., 0.54724383, 0.5410692 ,
        0.43632296],
       [0.20635414, 0.46878126, 0.39141995, ..., 0.5889858 , 0.5980245 ,
        0.542831  ],
       [0.33625484, 0.46939433, 0.39404476, ..., 0.55657846, 0.5933179 ,
        0.5046429 ],
       ...,
       [0.60740757, 0.49200696, 0.3000493 , ..., 0.37832415, 0.41243577,
        0.43907583],
       [0.6111891 , 0.48030654, 0.2642255 , ..., 0.34313023, 0.39706486,
        0.50338286],
       [0.6119388 , 0.4810381 , 0.27204967, ..., 0.39163285, 0.41351283,
        0.45217353]], dtype=float32)

'chroma'

(4236, 157, 12)

array([[0.63831   , 0.6402529 , 0.634852  , ..., 0.67859095, 0.81784767,
        0.68421537],
       [0.8057598 , 0.54312015, 0.7578703 , ..., 0.5369649 , 0.7538784 ,
        0.57108074],
       [0.53621113, 0.46964788, 1.0000001 , ..., 0.85127556, 0.8402061 ,
        0.4952016 ],
       ...,
       [0.56576276, 0.5783525 , 0.6500901 , ..., 1.        , 0.85441506,
        0.3876618 ],
       [0.5738298 , 1.0000001 , 0.99844927, ..., 0.6838631 , 0.7646755 ,
        0.47221422],
       [0.4907602 , 1.        , 0.6255037 , ..., 0.63982683, 0.628425  ,
        0.57075745]], dtype=float32)

'rms'

(4236, 157, 1)

array([[0.00010426],
       [0.0006315 ],
       [0.0012026 ],
       [0.00225174],
       [0.00371365],
       [0.00475004],
       [0.0064033 ],
       [0.00821429],
       [0.00876107],
       [0.00921157],
       [0.01107191],
       [0.01649786],
       [0.01852473],
       [0.01923229],
       [0.01971584],
       [0.01716523],
       [0.01416391],
       [0.01485883],
       [0.01193624],
       [0.01123923],
       [0.01320501],
       [0.01164537],
       [0.01094686],
       [0.01137786],
       [0.01222731],
       [0.01352617],
       [0.014822  ],
       [0.01637374],
       [0.01371586],
       [0.01285068],
       [0.01356701],
       [0.01364477],
       [0.01283439],
       [0.0126944 ],
       [0.01515989],
       [0.01340893],
       [0.01224498],
       [0.01165101],
       [0.01219794],
       [0.01341363],
       [0.01260326],
       [0.01346611],
       [0.01551141],
       [0.01520022],
       [0.01369088],
       [0.01427731],
       [0.01225941],
       [0.011

'spectral_centroid'

(4236, 157, 1)

array([[0.53367753],
       [0.44703051],
       [0.4728109 ],
       [0.50805186],
       [0.55379061],
       [0.58286335],
       [0.58264086],
       [0.55110893],
       [0.49350227],
       [0.48463893],
       [0.4936866 ],
       [0.49658384],
       [0.50778496],
       [0.51003084],
       [0.50353215],
       [0.493073  ],
       [0.48253984],
       [0.46082972],
       [0.46801654],
       [0.45530475],
       [0.4464236 ],
       [0.4384515 ],
       [0.467609  ],
       [0.54142372],
       [0.58907474],
       [0.52650417],
       [0.49621278],
       [0.51877048],
       [0.52548328],
       [0.53088992],
       [0.51588885],
       [0.47842333],
       [0.49781405],
       [0.5080489 ],
       [0.500679  ],
       [0.51531876],
       [0.50692652],
       [0.4846708 ],
       [0.46160621],
       [0.49092231],
       [0.51357833],
       [0.53218764],
       [0.55626354],
       [0.59653586],
       [0.58953021],
       [0.53958173],
       [0.51937782],
       [0.509

'melspectrogram'

(4236, 157, 20)

array([[0.12102097, 0.05396634, 0.05933326, ..., 0.04239196, 0.03105116,
        0.03611428],
       [0.14085478, 0.05847347, 0.09956008, ..., 0.14534974, 0.13388371,
        0.15331137],
       [0.21521336, 0.14542067, 0.19903266, ..., 0.25661063, 0.27904606,
        0.28285092],
       ...,
       [0.36834198, 0.4083164 , 0.47958025, ..., 0.45183077, 0.4902231 ,
        0.4984774 ],
       [0.37136614, 0.39674044, 0.46400753, ..., 0.44627494, 0.46096855,
        0.50126386],
       [0.3717538 , 0.40080598, 0.45010328, ..., 0.4511329 , 0.47429442,
        0.5038996 ]], dtype=float32)

# Encode Classes

In [11]:
# fit the encoder on the training labels only, then apply the same mapping to the validation labels
label_encoder = LabelEncoder().fit(train_y)
train_y = label_encoder.transform(train_y)
val_y = label_encoder.transform(val_y)

# classes_ holds every fitted class in encoded order (classes[i] is the species encoded as i),
# so the list stays correct for any number of species instead of assuming exactly three
classes = list(label_encoder.classes_)
classes

['barswa', 'comsan', 'eaywag1']

In [12]:
# sanity check the integer-encoded training labels: count and first few values
display(len(train_y))
display(train_y[:10])

# same check for the integer-encoded validation labels
display(len(val_y))
display(val_y[:10])

8236

array([2, 2, 2, 2, 2, 2, 0, 0, 0, 0])

4236

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0])

# Put the labels and features into one large dictionary

In [13]:
# store the encoded training labels next to the features under the 'label' key
# (train_features is the same dict object as features.train_features, so that one gains the key too)
train_features['label'] = train_y
train_features

{'mfcc': array([[[0.25524157, 0.49361926, 0.5395643 , ..., 0.5395047 ,
          0.5315723 , 0.4432514 ],
         [0.24719143, 0.44379455, 0.5591471 , ..., 0.6371831 ,
          0.5456409 , 0.41449252],
         [0.2950486 , 0.45913303, 0.54112226, ..., 0.6816203 ,
          0.60937226, 0.3714976 ],
         ...,
         [0.35305548, 0.38608742, 0.5380844 , ..., 0.59507096,
          0.5259153 , 0.46720576],
         [0.3647915 , 0.35044894, 0.5708952 , ..., 0.43645465,
          0.59872067, 0.49044472],
         [0.37175345, 0.32072663, 0.63013554, ..., 0.41715518,
          0.6089424 , 0.418767  ]],
 
        [[0.36256403, 0.35165498, 0.53897995, ..., 0.5307283 ,
          0.552157  , 0.41435444],
         [0.34871078, 0.35259488, 0.551566  , ..., 0.55554956,
          0.554857  , 0.45563632],
         [0.33109337, 0.4270955 , 0.5696466 , ..., 0.5756002 ,
          0.6406246 , 0.4395522 ],
         ...,
         [0.3091538 , 0.47132248, 0.4928383 , ..., 0.44519082,
          0.7214

In [14]:
# store the encoded validation labels next to the features under the 'label' key
# (val_features is the same dict object as features.val_features, so that one gains the key too)
val_features['label'] = val_y
val_features

{'mfcc': array([[[0.12257361, 0.49381563, 0.5363848 , ..., 0.54724383,
          0.5410692 , 0.43632296],
         [0.20635414, 0.46878126, 0.39141995, ..., 0.5889858 ,
          0.5980245 , 0.542831  ],
         [0.33625484, 0.46939433, 0.39404476, ..., 0.55657846,
          0.5933179 , 0.5046429 ],
         ...,
         [0.60740757, 0.49200696, 0.3000493 , ..., 0.37832415,
          0.41243577, 0.43907583],
         [0.6111891 , 0.48030654, 0.2642255 , ..., 0.34313023,
          0.39706486, 0.50338286],
         [0.6119388 , 0.4810381 , 0.27204967, ..., 0.39163285,
          0.41351283, 0.45217353]],
 
        [[0.64980435, 0.43505952, 0.26370388, ..., 0.6201824 ,
          0.45315117, 0.41998333],
         [0.6430247 , 0.4096679 , 0.28386742, ..., 0.6734216 ,
          0.43863803, 0.5186633 ],
         [0.62583566, 0.4540848 , 0.34711936, ..., 0.6673721 ,
          0.48802733, 0.5081568 ],
         ...,
         [0.59859735, 0.50536543, 0.29699576, ..., 0.44717816,
          0.4419

In [15]:
# bundle both splits into one nested dict: split name -> feature / 'label' name -> array
merged_dict = {'train': train_features, 'val': val_features}

merged_dict

{'train': {'mfcc': array([[[0.25524157, 0.49361926, 0.5395643 , ..., 0.5395047 ,
           0.5315723 , 0.4432514 ],
          [0.24719143, 0.44379455, 0.5591471 , ..., 0.6371831 ,
           0.5456409 , 0.41449252],
          [0.2950486 , 0.45913303, 0.54112226, ..., 0.6816203 ,
           0.60937226, 0.3714976 ],
          ...,
          [0.35305548, 0.38608742, 0.5380844 , ..., 0.59507096,
           0.5259153 , 0.46720576],
          [0.3647915 , 0.35044894, 0.5708952 , ..., 0.43645465,
           0.59872067, 0.49044472],
          [0.37175345, 0.32072663, 0.63013554, ..., 0.41715518,
           0.6089424 , 0.418767  ]],
  
         [[0.36256403, 0.35165498, 0.53897995, ..., 0.5307283 ,
           0.552157  , 0.41435444],
          [0.34871078, 0.35259488, 0.551566  , ..., 0.55554956,
           0.554857  , 0.45563632],
          [0.33109337, 0.4270955 , 0.5696466 , ..., 0.5756002 ,
           0.6406246 , 0.4395522 ],
          ...,
          [0.3091538 , 0.47132248, 0.4928383 , ..

# save the merged dict with labels and features to pkl

In [16]:
# write the nested dict of train/val features and labels to Google Drive as a single pickle file
with open('/content/drive/MyDrive/project/train_val_features_pkl/train_val_5_sec_audio_features_not_avgpooled.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

In [17]:
# read the file back so the saved copy can be compared with the in-memory dict
# NOTE: despite its name, merged_df is the reloaded nested dict, not a DataFrame
with open('/content/drive/MyDrive/project/train_val_features_pkl/train_val_5_sec_audio_features_not_avgpooled.pkl', 'rb') as file:
  merged_df = pickle.load(file)

In [18]:
# the reloaded object must contain the same splits as the one that was saved
assert merged_dict.keys() == merged_df.keys()

for top_level_key, saved_split in merged_dict.items():
  reloaded_split = merged_df[top_level_key]

  # each split must contain the same feature / label names
  assert saved_split.keys() == reloaded_split.keys()

  for bottom_level_key, saved_array in saved_split.items():
    reloaded_array = reloaded_split[bottom_level_key]

    # each array must survive the round trip with the same shape and the same values
    assert saved_array.shape == reloaded_array.shape
    assert np.array_equal(saved_array, reloaded_array)