# Import Libraries

In [1]:
# for drive access
from google.colab import drive
drive.mount('/content/drive')

# standard libraries
import numpy as np
import pandas as pd
import pickle
import time

# for audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# for saving the label & features to disk
import pickle

Mounted at /content/drive


# Self Defined Class Methods For Feature Extraction

In [2]:
class Extraction:
  """
  Extract labels and audio features from framed-audio DataFrames.

  Each DataFrame row is expected to provide a 'primary_label' column and a 'framed' column
  holding an iterable of audio frames; every frame becomes one sample. Metadata features
  ('continent', 'rating', 'type') are copied from the row, the remaining features are
  computed per frame with librosa.

  After construction the results are available as:
    train_y, val_y (np.ndarray): one label per frame.
    train_features, val_features (dict): feature name -> np.ndarray with one entry per frame.
  """

  # features copied straight from the DataFrame row; they are never normalized or pooled
  _METADATA_FEATURES = ('continent', 'rating', 'type')
  # features computed per frame by the extract_<name>() methods; stacked as (n_samples, n_time, n_features)
  _NUMERIC_FEATURES = ('mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram')

  def __init__(self,
               train_df,
               val_df,
               sr=16000,
               n_mfccs=20, #20 is the default n_mfccs from librosa
               n_mels=20, #reduced from 128 default from librosa to 20
               n_chroma=12, #12 is the default n_chroma from librosa
               features=['mfcc'],
               do_normalize=True,
               do_avgpool=True #WARNING: THE KERNEL WILL CRASH IF YOU TRY TO EXTRACT ALL FEATURES WITHOUT AVGPOOL
               ):
    """
    Instantiate the Extraction class and extract the labels and features from train and val DataFrames.

    extract_features() is called on each DataFrame, then process_features() applies the
    requested normalization and/or average pooling to the numeric features.

    Parameters:
      train_df (pd.DataFrame): DataFrame containing training data.
      val_df (pd.DataFrame): DataFrame containing validation data.
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract. The list is copied (never mutated) and
        duplicate names are dropped while keeping the first-seen order.
        accepted features: 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type'.
      do_normalize (bool): Whether to min-max normalize the numeric features (scaler fitted on train only).
      do_avgpool (bool): Whether to average the numeric features over the time axis.

    Raises:
      AssertionError: If features is empty or contains a name that is not accepted.
    """
    # instantiate class variables
    self.sr = sr
    self.n_mfccs = n_mfccs
    self.n_mels = n_mels
    self.n_chroma = n_chroma
    self.do_normalize = do_normalize
    self.do_avgpool = do_avgpool

    # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."

    # confirm features are valid
    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', 'type']
    requested = list(features)
    for each in requested:
      assert each in self.accepted_feature, f"{each} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'spectral_centroid', 'melspectrogram', 'continent', 'rating', and 'type' are accpeted features."

    # store a de-duplicated copy: the instance must not alias the caller's list (or the shared
    # default argument), and a repeated name would otherwise be extracted twice per frame
    self.features = list(dict.fromkeys(requested))

    # extract train and val labels and features
    self.train_y, self.train_features = self.extract_features(train_df)
    self.val_y, self.val_features = self.extract_features(val_df)

    # process the features by applying normalization or average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)


  ########################################
  # each of the extract_x() function is callable by the extract_features() function based on the features instantiated
  # each of the features are extracted from librosa and transposed to shape (n_time, n_features)
  ########################################
  def extract_mfcc(self, each):
    """Return the MFCCs of one audio frame, transposed to (n_time, n_mfccs)."""
    return np.transpose(librosa.feature.mfcc(y=np.asarray(each), sr=self.sr, n_mfcc=self.n_mfccs))

  def extract_chroma(self, each):
    """Return the STFT chromagram of one audio frame, transposed to (n_time, n_chroma)."""
    return np.transpose(librosa.feature.chroma_stft(y=np.asarray(each), sr=self.sr, n_chroma=self.n_chroma))

  def extract_rms(self, each):
    """Return the RMS energy of one audio frame, transposed so time is the first axis."""
    return np.transpose(librosa.feature.rms(y=np.asarray(each)))

  def extract_spectral_centroid(self, each):
    """Return the spectral centroid of one audio frame, transposed so time is the first axis."""
    return np.transpose(librosa.feature.spectral_centroid(y=np.asarray(each), sr=self.sr))

  def extract_melspectrogram(self, each):
    """Return the dB-scaled mel spectrogram of one audio frame, transposed to (n_time, n_mels)."""
    mel = librosa.feature.melspectrogram(y=np.asarray(each), sr=self.sr, n_mels=self.n_mels)
    return np.transpose(librosa.power_to_db(mel))


  ########################################
  # normalization function is called by process_features() function for the features that require normalization
  ########################################
  def normalization(self, train_X, val_X, n_time, n_features):
    """
    Min-max normalize the train and val features.

    The scaler is fitted on the training data only and then applied to both sets, so
    validation values may fall slightly outside [0, 1]. Because each sample is flattened to
    one row, a separate min/max is learned for every (time step, feature) position.

    Parameters:
      train_X (npy): Training feature of shape (n_samples, n_time, n_features)
      val_X (npy): Validation feature of shape (n_samples, n_time, n_features)
      n_time (int): Length of the time axis
      n_features (int): Length of the feature axis

    Returns:
      train_X (npy): Normalized training feature, same shape as the input
      val_X (npy): Normalized validation feature, same shape as the input
    """
    # flatten every sample to one row: (n_samples, n_time * n_features)
    train_X_reshape = train_X.reshape(-1, n_time * n_features)
    val_X_reshape = val_X.reshape(-1, n_time * n_features)

    # use minmaxscaler to normalize the train and val features
    scaler = MinMaxScaler((0,1)).fit(train_X_reshape)
    train_X_scaled = scaler.transform(train_X_reshape)
    val_X_scaled = scaler.transform(val_X_reshape)

    # reshape the features to the original shape (n_samples, n_time, n_features)
    train_X = train_X_scaled.reshape(-1, n_time, n_features)
    val_X = val_X_scaled.reshape(-1, n_time, n_features)

    return train_X, val_X


  ########################################
  # avgpooling function is called by process_features() function for the features that require average pooling
  ########################################
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features over the time axis.

    This is a plain mean over axis 1 computed with NumPy. It replaces a throw-away Keras
    GlobalAveragePooling1D model, which needed tf.keras.backend.clear_session() (a global
    side effect) to compute the same values. The result is returned as float32, the dtype
    the Keras model produced.

    Parameters:
      train_X (npy): Training feature of shape (n_samples, n_time, n_features)
      val_X (npy): Validation feature of shape (n_samples, n_time, n_features)
      n_time (int): Length of the time axis
      n_features (int): Length of the feature axis

    Returns:
      train_X (npy): Avgpooled training feature of shape (n_samples, n_features)
      val_X (npy): Avgpooled validation feature of shape (n_samples, n_features)

    Raises:
      ValueError: If an input is not of shape (n_samples, n_time, n_features).
    """
    expected_shape = (n_time, n_features)
    pooled = []

    for name, X in (('train_X', train_X), ('val_X', val_X)):
      X = np.asarray(X)
      if X.ndim != 3 or X.shape[1:] != expected_shape:
        raise ValueError(f"{name} must have shape (n_samples, {n_time}, {n_features}), got {X.shape}")
      # accumulate in float64 for accuracy, then cast to the float32 output dtype
      pooled.append(np.mean(X, axis=1, dtype=np.float64).astype(np.float32))

    train_X, val_X = pooled

    return train_X, val_X


  ########################################
  # process_features function is called when the class is instantiated
  ########################################
  def process_features(self, train_features_dict, val_features_dict):
    """
    Apply normalization and/or average pooling to train and val features.

    Only the numeric librosa features are processed; metadata features and any other key
    present in the dictionaries are left untouched. The dictionaries are updated in place
    and also returned.

    Parameters:
      train_features_dict (dict): Dictionary containing the training features
      val_features_dict (dict): Dictionary containing the validation features

    Returns:
      tuple: Tuple containing normalized and/or average pooled training and validation features dictionary

    Raises:
      ValueError: If processing is requested and a numeric feature is not a 3-D array, or
        its train and val arrays disagree on (n_time, n_features).
    """
    start_time = time.time()
    display('features processing started ---------->')

    for each in train_features_dict.keys():
      # 'continent', 'rating', 'type' (and any non-numeric key) do not need to be processed
      if each not in self._NUMERIC_FEATURES:
        continue

      # index the dictionary to find the value of the feature based on the feature name (key)
      train_feature = train_features_dict[each]
      val_feature = val_features_dict[each]

      if self.do_normalize or self.do_avgpool:
        # read the axis sizes from the data itself and make sure val matches train, otherwise
        # the reshape inside normalization() could silently regroup the validation samples
        if train_feature.ndim != 3 or val_feature.ndim != 3 or train_feature.shape[1:] != val_feature.shape[1:]:
          raise ValueError(
              f"'{each}' must be (n_samples, n_time, n_features) with matching n_time/n_features, "
              f"got train {train_feature.shape} and val {val_feature.shape}")
        n_time, n_features = train_feature.shape[1:]

        # normalization (if requested) always runs before average pooling (if requested)
        if self.do_normalize:
          train_feature, val_feature = self.normalization(train_feature, val_feature, n_time=n_time, n_features=n_features)
        if self.do_avgpool:
          train_feature, val_feature = self.avgpooling(train_feature, val_feature, n_time=n_time, n_features=n_features)

        train_features_dict[each], val_features_dict[each] = train_feature, val_feature

      display(f'{each} processed')

    end_time = time.time()
    display(f'features processing took {(end_time - start_time)/60:.1f} mins')

    return train_features_dict, val_features_dict


  ########################################
  # extract_features function is called when the class is instantiated
  ########################################
  def extract_features(self, dataframe):
    """
    Extract the label & features from the dataframes

    Every frame in a row's 'framed' value yields one sample: the row's 'primary_label' is
    repeated as its label, metadata features are copied from the row, and the remaining
    features are computed by the matching extract_<name>() method.

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information

    Returns:
      tuple: Tuple containing labels and a dictionary of extracted features
    """
    start_time = time.time()
    display('feature extraction started ---------->')

    # create empty list and dict to store the labels and features
    y = []
    features_dict = {item: [] for item in self.features}

    # resolve once (not per frame) which features are row metadata and which have an extractor
    metadata = [name for name in features_dict if name in self._METADATA_FEATURES]
    extractors = {}
    for name in features_dict:
      func = getattr(self, f"extract_{name}", None)
      if callable(func):
        extractors[name] = func

    # iterate through each row of the dataframe to extract the label and features
    for _, row in dataframe.iterrows():
      label = row['primary_label']
      framed = row['framed']

      for each in framed:
        y.append(label)

        for name in metadata:
          features_dict[name].append(row[name])

        if extractors:
          # convert the frame to a numpy array once and share it between all extractors
          frame = np.asarray(each)
          for name, func in extractors.items():
            features_dict[name].append(func(frame))

    # cast lists to np arrays
    for each in features_dict.keys():
      features_dict[each] = np.array(features_dict[each])
    y = np.array(y)

    end_time = time.time()
    display(f'features extraction took {(end_time - start_time)/60:.1f} mins')

    return y, features_dict


# Load train and validation framed audio data

In [3]:
# load the pickled training DataFrame (framed audio) from Drive
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project
with open(
    '/content/drive/MyDrive/project/train_val_csv_pkl/train_df_8_sec.pkl',
    mode='rb',
) as file:
  train_df = pickle.load(file)

train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
668,barswa,blank,-33.2377,26.9574,5.0,barswa/XC713037.ogg,58.331,ZA,AFRICA,barswa/XC713037.npy,train,"((tf.Tensor(-1.3069126e-05, shape=(), dtype=fl..."
247,eaywag1,call,54.1191,13.376,3.5,eaywag1/XC658750.ogg,49.68,DE,EUROPE,eaywag1/XC658750.npy,train,"((tf.Tensor(-1.1132361e-06, shape=(), dtype=fl..."
620,barswa,song,35.0573,34.0009,5.0,barswa/XC405617.ogg,114.207375,CY,ASIA,barswa/XC405617.npy,train,"((tf.Tensor(3.5154517e-06, shape=(), dtype=flo..."
86,comsan,call,43.5352,-1.4475,4.5,comsan/XC580687.ogg,59.82,FR,EUROPE,comsan/XC580687.npy,train,"((tf.Tensor(-7.335485e-06, shape=(), dtype=flo..."
450,eaywag1,song,43.5118,3.8367,3.0,eaywag1/XC567939.ogg,10.031,FR,EUROPE,eaywag1/XC567939.npy,train,"((tf.Tensor(7.8262474e-07, shape=(), dtype=flo..."


In [4]:
# load the pickled validation DataFrame (framed audio) from Drive
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project
with open(
    '/content/drive/MyDrive/project/train_val_csv_pkl/val_df_8_sec.pkl',
    mode='rb',
) as file:
  val_df = pickle.load(file)

val_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,set,framed
991,barswa,song,39.4709,-8.0205,3.5,barswa/XC624409.ogg,10.008,PT,EUROPE,barswa/XC624409.npy,val,"((tf.Tensor(3.148432e-08, shape=(), dtype=floa..."
760,barswa,call,51.4522,-9.8189,5.0,barswa/XC666943.ogg,11.885,IE,EUROPE,barswa/XC666943.npy,val,"((tf.Tensor(-8.617062e-07, shape=(), dtype=flo..."
876,comsan,call,50.7347,3.2143,4.0,comsan/XC578171.ogg,21.707,BE,EUROPE,comsan/XC578171.npy,val,"((tf.Tensor(-3.9801816e-06, shape=(), dtype=fl..."
927,eaywag1,song,46.0192,61.846,5.0,eaywag1/XC184434.ogg,63.921625,KZ,ASIA,eaywag1/XC184434.npy,val,"((tf.Tensor(-4.7415142e-07, shape=(), dtype=fl..."
880,comsan,call,42.9037,13.9077,4.0,comsan/XC433334.ogg,24.006,IT,EUROPE,comsan/XC433334.npy,val,"((tf.Tensor(-1.1874385e-05, shape=(), dtype=fl..."


# Extract features

In [5]:
# audio features to extract for every frame
features_list = [
    'mfcc',
    'chroma',
    'rms',
    'spectral_centroid',
    'melspectrogram',
]

In [6]:
%%time

features = Extraction(train_df,
                      val_df,
                      features=features_list,
                      do_normalize=True,
                      do_avgpool=False)

'feature extraction started ---------->'

  return pitch_tuning(


'features extraction took 9.4 mins'

'feature extraction started ---------->'

'features extraction took 4.7 mins'

'features processing started ---------->'

'mfcc processed'

'chroma processed'

'rms processed'

'spectral_centroid processed'

'melspectrogram processed'

'features processing took 0.0 mins'

CPU times: user 13min 27s, sys: 9min 22s, total: 22min 50s
Wall time: 14min 3s


In [7]:
# training labels produced by the extraction step
train_y = features.train_y

# show how many labels there are and peek at the first five
display(train_y.shape, train_y[:5])

(4763,)

array(['barswa', 'barswa', 'barswa', 'barswa', 'barswa'], dtype='<U7')

In [8]:
train_features = features.train_features

# for every extracted feature show its name, its array shape, and the first sample
for key, values in train_features.items():
  display(key, values.shape, values[0])

'mfcc'

(4763, 251, 20)

array([[0.2926833 , 0.5472208 , 0.51762646, ..., 0.42902297, 0.5246572 ,
        0.39747417],
       [0.38219857, 0.63193434, 0.4692648 , ..., 0.4789598 , 0.5143834 ,
        0.35231233],
       [0.4352324 , 0.62795496, 0.4937711 , ..., 0.53823024, 0.5205363 ,
        0.36665156],
       ...,
       [0.6295364 , 0.3740691 , 0.4684327 , ..., 0.74904454, 0.6659465 ,
        0.13212648],
       [0.6729512 , 0.37194872, 0.47004578, ..., 0.63276273, 0.6894711 ,
        0.25973636],
       [0.6984098 , 0.35604227, 0.50428545, ..., 0.49908003, 0.694181  ,
        0.40737295]], dtype=float32)

'chroma'

(4763, 251, 12)

array([[0.73383594, 0.69802487, 0.54096204, ..., 1.        , 0.608227  ,
        0.48766178],
       [1.        , 0.9552251 , 0.97629696, ..., 0.819532  , 0.46592206,
        0.4471012 ],
       [0.70929813, 0.856629  , 1.        , ..., 0.9266538 , 0.5471759 ,
        0.3217781 ],
       ...,
       [0.88756543, 0.4946972 , 0.0864549 , ..., 0.22002427, 0.10811816,
        0.53788805],
       [1.        , 0.20042695, 0.16474293, ..., 0.17202397, 0.26110023,
        0.83525467],
       [0.90538883, 0.38594005, 0.5073254 , ..., 0.5220612 , 0.7847499 ,
        1.        ]], dtype=float32)

'rms'

(4763, 251, 1)

array([[0.00066166],
       [0.00108187],
       [0.00132881],
       [0.00173146],
       [0.00243525],
       [0.0040241 ],
       [0.00404387],
       [0.00451472],
       [0.00460673],
       [0.00389649],
       [0.00389425],
       [0.00364176],
       [0.00360476],
       [0.00298573],
       [0.00220647],
       [0.00210538],
       [0.00219035],
       [0.00228781],
       [0.00237162],
       [0.00214912],
       [0.00185741],
       [0.00167973],
       [0.00148034],
       [0.00144359],
       [0.00138289],
       [0.0019026 ],
       [0.00349377],
       [0.00674044],
       [0.00596432],
       [0.0055888 ],
       [0.00537581],
       [0.00388524],
       [0.00228059],
       [0.00197732],
       [0.00216695],
       [0.00183421],
       [0.00163533],
       [0.00158946],
       [0.00150203],
       [0.00165542],
       [0.0019687 ],
       [0.00324267],
       [0.00541037],
       [0.00855162],
       [0.00869734],
       [0.00860727],
       [0.00906659],
       [0.007

'spectral_centroid'

(4763, 251, 1)

array([[0.43021879],
       [0.35183347],
       [0.34781837],
       [0.33187161],
       [0.34889517],
       [0.36813308],
       [0.41916818],
       [0.41816175],
       [0.41738839],
       [0.40180619],
       [0.42645345],
       [0.44920148],
       [0.42218698],
       [0.34067167],
       [0.34579915],
       [0.39024828],
       [0.36757338],
       [0.33677927],
       [0.35095777],
       [0.39534415],
       [0.41730669],
       [0.3524761 ],
       [0.33825169],
       [0.36009482],
       [0.3485585 ],
       [0.34525193],
       [0.37783899],
       [0.44370878],
       [0.496219  ],
       [0.5361965 ],
       [0.52634682],
       [0.41777292],
       [0.37638074],
       [0.41059608],
       [0.43082292],
       [0.43644633],
       [0.42599421],
       [0.3998278 ],
       [0.38076336],
       [0.39246149],
       [0.41569275],
       [0.42230446],
       [0.43493215],
       [0.4522872 ],
       [0.46056001],
       [0.44753668],
       [0.44646046],
       [0.463

'melspectrogram'

(4763, 251, 20)

array([[0.24356848, 0.19709486, 0.19181323, ..., 0.14215577, 0.14264905,
        0.16532534],
       [0.40376735, 0.3655113 , 0.42287964, ..., 0.23766714, 0.1897651 ,
        0.1521129 ],
       [0.45815992, 0.46074304, 0.4943219 , ..., 0.2976507 , 0.25157702,
        0.2118563 ],
       ...,
       [0.49117148, 0.43984422, 0.43819213, ..., 0.6310359 , 0.56908995,
        0.5044948 ],
       [0.46418417, 0.4439577 , 0.45830843, ..., 0.6673047 , 0.62528354,
        0.5652833 ],
       [0.4987093 , 0.456418  , 0.46511737, ..., 0.6581471 , 0.6399298 ,
        0.5732062 ]], dtype=float32)

In [9]:
# validation labels produced by the extraction step (np.array makes a copy)
val_y = np.array(features.val_y)

# show how many labels there are and peek at the first five
display(val_y.shape, val_y[:5])

(2430,)

array(['barswa', 'barswa', 'comsan', 'comsan', 'comsan'], dtype='<U7')

In [10]:
val_features = features.val_features

# for every extracted feature show its name, its array shape, and the first sample
for key, values in val_features.items():
  display(key, values.shape, values[0])

'mfcc'

(2430, 251, 20)

array([[0.03438473, 0.5409663 , 0.49516892, ..., 0.47214267, 0.54059166,
        0.31063035],
       [0.156398  , 0.5936351 , 0.34219292, ..., 0.5281046 , 0.4928256 ,
        0.45432433],
       [0.27276182, 0.5529174 , 0.35612273, ..., 0.520854  , 0.5686558 ,
        0.47363648],
       ...,
       [0.53652716, 0.533416  , 0.3665663 , ..., 0.36028466, 0.5355693 ,
        0.3084178 ],
       [0.54577094, 0.54470825, 0.35483423, ..., 0.40590182, 0.4965055 ,
        0.34188586],
       [0.5508863 , 0.53994864, 0.3564601 , ..., 0.43862337, 0.47689667,
        0.32920602]], dtype=float32)

'chroma'

(2430, 251, 12)

array([[0.7646548 , 0.6593877 , 1.        , ..., 0.7147495 , 0.7372933 ,
        0.5769873 ],
       [1.        , 0.9776191 , 0.7299414 , ..., 0.20515485, 0.25375524,
        0.4779138 ],
       [1.0000001 , 0.9596754 , 0.82782555, ..., 0.5209368 , 0.5438989 ,
        0.7936685 ],
       ...,
       [0.16291276, 0.14363815, 0.17468452, ..., 0.20659253, 0.23780039,
        0.16433807],
       [0.2022269 , 0.16572295, 0.19001824, ..., 0.13031365, 0.15897165,
        0.1746402 ],
       [0.14170276, 0.14602776, 0.20580296, ..., 0.14098944, 0.10827189,
        0.16469386]], dtype=float32)

'rms'

(2430, 251, 1)

array([[6.82546815e-05],
       [2.19398047e-04],
       [4.28390456e-04],
       [7.72096566e-04],
       [1.39056332e-03],
       [1.55082159e-03],
       [1.60539453e-03],
       [1.98320020e-03],
       [2.29190313e-03],
       [2.54758098e-03],
       [2.95388582e-03],
       [3.30832833e-03],
       [3.99361039e-03],
       [4.23852867e-03],
       [4.54276334e-03],
       [4.79059434e-03],
       [4.90209414e-03],
       [5.54794865e-03],
       [5.66226617e-03],
       [5.15686395e-03],
       [4.91325883e-03],
       [4.55542188e-03],
       [4.14918549e-03],
       [3.93603183e-03],
       [3.46300332e-03],
       [3.61404824e-03],
       [3.96857969e-03],
       [4.81661782e-03],
       [3.52358283e-03],
       [3.14934039e-03],
       [3.65255191e-03],
       [1.60486680e-02],
       [1.73435844e-02],
       [1.91237796e-02],
       [2.63790451e-02],
       [1.58232972e-02],
       [1.56497397e-02],
       [1.90841872e-02],
       [1.50972689e-02],
       [1.65513232e-02],


'spectral_centroid'

(2430, 251, 1)

array([[0.54656741],
       [0.37865166],
       [0.39853728],
       [0.39662769],
       [0.38410849],
       [0.37113322],
       [0.4095319 ],
       [0.42216998],
       [0.43033031],
       [0.42671694],
       [0.42471528],
       [0.40931995],
       [0.40313889],
       [0.41178189],
       [0.43122065],
       [0.4628318 ],
       [0.44912855],
       [0.44242352],
       [0.4639076 ],
       [0.47380169],
       [0.50822297],
       [0.52067792],
       [0.50788246],
       [0.44669293],
       [0.43140846],
       [0.45022976],
       [0.467945  ],
       [0.4820143 ],
       [0.45524179],
       [0.45646951],
       [0.45099943],
       [0.53541299],
       [0.73791611],
       [0.77111922],
       [0.70074055],
       [0.73957299],
       [0.73627294],
       [0.65617334],
       [0.69831437],
       [0.70299948],
       [0.66123072],
       [0.6271331 ],
       [0.66339409],
       [0.62175528],
       [0.46355516],
       [0.53311691],
       [0.58996036],
       [0.590

'melspectrogram'

(2430, 251, 20)

array([[0.08103681, 0.06281257, 0.04899508, ..., 0.0126223 , 0.01625842,
        0.02704358],
       [0.16806442, 0.15293223, 0.20864499, ..., 0.06657875, 0.05336493,
        0.05504674],
       [0.2551819 , 0.26873422, 0.27525806, ..., 0.1596486 , 0.15128297,
        0.15508932],
       ...,
       [0.43946368, 0.43952537, 0.46801436, ..., 0.3903185 , 0.38137046,
        0.37939993],
       [0.4319826 , 0.45017073, 0.4815686 , ..., 0.39779127, 0.3901048 ,
        0.37982506],
       [0.43024457, 0.43739873, 0.4742092 , ..., 0.37816513, 0.39697692,
        0.37451804]], dtype=float32)

# Encode Classes

In [11]:
# fit the encoder on the training labels only, then reuse the same mapping for validation
# (transform raises if val_y contains a label that never appears in train_y)
label_encoder = LabelEncoder().fit(train_y)
train_y = label_encoder.transform(train_y)
val_y = label_encoder.transform(val_y)

# classes_ holds every fitted class in encoded order (index i <-> encoded label i),
# so this no longer assumes there are exactly three classes
classes = list(label_encoder.classes_)
classes

['barswa', 'comsan', 'eaywag1']

In [12]:
# sanity-check the encoded labels: count and first ten values for each split
display(len(train_y), train_y[:10])

display(len(val_y), val_y[:10])

4763

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

2430

array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

# Put the labels and features into one merged dictionary

In [13]:
# store the encoded training labels next to the features under the 'label' key
train_features.update(label=train_y)
train_features

{'mfcc': array([[[0.2926833 , 0.5472208 , 0.51762646, ..., 0.42902297,
          0.5246572 , 0.39747417],
         [0.38219857, 0.63193434, 0.4692648 , ..., 0.4789598 ,
          0.5143834 , 0.35231233],
         [0.4352324 , 0.62795496, 0.4937711 , ..., 0.53823024,
          0.5205363 , 0.36665156],
         ...,
         [0.6295364 , 0.3740691 , 0.4684327 , ..., 0.74904454,
          0.6659465 , 0.13212648],
         [0.6729512 , 0.37194872, 0.47004578, ..., 0.63276273,
          0.6894711 , 0.25973636],
         [0.6984098 , 0.35604227, 0.50428545, ..., 0.49908003,
          0.694181  , 0.40737295]],
 
        [[0.571982  , 0.52721703, 0.35377806, ..., 0.31411108,
          0.45345846, 0.5325761 ],
         [0.57278824, 0.51283437, 0.33133256, ..., 0.44663262,
          0.59104645, 0.5540432 ],
         [0.5442881 , 0.50030595, 0.39622635, ..., 0.4177069 ,
          0.63780564, 0.5152073 ],
         ...,
         [0.65820706, 0.30996823, 0.5541669 , ..., 0.3272183 ,
          0.6759

In [14]:
# store the encoded validation labels next to the features under the 'label' key
val_features.update(label=val_y)
val_features

{'mfcc': array([[[0.03438473, 0.5409663 , 0.49516892, ..., 0.47214267,
          0.54059166, 0.31063035],
         [0.156398  , 0.5936351 , 0.34219292, ..., 0.5281046 ,
          0.4928256 , 0.45432433],
         [0.27276182, 0.5529174 , 0.35612273, ..., 0.520854  ,
          0.5686558 , 0.47363648],
         ...,
         [0.53652716, 0.533416  , 0.3665663 , ..., 0.36028466,
          0.5355693 , 0.3084178 ],
         [0.54577094, 0.54470825, 0.35483423, ..., 0.40590182,
          0.4965055 , 0.34188586],
         [0.5508863 , 0.53994864, 0.3564601 , ..., 0.43862337,
          0.47689667, 0.32920602]],
 
        [[0.41281384, 0.67141247, 0.65520215, ..., 0.37753204,
          0.5337289 , 0.45304573],
         [0.61477315, 0.7969459 , 0.4981349 , ..., 0.59867835,
          0.59102464, 0.68530947],
         [0.67084944, 0.77747846, 0.53579843, ..., 0.7273464 ,
          0.61758906, 0.62314296],
         ...,
         [0.7601433 , 0.66117793, 0.5779958 , ..., 0.6609509 ,
          0.6841

In [15]:
# bundle both splits into one dictionary: split name -> {feature name / 'label' -> array}
merged_dict = dict(train=train_features, val=val_features)

merged_dict

{'train': {'mfcc': array([[[0.2926833 , 0.5472208 , 0.51762646, ..., 0.42902297,
           0.5246572 , 0.39747417],
          [0.38219857, 0.63193434, 0.4692648 , ..., 0.4789598 ,
           0.5143834 , 0.35231233],
          [0.4352324 , 0.62795496, 0.4937711 , ..., 0.53823024,
           0.5205363 , 0.36665156],
          ...,
          [0.6295364 , 0.3740691 , 0.4684327 , ..., 0.74904454,
           0.6659465 , 0.13212648],
          [0.6729512 , 0.37194872, 0.47004578, ..., 0.63276273,
           0.6894711 , 0.25973636],
          [0.6984098 , 0.35604227, 0.50428545, ..., 0.49908003,
           0.694181  , 0.40737295]],
  
         [[0.571982  , 0.52721703, 0.35377806, ..., 0.31411108,
           0.45345846, 0.5325761 ],
          [0.57278824, 0.51283437, 0.33133256, ..., 0.44663262,
           0.59104645, 0.5540432 ],
          [0.5442881 , 0.50030595, 0.39622635, ..., 0.4177069 ,
           0.63780564, 0.5152073 ],
          ...,
          [0.65820706, 0.30996823, 0.5541669 , ..

# Save the merged dict of labels and features to a pickle file

In [16]:
# write the merged train/val dictionary to Drive
with open(
    '/content/drive/MyDrive/project/train_val_features_pkl/train_val_8_sec_audio_features_not_avgpooled.pkl',
    mode='wb',
) as file:
  pickle.dump(merged_dict, file)

In [17]:
# read the file straight back so the saved copy can be compared with the in-memory dictionary
# (despite its name, merged_df is a dict, not a DataFrame)
with open(
    '/content/drive/MyDrive/project/train_val_features_pkl/train_val_8_sec_audio_features_not_avgpooled.pkl',
    mode='rb',
) as file:
  merged_df = pickle.load(file)

In [18]:
# verify the pickle round trip: same splits, same keys per split, identical arrays
assert merged_dict.keys() == merged_df.keys()

for top_level_key, in_memory_split in merged_dict.items():
  reloaded_split = merged_df[top_level_key]
  assert in_memory_split.keys() == reloaded_split.keys()

  for bottom_level_key, in_memory_array in in_memory_split.items():
    reloaded_array = reloaded_split[bottom_level_key]
    assert in_memory_array.shape == reloaded_array.shape
    assert np.array_equal(in_memory_array, reloaded_array)