# Import Libraries

In [None]:
# For drive access
from google.colab import drive
drive.mount('/content/drive')

# Standard libraries
import numpy as np
import pandas as pd
import time

# For audio
import librosa

# For preprocessing
import tensorflow as tf

# for saving the dataframe to disk
import pickle

Mounted at /content/drive


# Self Defined Class Methods For Feature Extraction

Minor changes were made to the class so that it can be used on the test data.

In [None]:
class Framed:
  """
  Load pre-converted .npy audio clips and split each one into fixed-length frames.

  On instantiation, every file listed in the DataFrame's 'filename_npy' column
  is loaded, optionally augmented, framed with tf.signal.frame, and the result
  is written to a new 'framed' column of that same DataFrame (exposed as self.df).
  """

  # Directory holding the .npy files; used when no audio_dir is passed in.
  DEFAULT_AUDIO_DIR = '/content/drive/MyDrive/Projects/test_npy/'

  def __init__(self,
               dataframe,
               sr=16000,
               window_size_s=5.0,
               hop_size_s=2.5,
               do_augment=False,
               audio_dir=None
               ):
    """
    Instantiate the Framed class and frame every audio listed in the DataFrame.

    The extract_framed() method is automatically called here, so the given
    DataFrame gains a 'framed' column as a side effect of construction.

    Parameters:
      dataframe (pd.DataFrame): DataFrame with a 'filename_npy' column holding
        paths relative to audio_dir. It is modified in place.
      sr (int): Sample rate of the audio files.
      window_size_s (float): Window size in seconds for framing the audio.
      hop_size_s (float): Hop size in seconds for framing the audio.
      do_augment (bool): Whether to apply augmentation to each audio before framing.
      audio_dir (str or path-like, optional): Directory containing the .npy
        files. Defaults to DEFAULT_AUDIO_DIR when None.
    """
    self.sr = sr
    # window and hop expressed in samples
    self.frame_length = int(window_size_s * sr)
    self.frame_step = int(hop_size_s * sr)
    self.df = dataframe
    self.audio_dir = self.DEFAULT_AUDIO_DIR if audio_dir is None else audio_dir

    # augmentation is opt-in; it is only applied when do_augment is True
    self.extract_framed(self.df, augment=do_augment)


  ########################################
  # augment function is called by extract_framed function if do_augment=True
  ########################################
  def augment(self, audio_array):
    """
    Apply one random augmentation to an audio array.

    Audios of at most 7 seconds are time-stretched and receive no other
    augmentation. Longer audios get one of: no change, additive noise,
    a circular shift, or a pitch shift, chosen uniformly at random.

    Parameters:
      audio_array (npy): 1-D audio array sampled at self.sr.

    Returns:
      audio_array (npy): Augmented audio array.
    """
    # drawn before the length check so the random stream is consumed identically
    # for short and long audios
    choice = np.random.choice(['original','noise','shift','pitch'])

    if len(audio_array) <= 7*self.sr:
      # NOTE(review): the rate is sampled from [0, 1), so it can be arbitrarily
      # close to 0. Per the librosa docs a rate below 1 slows the audio down, so
      # a tiny rate would produce a very long output -- confirm whether a
      # positive lower bound was intended.
      audio_array = librosa.effects.time_stretch(y=audio_array, rate=np.random.uniform(low=0, high=1))
    elif choice == 'noise':
      # noise amplitude is scaled by the peak sample value
      noise_amp = np.random.normal(loc=0.0, scale=0.05)*np.amax(audio_array)
      audio_array = audio_array + noise_amp*np.random.normal(size=audio_array.shape[0])
    elif choice == 'shift':
      # circular shift by up to +/- 5000 samples
      shift_range = int(np.random.uniform(low=-5, high=5)*1000)
      audio_array = np.roll(audio_array, shift_range)
    elif choice == 'pitch':
      audio_array = librosa.effects.pitch_shift(y=audio_array, sr=self.sr, n_steps=np.random.uniform(low=0, high=1))
    # choice == 'original': return the audio unchanged

    return audio_array


  ########################################
  # extract_framed function is called when the class is instantiated to extract the framed audios
  ########################################
  def extract_framed(self, dataframe, augment):
    """
    Load audio files from the given DataFrame, extract framed audios,
    and add the framed audios to the given DataFrame as column 'framed'.

    Parameters:
      dataframe (pd.DataFrame): DataFrame with a 'filename_npy' column; it is
        modified in place.
      augment (bool): Whether to apply augmentation to the audio.
    """
    import os  # local import: only needed to build the file paths below

    framed = []
    for filename in dataframe['filename_npy']:
      audio = np.load(os.path.join(self.audio_dir, filename))
      # if augment is true, call the augment function to apply augmentation to the audio
      if augment:
        audio = self.augment(audio)
      # frame immediately so the raw audios do not all have to be kept in memory;
      # pad_end=False drops any trailing samples that do not fill a whole frame
      framed.append(
          tf.signal.frame(audio, self.frame_length, self.frame_step, pad_end=False)
      )

    # add framed audios to df
    dataframe['framed'] = framed

# Load csv data

In [None]:
# Load the test-set metadata table from Drive and preview it
df = pd.read_csv(
    '/content/drive/MyDrive/Projects/clean_test_df_w_npy.csv'
)

df.head(n=5)

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy
0,barswa,blank,53.2509,5.598,4.0,barswa/XC721711.ogg,19.069375,NL,EUROPE,barswa/XC721711.npy
1,comsan,call,48.8306,2.1999,4.0,comsan/XC496602.ogg,28.995938,FR,EUROPE,comsan/XC496602.npy
2,eaywag1,blank,43.3298,4.8364,4.0,eaywag1/XC718445.ogg,7.340438,FR,EUROPE,eaywag1/XC718445.npy
3,eaywag1,call,37.1357,-7.6138,4.5,eaywag1/XC481360.ogg,151.944,PT,EUROPE,eaywag1/XC481360.npy
4,barswa,blank,19.3551,-99.0467,5.0,barswa/XC698512.ogg,18.703688,MX,AMERICAS,barswa/XC698512.npy


In [None]:
# number of rows in the metadata table
df.shape[0]

448

# Extract test dfs with framed audios

First drop the audios shorter than 8 seconds, since they cannot fill a single 8-second frame.

In [None]:
# rows whose recording is shorter than 8 seconds
filtered = df.loc[df['duration'].lt(8.0)]

filtered

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy
2,eaywag1,blank,43.3298,4.8364,4.0,eaywag1/XC718445.ogg,7.340438,FR,EUROPE,eaywag1/XC718445.npy
6,eaywag1,call,48.7448,-1.5638,4.0,eaywag1/XC614965.ogg,5.25,FR,EUROPE,eaywag1/XC614965.npy
14,barswa,song,49.0715,-106.5309,3.5,barswa/XC186847.ogg,6.739625,CA,AMERICAS,barswa/XC186847.npy
16,eaywag1,call,41.4875,2.3653,5.0,eaywag1/XC672267.ogg,5.903688,ES,EUROPE,eaywag1/XC672267.npy
19,comsan,call,51.3532,-0.1461,4.0,comsan/XC575672.ogg,7.026,GB,EUROPE,comsan/XC575672.npy
20,comsan,call,48.7221,-3.5738,3.0,comsan/XC469853.ogg,6.269,FR,EUROPE,comsan/XC469853.npy
41,comsan,call,59.3158,18.0588,3.0,comsan/XC558727.ogg,6.191,SE,EUROPE,comsan/XC558727.npy
44,comsan,blank,43.7412,-7.867,4.0,comsan/XC736767.ogg,5.825,ES,EUROPE,comsan/XC736767.npy
50,comsan,call,50.7193,3.2228,3.0,comsan/XC552598.ogg,5.4335,BE,EUROPE,comsan/XC552598.npy
55,eaywag1,blank,43.3298,4.8364,4.0,eaywag1/XC718444.ogg,5.146125,FR,EUROPE,eaywag1/XC718444.npy


In [None]:
# Rebind instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell after a previous run does not raise a KeyError.
df = df.drop(index=filtered.index, errors='ignore')

In [None]:
# sanity check: no recording shorter than 8 seconds remains
assert not df['duration'].lt(8.0).any()

In [None]:
%%time

framed = Framed(df,
               window_size_s=8.0,
               hop_size_s=4.0,
               do_augment=False)

CPU times: user 3.75 s, sys: 3.06 s, total: 6.81 s
Wall time: 2min 57s


# Save the df with framed audios for future use

In [None]:
# preview the first rows, including the new 'framed' column
framed.df.head(n=5)

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,framed
0,barswa,blank,53.2509,5.598,4.0,barswa/XC721711.ogg,19.069375,NL,EUROPE,barswa/XC721711.npy,"((tf.Tensor(1.0793006e-05, shape=(), dtype=flo..."
1,comsan,call,48.8306,2.1999,4.0,comsan/XC496602.ogg,28.995938,FR,EUROPE,comsan/XC496602.npy,"((tf.Tensor(-1.02154445e-05, shape=(), dtype=f..."
3,eaywag1,call,37.1357,-7.6138,4.5,eaywag1/XC481360.ogg,151.944,PT,EUROPE,eaywag1/XC481360.npy,"((tf.Tensor(-1.7517013e-06, shape=(), dtype=fl..."
4,barswa,blank,19.3551,-99.0467,5.0,barswa/XC698512.ogg,18.703688,MX,AMERICAS,barswa/XC698512.npy,"((tf.Tensor(-2.801371e-06, shape=(), dtype=flo..."
5,comsan,call,56.0851,47.2602,5.0,comsan/XC492456.ogg,44.512687,RU,EUROPE,comsan/XC492456.npy,"((tf.Tensor(-1.2049219e-05, shape=(), dtype=fl..."


In [None]:
# Serialize the framed DataFrame to Drive
with open('/content/drive/MyDrive/Projects/test_csv_pkl/test_df_8_sec.pkl', 'wb') as pkl_out:
  pickle.dump(framed.df, pkl_out)

# drop the closed handle from the notebook namespace
del pkl_out

## Reload the pickle file to confirm it matches the original df

In [None]:
# Read back the pickle written above (a file this notebook produced itself)
with open('/content/drive/MyDrive/Projects/test_csv_pkl/test_df_8_sec.pkl', 'rb') as pkl_in:
  loaded_df = pickle.load(pkl_in)

# drop the closed handle from the notebook namespace
del pkl_in

In [None]:
# preview the reloaded DataFrame
loaded_df.head(n=5)

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy,framed
0,barswa,blank,53.2509,5.598,4.0,barswa/XC721711.ogg,19.069375,NL,EUROPE,barswa/XC721711.npy,"((tf.Tensor(1.0793006e-05, shape=(), dtype=flo..."
1,comsan,call,48.8306,2.1999,4.0,comsan/XC496602.ogg,28.995938,FR,EUROPE,comsan/XC496602.npy,"((tf.Tensor(-1.02154445e-05, shape=(), dtype=f..."
3,eaywag1,call,37.1357,-7.6138,4.5,eaywag1/XC481360.ogg,151.944,PT,EUROPE,eaywag1/XC481360.npy,"((tf.Tensor(-1.7517013e-06, shape=(), dtype=fl..."
4,barswa,blank,19.3551,-99.0467,5.0,barswa/XC698512.ogg,18.703688,MX,AMERICAS,barswa/XC698512.npy,"((tf.Tensor(-2.801371e-06, shape=(), dtype=flo..."
5,comsan,call,56.0851,47.2602,5.0,comsan/XC492456.ogg,44.512687,RU,EUROPE,comsan/XC492456.npy,"((tf.Tensor(-1.2049219e-05, shape=(), dtype=fl..."


In [None]:
# Compare every column except 'framed' (tensors need an element-wise check).
# Deriving the list from the frame avoids silently skipping columns such as
# 'continent', which a hand-written list can miss.
columns = [name for name in framed.df.columns if name != 'framed']

In [None]:
# the selected columns must survive the pickle round trip unchanged
original_subset = framed.df[columns]
reloaded_subset = loaded_df[columns]
assert original_subset.equals(reloaded_subset)

Tensor objects cannot be compared directly with `DataFrame.equals()`, so they are compared row by row in the cell below.

In [None]:
# Count the rows whose framed tensor differs after the pickle round trip.
# A row matches only if the two tensors have the same shape and EVERY element
# is equal; the previous np.any() check passed as soon as one element matched.
assert len(framed.df) == len(loaded_df)

count = 0

for original_frames, reloaded_frames in zip(framed.df['framed'], loaded_df['framed']):
  if not np.array_equal(np.asarray(original_frames), np.asarray(reloaded_frames)):
    count += 1

assert count == 0