# Import Libraries

In [1]:
# drive access
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# standard libraries
import numpy as np
import pandas as pd

In [3]:
# for audio files
import librosa

In [4]:
# for os access
import os

# Load the clean_train_df.csv and clean_test_df.csv files

In [6]:
# load the cleaned training metadata from the mounted Drive
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index; consider index_col=0 — confirm
train_df = pd.read_csv('/content/drive/MyDrive/project/clean_train_df.csv')

# preview the first rows
train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent
0,comsan,call,50.7542,4.5672,4.0,comsan/XC587730.ogg,5.746937,BE,EUROPE
1,eaywag1,call,69.3585,88.2378,3.0,eaywag1/XC675944.ogg,5.355,RU,EUROPE
2,comsan,call,41.1698,0.9761,5.0,comsan/XC664012.ogg,10.488,ES,EUROPE
3,barswa,call,,,2.5,barswa/XC268804.ogg,76.538813,UNKNOWN,UNKNOWN
4,barswa,song,53.9299,-2.9833,2.5,barswa/XC690498.ogg,83.0955,GB,EUROPE


In [7]:
# load the cleaned test metadata from the mounted Drive
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index; consider index_col=0 — confirm
test_df = pd.read_csv('/content/drive/MyDrive/project/clean_test_df.csv')

# preview the first rows
test_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent
0,barswa,blank,53.2509,5.598,4.0,barswa/XC721711.ogg,19.069375,NL,EUROPE
1,comsan,call,48.8306,2.1999,4.0,comsan/XC496602.ogg,28.995938,FR,EUROPE
2,eaywag1,blank,43.3298,4.8364,4.0,eaywag1/XC718445.ogg,7.340438,FR,EUROPE
3,eaywag1,call,37.1357,-7.6138,4.5,eaywag1/XC481360.ogg,151.944,PT,EUROPE
4,barswa,blank,19.3551,-99.0467,5.0,barswa/XC698512.ogg,18.703688,MX,AMERICAS


# Define functions

## Load audio using librosa

In [8]:
def extract_audio(filename, sr=16000, audio_dir='/content/drive/MyDrive/project/train_audio'):
  """Load one audio recording and return its samples.

  Args:
    filename: Path of the recording relative to `audio_dir`,
      e.g. 'comsan/XC587730.ogg'.
    sr: Sampling rate forwarded to `librosa.load`.
    audio_dir: Directory that holds the recordings. Defaults to the
      project's train_audio folder on the mounted Drive, so existing
      calls keep reading from the same location.

  Returns:
    The audio samples returned by `librosa.load`; the sampling rate that
    `librosa.load` returns alongside them is discarded.
  """
  # build the path with os.path.join instead of string concatenation so the
  # result does not depend on a trailing slash in audio_dir
  filepath = os.path.join(audio_dir, filename)
  audio, _ = librosa.load(filepath, sr=sr)
  return audio

## run extract_audio function on an example

In [9]:
example = train_df['filename'].iloc[0]

example

'comsan/XC587730.ogg'

In [10]:
example_audio = extract_audio(example)

example_audio

array([-2.3946195e-06, -6.3855114e-06, -1.4239242e-05, ...,
       -7.7737495e-05,  3.6705611e-04, -1.1233051e-04], dtype=float32)

# Build intuition for the save_audio_np function (step-by-step on one example file)

In [11]:
folder = example.split('/')[0]
folder

'comsan'

In [12]:
filename = example.split('/')[1]
filename

'XC587730.ogg'

In [13]:
new_filename = filename.replace('.ogg', '.npy')
new_filename

'XC587730.npy'

In [14]:
# exist_ok=True so that re-running this cell (or the whole notebook) does not
# raise FileExistsError when the folder is already on Drive
os.makedirs('/content/drive/MyDrive/project/train_npy', exist_ok=True)

In [15]:
save_dir = f'/content/drive/MyDrive/project/train_npy/{folder}/'
save_dir

'/content/drive/MyDrive/project/train_npy/comsan/'

In [16]:
# exist_ok=True keeps this cell re-runnable once the species folder exists
os.makedirs(save_dir, exist_ok=True)

In [17]:
save_filename = f'{save_dir}{new_filename}'
save_filename

'/content/drive/MyDrive/project/train_npy/comsan/XC587730.npy'

In [18]:
# np.save creates (or overwrites) the target file itself, so there is no need
# to open an empty placeholder file first
np.save(save_filename, example_audio)

In [19]:
np.load(save_filename)

array([-2.3946195e-06, -6.3855114e-06, -1.4239242e-05, ...,
       -7.7737495e-05,  3.6705611e-04, -1.1233051e-04], dtype=float32)

## save_audio_np: save a NumPy array (.npy file) for each audio file

In [26]:
def save_audio_np(filenames, is_train, base_dir='/content/drive/MyDrive/project'):
  """Decode each audio file and store its samples as a .npy file.

  For every entry in `filenames` the audio is loaded with `extract_audio`
  and written with `np.save` to
  `<base_dir>/train_npy/<folder>/<name>.npy` (or `test_npy` when
  `is_train` is falsy), where `<folder>/<name>` mirrors the input path.

  Args:
    filenames: Sized iterable of relative audio paths such as
      'comsan/XC587730.ogg'.
    is_train: Truthy to write under 'train_npy', falsy for 'test_npy'.
    base_dir: Project root that receives the output folders. Defaults to
      the project folder on the mounted Drive.

  Returns:
    List of the saved paths relative to the split folder
    (e.g. 'comsan/XC587730.npy'), in the same order as `filenames`.
  """
  # choose the output root for this split
  split_folder = 'train_npy' if is_train else 'test_npy'
  folder_dir = os.path.join(base_dir, split_folder)

  # exist_ok=True: the function must be callable more than once (e.g. a small
  # trial run followed by the full run) without raising FileExistsError
  os.makedirs(folder_dir, exist_ok=True)

  # list to store the relative .npy filepaths
  filepaths = []

  for filename in filenames:

    # extract audio
    audio = extract_audio(filename)

    # split into sub-folder and file name, then swap the extension for .npy
    folder, name = os.path.split(filename)
    stem, _ = os.path.splitext(name)
    new_filename = f'{stem}.npy'
    filepaths.append(f'{folder}/{new_filename}' if folder else new_filename)

    # make sure the sub-folder exists
    save_dir = os.path.join(folder_dir, folder)
    os.makedirs(save_dir, exist_ok=True)

    # np.save creates the file itself; no placeholder file is needed
    np.save(os.path.join(save_dir, new_filename), audio)

  # every input filename must have produced exactly one output path
  assert len(filenames) == len(filepaths)
  return filepaths

## Try using the save_audio_np function on 10 examples

In [21]:
example_filenames = train_df['filename'].head(10)
example_filenames

0     comsan/XC587730.ogg
1    eaywag1/XC675944.ogg
2     comsan/XC664012.ogg
3     barswa/XC268804.ogg
4     barswa/XC690498.ogg
5     barswa/XC317544.ogg
6     comsan/XC430800.ogg
7     barswa/XC458638.ogg
8     comsan/XC652588.ogg
9     barswa/XC337983.ogg
Name: filename, dtype: object

In [27]:
example_filepaths = save_audio_np(example_filenames, is_train=True)
example_filepaths

['comsan/XC587730.npy',
 'eaywag1/XC675944.npy',
 'comsan/XC664012.npy',
 'barswa/XC268804.npy',
 'barswa/XC690498.npy',
 'barswa/XC317544.npy',
 'comsan/XC430800.npy',
 'barswa/XC458638.npy',
 'comsan/XC652588.npy',
 'barswa/XC337983.npy']

# Extract and save np array objects from train_df

In [29]:
train_filenames = train_df['filename']

train_filenames

0        comsan/XC587730.ogg
1       eaywag1/XC675944.ogg
2        comsan/XC664012.ogg
3        barswa/XC268804.ogg
4        barswa/XC690498.ogg
                ...         
1039    eaywag1/XC569651.ogg
1040    eaywag1/XC317928.ogg
1041    eaywag1/XC656456.ogg
1042     comsan/XC579372.ogg
1043     comsan/XC644655.ogg
Name: filename, Length: 1044, dtype: object

In [30]:
train_filepaths = save_audio_np(train_filenames, is_train=True)

train_filepaths

['comsan/XC587730.npy',
 'eaywag1/XC675944.npy',
 'comsan/XC664012.npy',
 'barswa/XC268804.npy',
 'barswa/XC690498.npy',
 'barswa/XC317544.npy',
 'comsan/XC430800.npy',
 'barswa/XC458638.npy',
 'comsan/XC652588.npy',
 'barswa/XC337983.npy',
 'barswa/XC625824.npy',
 'barswa/XC452058.npy',
 'comsan/XC501617.npy',
 'comsan/XC588935.npy',
 'barswa/XC667239.npy',
 'barswa/XC135474.npy',
 'barswa/XC662037.npy',
 'barswa/XC472363.npy',
 'comsan/XC672721.npy',
 'barswa/XC422186.npy',
 'comsan/XC628409.npy',
 'eaywag1/XC687220.npy',
 'barswa/XC746604.npy',
 'comsan/XC582935.npy',
 'barswa/XC664997.npy',
 'comsan/XC678534.npy',
 'comsan/XC584477.npy',
 'barswa/XC644162.npy',
 'comsan/XC605377.npy',
 'comsan/XC676382.npy',
 'barswa/XC488933.npy',
 'eaywag1/XC744735.npy',
 'eaywag1/XC140811.npy',
 'comsan/XC524728.npy',
 'barswa/XC494922.npy',
 'barswa/XC248034.npy',
 'eaywag1/XC675540.npy',
 'barswa/XC672572.npy',
 'barswa/XC580258.npy',
 'comsan/XC651714.npy',
 'comsan/XC146703.npy',
 'barswa/XC

## Pick an example to confirm the np objects were saved successfully

In [31]:
example_before = extract_audio(train_df['filename'].iloc[350])

display(example_before)
display(example_before.shape)

array([-2.03143099e-06,  1.19953265e-05, -2.38223438e-05, ...,
       -3.73196090e-05,  5.52241108e-05,  1.08508859e-04], dtype=float32)

(130816,)

In [32]:
new_filename = train_df['filename'].iloc[350].replace('.ogg', '.npy')
new_filename

'comsan/XC667555.npy'

In [33]:
example_after = np.load(f'/content/drive/MyDrive/project/train_npy/{new_filename}')

display(example_after)
display(example_after.shape)

array([-2.03143099e-06,  1.19953265e-05, -2.38223438e-05, ...,
       -3.73196090e-05,  5.52241108e-05,  1.08508859e-04], dtype=float32)

(130816,)

In [34]:
np.array_equal(example_before, example_after)

True

## add new column in train_df for the new filepaths

In [35]:
# train_filepaths was built by iterating train_df['filename'] in order, so the
# list lines up row-for-row with train_df
train_df['filename_npy'] = train_filepaths

# preview the frame with the new column
train_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy
0,comsan,call,50.7542,4.5672,4.0,comsan/XC587730.ogg,5.746937,BE,EUROPE,comsan/XC587730.npy
1,eaywag1,call,69.3585,88.2378,3.0,eaywag1/XC675944.ogg,5.355,RU,EUROPE,eaywag1/XC675944.npy
2,comsan,call,41.1698,0.9761,5.0,comsan/XC664012.ogg,10.488,ES,EUROPE,comsan/XC664012.npy
3,barswa,call,,,2.5,barswa/XC268804.ogg,76.538813,UNKNOWN,UNKNOWN,barswa/XC268804.npy
4,barswa,song,53.9299,-2.9833,2.5,barswa/XC690498.ogg,83.0955,GB,EUROPE,barswa/XC690498.npy


# Extract and save np array objects from test_df

In [36]:
test_filenames = test_df['filename']

test_filenames

0       barswa/XC721711.ogg
1       comsan/XC496602.ogg
2      eaywag1/XC718445.ogg
3      eaywag1/XC481360.ogg
4       barswa/XC698512.ogg
               ...         
443     comsan/XC554764.ogg
444    eaywag1/XC603005.ogg
445     barswa/XC288343.ogg
446     barswa/XC669759.ogg
447     barswa/XC253722.ogg
Name: filename, Length: 448, dtype: object

In [37]:
test_filepaths = save_audio_np(test_filenames, is_train=False)

test_filepaths

['barswa/XC721711.npy',
 'comsan/XC496602.npy',
 'eaywag1/XC718445.npy',
 'eaywag1/XC481360.npy',
 'barswa/XC698512.npy',
 'comsan/XC492456.npy',
 'eaywag1/XC614965.npy',
 'eaywag1/XC642038.npy',
 'barswa/XC669737.npy',
 'barswa/XC402778.npy',
 'eaywag1/XC616918.npy',
 'eaywag1/XC645112.npy',
 'eaywag1/XC725173.npy',
 'eaywag1/XC477547.npy',
 'barswa/XC186847.npy',
 'comsan/XC740418.npy',
 'eaywag1/XC672267.npy',
 'comsan/XC331036.npy',
 'barswa/XC496382.npy',
 'comsan/XC575672.npy',
 'comsan/XC469853.npy',
 'eaywag1/XC414959.npy',
 'comsan/XC422059.npy',
 'barswa/XC603606.npy',
 'eaywag1/XC337615.npy',
 'comsan/XC597503.npy',
 'eaywag1/XC622856.npy',
 'barswa/XC574535.npy',
 'eaywag1/XC435527.npy',
 'eaywag1/XC668780.npy',
 'barswa/XC337180.npy',
 'barswa/XC630247.npy',
 'eaywag1/XC722533.npy',
 'eaywag1/XC544236.npy',
 'comsan/XC738822.npy',
 'eaywag1/XC557770.npy',
 'eaywag1/XC675675.npy',
 'barswa/XC667954.npy',
 'eaywag1/XC526964.npy',
 'barswa/XC416955.npy',
 'eaywag1/XC306269.np

## Pick an example to confirm the np objects were saved successfully

In [38]:
example_before = extract_audio(test_df['filename'].iloc[250])

display(example_before)
display(example_before.shape)

array([ 9.7917382e-07, -7.9563815e-06,  1.9430037e-05, ...,
       -2.5169038e-06,  7.9648316e-07, -3.3645688e-06], dtype=float32)

(759008,)

In [39]:
new_filename = test_df['filename'].iloc[250].replace('.ogg', '.npy')
new_filename

'comsan/XC558769.npy'

In [40]:
example_after = np.load(f'/content/drive/MyDrive/project/test_npy/{new_filename}')

display(example_after)
display(example_after.shape)

array([ 9.7917382e-07, -7.9563815e-06,  1.9430037e-05, ...,
       -2.5169038e-06,  7.9648316e-07, -3.3645688e-06], dtype=float32)

(759008,)

In [41]:
np.array_equal(example_before, example_after)

True

## add new column in test_df for the new filepaths

In [42]:
# test_filepaths was built by iterating test_df['filename'] in order, so the
# list lines up row-for-row with test_df
test_df['filename_npy'] = test_filepaths

# preview the frame with the new column
test_df.head()

Unnamed: 0,primary_label,type,latitude,longitude,rating,filename,duration,country,continent,filename_npy
0,barswa,blank,53.2509,5.598,4.0,barswa/XC721711.ogg,19.069375,NL,EUROPE,barswa/XC721711.npy
1,comsan,call,48.8306,2.1999,4.0,comsan/XC496602.ogg,28.995938,FR,EUROPE,comsan/XC496602.npy
2,eaywag1,blank,43.3298,4.8364,4.0,eaywag1/XC718445.ogg,7.340438,FR,EUROPE,eaywag1/XC718445.npy
3,eaywag1,call,37.1357,-7.6138,4.5,eaywag1/XC481360.ogg,151.944,PT,EUROPE,eaywag1/XC481360.npy
4,barswa,blank,19.3551,-99.0467,5.0,barswa/XC698512.ogg,18.703688,MX,AMERICAS,barswa/XC698512.npy


# Save train_df and test_df to CSV files

In [43]:
# write the frames (now including the filename_npy column) under new file names,
# leaving the clean_*_df.csv inputs untouched; index=False omits the DataFrame index
train_df.to_csv('/content/drive/MyDrive/project/clean_train_df_w_npy.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/project/clean_test_df_w_npy.csv', index=False)