# Download the Dataset

In [1]:
!wget https://philharmonia-assets.s3-eu-west-1.amazonaws.com/uploads/2020/02/12112005/all-samples.zip

--2020-06-26 19:43:41--  https://philharmonia-assets.s3-eu-west-1.amazonaws.com/uploads/2020/02/12112005/all-samples.zip
Resolving philharmonia-assets.s3-eu-west-1.amazonaws.com (philharmonia-assets.s3-eu-west-1.amazonaws.com)... 52.218.88.56
Connecting to philharmonia-assets.s3-eu-west-1.amazonaws.com (philharmonia-assets.s3-eu-west-1.amazonaws.com)|52.218.88.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 260849422 (249M) [application/zip]
Saving to: ‘all-samples.zip’


2020-06-26 19:43:54 (20.2 MB/s) - ‘all-samples.zip’ saved [260849422/260849422]



# Extract zip files

In [2]:
from zipfile import ZipFile
# Unpack the downloaded archive. The with-block closes the archive once extraction is done.
with ZipFile('all-samples.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall()

In [3]:
import os
# Folder that the next cell walks to look for nested .zip archives.
dir_path='./all-samples'

In [4]:
# Unpack every nested .zip found under dir_path into a folder named after the archive.
for path, dir_list, file_list in os.walk(dir_path):
    for file_name in file_list:
        if not file_name.endswith(".zip"):
            continue
        abs_file_path = os.path.join(path, file_name)

        parent_path = os.path.split(abs_file_path)[0]
        output_folder_name = os.path.splitext(abs_file_path)[0]
        # NOTE: output_folder_name still contains the parent directory, so with the
        # relative dir_path used here the join repeats it (./all-samples/./all-samples/<name>).
        # This is kept on purpose: later cells read the samples from ./all-samples/all-samples/.
        output_path = os.path.join(parent_path, output_folder_name)

        # The context manager closes the archive even if extraction raises.
        with ZipFile(abs_file_path, 'r') as zip_obj:
            zip_obj.extractall(output_path)

# Start Trials

In [5]:
# Folder that the loading loop walks to find the .mp3 samples.
path_dataset='./all-samples/all-samples/'
import fnmatch
import librosa
import sys
import numpy as np

In [6]:
# Instrument names; every non-percussion sample's file name is expected to start with one of them.
instruments = ['banjo','bass-clarinet','bassoon', 'cello','clarinet','contrabassoon',
               'english-horn','double-bass','flute','french-horn','guitar','mandolin',
               'oboe','percussion','saxophone','trombone','trumpet','tuba','viola','violin']
classes = []      # class label of every successfully processed file
durations = []    # duration of every successfully processed file; durations[i] pairs with classes[i]
files_error = []  # one {'file': path, 'error': sys.exc_info()} record per file that failed

counter = 1  # 1-based position among all visited .mp3 files, failed ones included
for root, dirnames, filenames in os.walk(path_dataset):
    for filename in fnmatch.filter(filenames, '*.mp3'):
        print ("Get %d = %s"%(counter, filename))
        name_no_ext = os.path.join(root, os.path.splitext(filename)[0])
        try:
            # sr=None asks librosa to keep the file's own sampling rate.
            y, sr = librosa.load(os.path.join(root, filename), sr=None)
            # np.save adds the '.npy' extension, so the array lands next to the .mp3.
            np.save(name_no_ext, y)
            duration = librosa.get_duration(y=y, sr=sr)

            # Work out the label BEFORE touching the result lists, so that a failure
            # here cannot leave durations and classes with different lengths.
            if 'percussion' in root:
                label = 'percussion'
            else:
                label = next((instrument for instrument in instruments
                              if fnmatch.fnmatchcase(filename, instrument+'*')), None)
                if label is None:
                    raise ValueError("no known instrument prefix in %s" % filename)

            durations.append(duration)
            classes.append(label)
            # Use the label directly: indexing classes[counter-1] goes out of step as soon
            # as one earlier file has failed (counter advances, classes does not).
            data_dictionary = {'filename': (name_no_ext+'.npy'), 'sampling rate' : sr, 'duration' : duration, 'class' : label}
            # Here now we can save data_dictionary to a file using pickle
        except Exception as e:
            # Keep the full exc_info so the traceback can be inspected in a later cell.
            files_error.append({'file':os.path.join(root, filename),'error':sys.exc_info()})
            print("Error loading %s. Error: %s" % (filename,e))
        counter += 1


Get 1 = bassoon_C4_phrase_forte_fluttertonguing.mp3
Get 2 = bassoon_B4_025_fortissimo_normal.mp3
Get 3 = bassoon_Cs2_025_mezzo-piano_normal.mp3
Get 4 = bassoon_G3_phrase_mezzo-forte_staccato.mp3
Get 5 = bassoon_Fs4_05_piano_normal.mp3
Get 6 = bassoon_F3_phrase_mezzo-forte_tongued-slur.mp3
Get 7 = bassoon_G2_15_piano_normal.mp3
Get 8 = bassoon_Fs2_1_fortissimo_normal.mp3
Get 9 = bassoon_C4_05_forte_normal.mp3
Get 10 = bassoon_B1_025_fortissimo_normal.mp3
Get 11 = bassoon_D3_15_forte_normal.mp3
Get 12 = bassoon_As4_long_forte_minor-trill.mp3
Get 13 = bassoon_E2_very-long_cresc-decresc_normal.mp3
Get 14 = bassoon_G4_05_piano_normal.mp3
Get 15 = bassoon_B2_025_forte_normal.mp3
Get 16 = bassoon_E4_05_fortissimo_normal.mp3
Get 17 = bassoon_B1_1_piano_normal.mp3
Get 18 = bassoon_Ds2_025_forte_normal.mp3
Get 19 = bassoon_Fs4_1_mezzo-piano_normal.mp3
Get 20 = bassoon_B1_025_piano_normal.mp3
Get 21 = bassoon_As2_phrase_mezzo-forte_legato.mp3
Get 22 = bassoon_E4_025_mezzo-piano_normal.mp3
Get 23 

KeyboardInterrupt: ignored

In [None]:
import pandas as pd
# Wrap the list of labels in a Series so value_counts() can tally how often each class occurs.
ds_Classes = pd.Series(classes)
ds_Classes.value_counts()

The cells below just show you one way to learn about and understand the errors that were collected.

In [None]:
# Each entry is a dict holding the failing file's path ('file') and its sys.exc_info() tuple ('error').
print(files_error)

In [None]:
import traceback
# files_error stays empty when every file loaded, and indexing [0] would then raise
# IndexError, so only print a traceback when there is one to show.
if files_error:
    # 'error' holds the (type, value, traceback) tuple captured with sys.exc_info().
    exc_type, exc_value, exc_tb = files_error[0]['error']
    traceback.print_exception(exc_type, exc_value, exc_tb)
else:
    print("No loading errors were recorded.")

In [None]:
# Maximum duration
print(max(durations))
# Minimum duration
print(min(durations))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
# Histogram of all durations, with the number of bins chosen so each is about w wide.
df_durations = pd.Series(durations)
w = 1
n = math.ceil((df_durations.max() - df_durations.min())/w)
# Explicit figure/axes pair instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(10,8))
df_durations.hist(ax=ax, grid=True, bins=n, rwidth=0.8,
                  color='#607c8e', histtype='bar', ec='black')
# One tick per unit from 0 to 79, with the labels turned vertical.
ax.set_xticks(np.arange(0,80,1))
ax.tick_params(axis='x', labelrotation=90)

The maximum duration is 77 seconds and the minimum is 0.078 seconds. `.value_counts()` only counts values that are exactly the same; we used `.value_counts()` on the list of strings to count how many times each string is present in the list. For the histogram it doesn't make sense to use `.value_counts()`.

Now we know that most of the durations are lower than 22 seconds, so if we want we can zoom in and plot just the bins up to 20 seconds, but we must keep in mind that there are audio files with a duration longer than 20 seconds. Can you find out how many audio files have a duration higher than 20 seconds?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
# Zoomed histogram: bins of width 1 covering durations from 0 up to zoom_limit.
df_durations = pd.Series(durations)
zoom_limit = 20  # upper edge of the zoomed view
# np.arange leaves out its stop value, so stop at zoom_limit + 1; with a stop of 20 the
# last edge would be 19 and every duration between 19 and 20 would be missing from the plot.
bin_edges = np.arange(0, zoom_limit + 1, 1)
plt.figure(figsize=(10,8))
df_durations.hist(grid=True, bins=bin_edges, rwidth=0.8,
                   color='#607c8e', histtype='bar', ec='black')
plt.xticks(bin_edges, rotation=90);

In [None]:
# We see that the great majority of audio files have a duration of less than 1 second.

In [None]:
# .le(1) flags durations that are <= 1; value_counts() then tallies the True/False flags.
df_durations.le(1).value_counts()
# We see that 7339 audio files have a duration of at most 1 second, and 6342 have a duration longer than 1 second

In [None]:
# Boolean Series: True where the duration is 20 or more.
(df_durations >= 20)

In [None]:
(df_durations >= 20).value_counts()
# here we see that only 60 audio files have a duration of 20 seconds or more.

In [None]:
df_durations[df_durations >= 20]
# There are many ways to do the same kind of thing in Python; here the boolean mask keeps only the entries with a duration of 20 seconds or more.

In [None]:
import numpy as np
# Small example array and the base name (without extension) of the file it is stored in.
my_array=np.array([0,1,2,3,4])
my_filename='my_filename'

# Saving an array:
# The file is opened in binary write mode and np.save writes the array into it.
with open(my_filename+'.npy', 'wb') as f:
  np.save(f, my_array)

In [None]:
# Loading the saved array
with open('my_filename.npy', 'rb') as f:
  my_loaded_array = np.load(f)

In [None]:
# Show the array that was read back from my_filename.npy.
print(my_loaded_array)

In [None]:
# the file my_filename.npy was saved in the working directory
!ls -l 

In [None]:
# We can see that the file my_filename.npy was saved and has 160 bytes

In [8]:
# List the bassoon folder: the .npy arrays written by the loading loop sit next to the original .mp3 files.
!ls  ./all-samples/all-samples/bassoon/


bassoon_A2_025_forte_normal.mp3
bassoon_A2_025_fortissimo_normal.mp3
bassoon_A2_025_mezzo-piano_normal.mp3
bassoon_A2_025_piano_normal.mp3
bassoon_A2_05_forte_normal.mp3
bassoon_A2_05_forte_normal.npy
bassoon_A2_05_fortissimo_normal.mp3
bassoon_A2_05_mezzo-forte_normal.mp3
bassoon_A2_05_mezzo-forte_normal.npy
bassoon_A2_05_mezzo-piano_normal.mp3
bassoon_A2_05_piano_normal.mp3
bassoon_A2_15_piano_normal.mp3
bassoon_A2_15_piano_normal.npy
bassoon_A2_1_forte_normal.mp3
bassoon_A2_1_mezzo-forte_normal.mp3
bassoon_A2_1_mezzo-piano_normal.mp3
bassoon_A2_1_piano_normal.mp3
bassoon_A3_025_forte_normal.mp3
bassoon_A3_025_fortissimo_normal.mp3
bassoon_A3_025_fortissimo_normal.npy
bassoon_A3_025_mezzo-piano_normal.mp3
bassoon_A3_025_piano_normal.mp3
bassoon_A3_025_piano_normal.npy
bassoon_A3_05_forte_normal.mp3
bassoon_A3_05_fortissimo_normal.mp3
bassoon_A3_05_mezzo-piano_normal.mp3
bassoon_A3_05_piano_normal.mp3
bassoon_A3_15_forte_normal.mp3
bassoon_A3_15_mezzo-piano_normal.mp3
bassoon_A3_15_pi