# Download the Dataset

In [1]:
# Download the zip archive with all samples (about 249 MB according to the wget log)
!wget https://philharmonia-assets.s3-eu-west-1.amazonaws.com/uploads/2020/02/12112005/all-samples.zip

--2020-06-28 07:26:45--  https://philharmonia-assets.s3-eu-west-1.amazonaws.com/uploads/2020/02/12112005/all-samples.zip
Resolving philharmonia-assets.s3-eu-west-1.amazonaws.com (philharmonia-assets.s3-eu-west-1.amazonaws.com)... 52.218.101.216
Connecting to philharmonia-assets.s3-eu-west-1.amazonaws.com (philharmonia-assets.s3-eu-west-1.amazonaws.com)|52.218.101.216|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 260849422 (249M) [application/zip]
Saving to: ‘all-samples.zip’


2020-06-28 07:27:10 (10.5 MB/s) - ‘all-samples.zip’ saved [260849422/260849422]



# Extract zip files

In [2]:
from zipfile import ZipFile
# Unpack the downloaded archive into the current working directory;
# the context manager closes the archive once extraction is done.
with ZipFile('all-samples.zip', 'r') as archive:
    archive.extractall()

In [3]:
import os
# Folder that is walked below to find the nested .zip archives
dir_path='./all-samples/'

In [4]:
# Unpack every nested .zip found under dir_path into a folder that sits next
# to the archive and carries the same name without the extension.
for path, dir_list, file_list in os.walk(dir_path):
    for file_name in file_list:
        if not file_name.endswith(".zip"):
            continue
        abs_file_path = os.path.join(path, file_name)
        output_path = os.path.splitext(abs_file_path)[0]
        # The context manager closes the archive even when extraction raises,
        # so a corrupt zip no longer leaks an open file handle.
        with ZipFile(abs_file_path, 'r') as zip_obj:
            zip_obj.extractall(output_path)
  

# Start Trials

In [5]:
# Root folder that is searched for .mp3 files (same value as dir_path above)
path_dataset='./all-samples/'
import fnmatch
import librosa
import sys
import numpy as np
import pickle

In [6]:
instruments = ['banjo','bass-clarinet','bassoon', 'cello','clarinet','contrabassoon',
               'english-horn','double-bass','flute','french-horn','guitar','mandolin',
               'oboe','percussion','saxophone','trombone','trumpet','tuba','viola','violin']
# Parallel lists: entry i of each one describes the i-th successfully processed file
classes = []
durations = []
files_mp3 = []
files_pkl = []
files_npy = []
# One dictionary per file that could not be processed
files_error = []


def _instrument_class(root, filename):
    """Return the instrument label for one audio file.

    Files located under a path containing 'percussion' are labelled
    'percussion'; otherwise the label is the entry of `instruments` that the
    file name starts with.

    Raises:
        ValueError: if the file name starts with none of the known instruments.
    """
    if 'percussion' in root:
        return 'percussion'
    for instrument in instruments:
        if fnmatch.fnmatchcase(filename, instrument+'*'):
            return instrument
    raise ValueError("Cannot determine the instrument of %s" % filename)


counter=1
# Walk through all files in path_dataset
for root, dirnames, filenames in os.walk(path_dataset):
    # Get all mp3 files
    for filename in fnmatch.filter(filenames, '*.mp3'):
        print ("Get %d = %s"%(counter, filename))
        # Get path and filename without extension
        name_no_ext = os.path.join(root,os.path.splitext(filename)[0])
        try:
            # Resolve the label first: a file without a recognisable instrument
            # is reported as an error instead of silently inheriting the label
            # of the previously processed file.
            class_inst = _instrument_class(root, filename)
            # Load mp3 at its native sampling rate
            y, sr = librosa.load(os.path.join(root, filename), sr=None)
            # Save audio samples as numpy array (np.save adds the .npy extension)
            np.save(name_no_ext,y)
            # Get duration of audio file
            duration = librosa.get_duration(y=y, sr=sr)
            # Create a dictionary with data from file
            data_dictionary = {'filename': (name_no_ext+'.npy'), 'sampling rate' : sr, 'duration' : duration, 'class' : class_inst}
            # Save dictionary to a file .pkl
            with open(name_no_ext+'.pkl', 'wb') as f:
                pickle.dump(data_dictionary, f)
        except Exception as e:
            # Best effort: remember the failure and carry on with the next file.
            # 'error' holds the exception class (as before); 'message' keeps the text.
            files_error.append({'file':os.path.join(root, filename),'error':type(e),'message':str(e)})
            print("Error loading %s. Error: %s" % (filename,e))
        else:
            # Only record the file once every step succeeded, so that all the
            # bookkeeping lists always have the same length and ordering.
            files_mp3.append(name_no_ext+'.mp3')
            files_npy.append(name_no_ext+'.npy')
            durations.append(duration)
            classes.append(class_inst)
            files_pkl.append(name_no_ext+'.pkl')
        counter+=1

# Persist every bookkeeping list to its own pickle file in the working directory.
# Each pair is (output file name, list to store); the order matches the original cells.
lists_to_save = (
    ('classes.pkl', classes),
    ('durations.pkl', durations),
    ('files_errors.pkl', files_error),
    ('files_mp3.pkl', files_mp3),
    ('files_pkl.pkl', files_pkl),
    ('files_npy.pkl', files_npy),
)
for pkl_name, values in lists_to_save:
    with open(pkl_name, 'wb') as f:
        pickle.dump(values, f)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Get 8686 = violin_G5_025_pianissimo_arco-normal.mp3
Get 8687 = violin_Gs5_1_mezzo-forte_arco-normal.mp3
Get 8688 = violin_E7_1_piano_arco-normal.mp3
Get 8689 = violin_As4_phrase_mezzo-forte_arco-legato.mp3
Get 8690 = violin_D5_15_forte_arco-normal.mp3
Get 8691 = violin_Cs5_05_fortissimo_arco-au-talon.mp3
Get 8692 = violin_Cs4_1_piano_arco-col-legno-tratto.mp3
Get 8693 = violin_Fs4_1_piano_arco-sul-tasto.mp3
Get 8694 = violin_A6_025_fortissimo_arco-normal.mp3
Get 8695 = violin_Cs4_long_piano_arco-sul-ponticello.mp3
Get 8696 = violin_Cs6_05_mezzo-forte_arco-normal.mp3
Get 8697 = violin_G3_phrase_mezzo-forte_arco-detache.mp3
Get 8698 = violin_C6_025_mezzo-piano_pizz-normal.mp3
Get 8699 = violin_G6_1_fortissimo_arco-normal.mp3
Get 8700 = violin_Gs5_phrase_piano_arco-punta-d'arco.mp3
Get 8701 = violin_E7_05_mezzo-forte_arco-normal.mp3
Get 8702 = violin_Ds6_15_pianissimo_arco-normal.mp3
Get 8703 = violin_G3_phrase_forte_arco-st

In [25]:
import pandas as pd
# Wrap the instrument labels in a Series so pandas can tally them
ds_Classes = pd.Series(classes)
# Tally once and reuse: first the per-instrument counts, then their total
class_counts = ds_Classes.value_counts()
print(class_counts)
print(class_counts.sum())

violin           1502
viola             973
tuba              972
bass-clarinet     944
cello             889
flute             878
double-bass       852
clarinet          846
trombone          831
saxophone         732
bassoon           720
contrabassoon     710
english-horn      691
french-horn       652
oboe              596
trumpet           485
percussion        148
guitar            106
mandolin           80
banjo              74
dtype: int64
13681


The cells below are just to show you a way to learn and understand how to get information about the errors.

In [24]:
# Print the length of every bookkeeping list, one per line, in this order:
# classes, durations, files_error, files_npy, files_pkl, files_mp3
for tracked_list in (classes, durations, files_error, files_npy, files_pkl, files_mp3):
    print(len(tracked_list))

13681
13681
2
13681
13681
13681


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
df_durations = pd.Series(durations)
# Bin width in seconds
w = 1
# Explicit bin edges on whole seconds, from 0 up to the first multiple of w at
# or above the longest duration. Passing a bin *count* would instead spread the
# bins between min and max, so the bars would not line up with the 1-second ticks.
edges = np.arange(0, math.ceil(df_durations.max()/w)*w + w, w)
# Number of bins actually drawn
n = len(edges) - 1
plt.figure(figsize=(10,8))
df_durations.hist(grid=True, bins=edges, rwidth=0.8,
                   color='#607c8e', histtype='bar', ec='black')
# Ticks follow the bin edges instead of a hard-coded 0..80 range
plt.xticks(edges, rotation=90)
plt.xlabel('Duration (s)')
plt.ylabel('Number of audio files')
plt.title('Distribution of audio file durations');

The maximum duration is 77 seconds and the minimum is 0.078 seconds. `.value_counts()` only counts values that are exactly the same; we used `.value_counts()` on the list of strings to count how many times each string is present in the list. For the histogram it doesn't make sense to use `.value_counts()`.

Now we know that most of the durations are lower than 22 seconds, so if we want we can zoom in and plot just the bins up to 20 seconds, but we must keep in mind that there are audio files with a duration longer than 20 seconds. Can you find out how many audio files have a duration longer than 20 seconds?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
df_durations = pd.Series(durations)
# Bin width and upper limit of the zoomed view, both in seconds
w = 1
zoom_limit = 20
# np.arange excludes its stop value, so go one step past the limit; otherwise
# the last edge would be 19 and recordings between 19 and 20 s would be dropped.
edges = np.arange(0, zoom_limit + w, w)
plt.figure(figsize=(10,8))
# Durations above zoom_limit fall outside the bin edges and are not drawn
df_durations.hist(grid=True, bins=edges, rwidth=0.8,
                   color='#607c8e', histtype='bar', ec='black')
plt.xticks(edges, rotation=90)
plt.xlabel('Duration (s)')
plt.ylabel('Number of audio files')
plt.title('Distribution of audio file durations (up to 20 s)');

In [None]:
# We see that more than half of the audio files have a duration of 1 second or less.

In [None]:
# True where the duration is 1 second or less, False where it is longer
at_most_one_second = df_durations.le(1)
# We see that 7339 audio files have a duration of 1 second or less, and 6342 have a duration longer than 1 second
at_most_one_second.value_counts()

In [None]:
# Boolean Series: True where the duration is 20 seconds or more
df_durations.ge(20)

In [None]:
# Here we see that only 60 audio files have a duration of 20 seconds or more.
df_durations.ge(20).value_counts()

In [None]:
# There are many ways to do the same kind of thing in Python; this one uses the
# boolean mask to keep only the durations of 20 seconds or more.
df_durations.loc[df_durations.ge(20)]

In [None]:
import numpy as np
# Base name (without extension) of the demo file and the small array stored in it
my_filename='my_filename'
my_array=np.arange(5)

# Saving an array: when given a path, np.save opens and closes the file itself
np.save(my_filename+'.npy', my_array)

In [None]:
# Loading the saved array: np.load accepts the path directly and returns the array
my_loaded_array = np.load('my_filename.npy')

In [None]:
# Show the array that was read back from my_filename.npy
print(my_loaded_array)

In [None]:
# The file my_filename.npy was saved in the working directory
!ls -l 

In [None]:
# We can see that the file my_filename.npy was saved and has 160 bytes

In [None]:
# List the contents of the bassoon folder
!ls  ./all-samples/all-samples/bassoon/
