### Load dependencies

In [None]:
import os
import shutil
import time
import pickle
import pandas as pd
import matplotlib.pyplot as pyplot
import librosa
import librosa.display
import gc
import numpy as np

def download_rec(uri, rec_dir):
    if not os.path.exists(rec_dir+'/'+uri.replace('/','_')):
        try:
            urllib.request.urlretrieve ("https://s3.amazonaws.com/arbimon2/"+uri, rec_dir+'/'+uri.replace('/','_'))
        except Exception as e:
            print(e)
            print('Error downloading '+uri)
        return
    

### Specify data paths:

1. train_set_dir - Folder where training data will be stored
2. recording_dir - Folder where audio recordings will be stored
3. sound_annotation_file - File storing template matching validation metadata
    

In [2]:
train_set_dir = '../data/train_tp/' # Folder where training data will be stored
recording_dir = '../recordings/' # Folder holding recordings

sound_annotation_files = ['./example_annotations.csv']
# File storing ROIs of detected sounds (animal calls) 
#     Required columns:
#          recording_id
#          species_id
#          t_min (start time of sound)
#          t_max (end time of sound)
#          uri (recording filename)

Total samples of combined training sets:	334900
After duplicate sample removal:			247482


### Run remaining cells to generate training data

In [None]:
if not os.path.exists(set_dir):
    os.mkdir(set_dir)
    for i in list(set(rois.species_id)):
        os.mkdir(set_dir+str(i)+'/')

In [3]:
if len(sound_annotation_files)==1:
    rois= pd.read_csv(sound_annotation_files)
elif len(sound_annotation_files)==0:
    print('Must provide an annotation file')
    break
elif len(sound_annotation_files)>1:
    rois = pd.read_csv(sound_annotation_files[0])
    for i in sound_annotation_files[1:]:
        tmp = pd.read_csv(sound_annotation_files[i])
        rois = pd.concat([rois,tmp])
rois.head()

Unnamed: 0,datetime,f_max,f_min,recording_id,score,songtype_id,species_id,species_name,t_max,t_min,tod,uri,validated
0,2019-04-03 14:24:00,10406.25,4781.25,5324133,0.3183961378,1,9741,Dendroica angelae,51.6373333333,49.2853333333,0.6,project_1060/site_1056/2019/4/5CA4C200.flac,0
1,2019-04-14 10:15:01,10406.25,4781.25,5387470,0.1040763322,1,9741,Dendroica angelae,3.9306666667,1.5786666667,0.427083,project_1060/site_1037/2019/4/5CB30825.flac,0
2,2019-04-01 18:29:52,10406.25,4781.25,5316518,0.1718479916,1,9741,Dendroica angelae,9.536,7.184,0.770139,project_1060/site_1002/2019/4/5CA258A0.flac,0
3,2019-04-04 08:09:08,10406.25,4781.25,5438328,0.1362196428,1,9741,Dendroica angelae,36.128,33.776,0.339583,project_1060/site_1070/2019/4/5CA5BBA4.flac,0
4,2019-04-07 11:14:37,10406.25,4781.25,5342902,0.2385318273,1,9741,Dendroica angelae,52.4053333333,50.0533333333,0.468056,project_1060/site_996/2019/4/5CA9DB9D.flac,0


In [4]:
print('Number of ROIs for each species\n')

for i in list(set(rois.species_id)):
    print(str(i)+'\t\t'+str(len(rois[rois.species_id==i])))

Number of ROIs for each species

12291		11918
5		8173
6		8942
7		2293
12297		14135
9741		15129
4111		32692
18		2372
12319		3345
34		11067
10153		18817
6338		13367
12356		5637
2887		4039
12360		13749
1999		3132
12240		11967
8534		10184
12386		2981
2789		8668
3817		8386
12395		14011
10487		10587
10494		11891


In [11]:
window_length = 2 # sample time-window length in seconds
k = 0
t0 = time.time()
rec_loaded = False
for i in list(set(rois.recording_id)): # loop over recordings
    
    k = k+1
    if k%200==0:
        print(k)
        
    tmp = rois[rois.recording_id==i]
    audio_filename = tmp.iloc[0].uri.replace('/','_')

    for c in range(len(tmp)): # loop over spectrogram ROIs
        
        try:

            sound_start, sound_end = [tmp.iloc[c].t_min, tmp.iloc[c].t_max]
            species_id = tmp.iloc[c].species_id

            shft = ((sound_end-sound_start)-window_length)/2
            start_sample = round(sampling_rate*(sound_start+shft))
            start_sample = max(start_sample, 0)
            filename = audio_filename.split('.')[0]+'_'+str(round(start_sample/sampling_rate,2))+'-'+str(round((start_sample/sampling_rate)+window_length,2))+'.png'
            
            if not os.path.exists(set_dir+str(species_id)+'/'+filename):
                if not rec_loaded:
                    if not os.path.exists(rec_dir+'/'+audio_filename):
                        download_rec(tmp.iloc[0].uri, rec_dir)
                    try:
                        audio_data, sampling_rate = librosa.load(rec_dir+audio_filename, sr=sampling_rate)
                        rec_loaded = True
                    except Exception as e:
                        print(e)
                        continue
                S = librosa.feature.melspectrogram(y = audio_data[start_sample: start_sample+round(sampling_rate*window_length)], 
                                               sr = sampling_rate,
                                               n_fft=2048, 
                                               hop_length=512, 
                                               win_length=1024)
                dpi=100
                fig = pyplot.figure(num=None, figsize=(300/dpi, 300/dpi), dpi=dpi)
                pyplot.subplot(222)
                ax = pyplot.axes()
                ax.set_axis_off()
                librosa.display.specshow(librosa.power_to_db(S, ref=np.max))
                pyplot.savefig(set_dir+str(species_id)+'/'+filename, bbox_inches='tight', transparent=True, pad_inches=0.0)
                pyplot.close()
                
        except Exception as e:
            print(e)
            continue
        
    rec_loaded = False    

200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200




[Errno 2] No such file or directory: '../recordings/project_1060_site_925_2019_3_5C863ADE.flac'
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
10000
10200
10400
10600
10800
[Errno 2] No such file or directory: '../recordings/project_1060_site_914_2019_3_5C8FC788.flac'
[Errno 2] No such file or directory: '../recordings/project_1060_site_914_2019_3_5C8FC788.flac'
[Errno 2] No such file or directory: '../recordings/project_1060_site_914_2019_3_5C8FC788.flac'
11000
11200
11400
11600
11800
12000
12200
12400
12600
12800
13000
13200
13400
13600
13800
14000
14200
14400
14600
14800
15000
15200
15400
15600
15800
16000
16200
16400
16600
16800
17000
17200
17400
17600
17800
18000
18200
18400
18600
18800
19000
19200
19400
19600
19800
20000
20200
20400
20600
20800
21000
21200
21400
21600
21800
22000
22200
22400
22600
22800
23000
23200
23400
23600
23800
24000
24200
24400
24600
24800
25000
25200
25400
25600
25800
26000
26200
26400
266