### Import packages

In [1]:
import os
import glob
import numpy  as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import ipywidgets as widgets
import soundfile as sf
import yaml
import math
import datetime
import time

### Choose file to inspect

In [2]:
directory='/Users/saroltagabulya/git/Orca/'
wavs=glob.glob('timestamp_csv/*.csv')
file=[]

dropdown=widgets.Dropdown(
    options=wavs,
    description='CSV files:',
    disabled=False,
)

def dropdown_eventhandler(change):
    file.clear()
    file.append(change.new)
    print(change.new)
dropdown.observe(dropdown_eventhandler, names='value')
display(dropdown)

Dropdown(description='CSV files:', options=('timestamp_csv/time_stamps_2020-12-15--23-21-29--00-40-00--A.csv',…

timestamp_csv/time_stamps_2017-02-04--10-14-06--00-16-35--BC_2.csv


### Read csv file as pandas

In [3]:
filename = directory + file[0]
time_stamps=pd.read_csv(filename, index_col=0)

# Also extract recording name for later use
recording_name=file[0][26:-4]

In [5]:
recording_name

'2017-02-04--10-14-06--00-16-35--BC_2'

### Read metadata

In [6]:
# Set sample rate
try:
    wave_file=wave.open(file[0], "rb")
    sample_rate = wave_file.getframerate()
except:
    sample_rate=int(input('Please check sampling rate manually in the metadata file and set below in Hz! \n'))
    
# Set reference value
ref_value=int(input('Please check reference value in the files README and set below in uPa! \n'))

# Calibration value
cal_value=int(input('Please check calibration value in the files README and set below! \n what does the fullscale voltage correspond to in Pa? \n '))


# Set researcher
researcher= input('Please indicate the researcher who performs the crop by initials [SG, JR] \n ')

Please check sampling rate manually in the metadata file and set below in Hz! 
100000
Please check reference value in the files README and set below in uPa! 
1
Please check calibration value in the files README and set below! 
 what does the fullscale voltage correspond to in Pa? 
 1
Please indicate the researcher who performs the crop by initials [SG, JR] 
 SG


### Read in wav

In [7]:
recording_name=recording_name[:-2]

y, sr=librosa.load('wav_files/' +recording_name + '.wav', sr=sample_rate)

### Start time in seconds since epoch

In [8]:
start_datetime_str=recording_name[:20]
start_datetime_abs=datetime.datetime.strptime(start_datetime_str, '%Y-%m-%d--%H-%M-%S').timestamp()

### Onsets & offsets in csv are in seconds => Convert to samples by multiplying them with the fs

second * fs = samples


In [9]:
time_stamps.onset=time_stamps.onset.apply(lambda x: math.floor(x)) # Round down
time_stamps.offset=time_stamps.offset.apply(lambda x: math.ceil(x)) # Round up 

time_stamps['onset_samples']=time_stamps.onset.apply(lambda x: x*sr) 
time_stamps['offset_samples']=time_stamps.offset.apply(lambda x: x*sr) 

### Function for cropping wav and saving

In [10]:
try:
    os.mkdir('./call_segments/' + recording_name)
    
except Exception as e:
    print(e)

[Errno 17] File exists: './call_segments/2017-02-04--10-14-06--00-16-35--BC'


In [11]:
def crop_call(row, y, sr, recording_name):
    call_filename= recording_name + '_' + str(row.onset_samples) + '_' + str(row.offset_samples) + '.wav'
    call=y[row.onset_samples: row.offset_samples]  # Interval closed at the beginning and open at the end
    call_path='./call_segments/' + recording_name +'/' +  call_filename
    sf.write(call_path, call, sr) 
    return call_path

In [12]:
time_stamps['call_wav_file']=time_stamps.apply(lambda x: crop_call(x, y, sr, recording_name), axis=1)

### Prepare a metadata dataframe

In [13]:
def prep_metadata(time_stamps, sample_rate, ref_value, cal_value, researcher):
    meta_df=pd.DataFrame()
    
    meta_df['segment_file']=time_stamps.apply(lambda x: x.call_wav_file.split('/')[-1], axis=1)
    meta_df['wav_origin']=time_stamps.apply(lambda x: x.call_wav_file.split('/')[-1].split('_')[0] + '.wav', axis=1)
    
    meta_df['sample_rate']=pd.Series([sample_rate]*len(time_stamps))
    meta_df['reference_value']=pd.Series([ref_value]*len(time_stamps))
    meta_df['calibration_value']=pd.Series([cal_value]*len(time_stamps))
    meta_df['researcher']=pd.Series(['SG']*len(time_stamps))
    
    meta_df['onset_samples']=time_stamps.onset_samples
    meta_df['offset_samples']=time_stamps.offset_samples
    meta_df['number_of_samples']=time_stamps.offset_samples-time_stamps.onset_samples
    
    meta_df['onset_seconds']=time_stamps.onset
    meta_df['offset_seconds']=time_stamps.offset
    meta_df['duration_seconds']=meta_df.number_of_samples/meta_df.sample_rate
    
    meta_df['onset_absolute_time']=meta_df.onset_seconds + start_datetime_abs
    
    return meta_df

In [14]:
researcher='SG'
meta_df=prep_metadata(time_stamps, sample_rate, ref_value, cal_value, researcher)

In [15]:
meta_df

Unnamed: 0,segment_file,wav_origin,sample_rate,reference_value,calibration_value,researcher,onset_samples,offset_samples,number_of_samples,onset_seconds,offset_seconds,duration_seconds,onset_absolute_time
0,2017-02-04--10-14-06--00-16-35--BC_84200000_84...,2017-02-04--10-14-06--00-16-35--BC.wav,100000,1,1,SG,84200000,84500001,300001,842,845,3.00001,1486200000.0
1,2017-02-04--10-14-06--00-16-35--BC_84900000_85...,2017-02-04--10-14-06--00-16-35--BC.wav,100000,1,1,SG,84900000,85200001,300001,849,852,3.00001,1486200000.0
2,2017-02-04--10-14-06--00-16-35--BC_85600000_85...,2017-02-04--10-14-06--00-16-35--BC.wav,100000,1,1,SG,85600000,85900001,300001,856,859,3.00001,1486200000.0
3,2017-02-04--10-14-06--00-16-35--BC_87000000_87...,2017-02-04--10-14-06--00-16-35--BC.wav,100000,1,1,SG,87000000,87300001,300001,870,873,3.00001,1486200000.0
4,2017-02-04--10-14-06--00-16-35--BC_87900000_88...,2017-02-04--10-14-06--00-16-35--BC.wav,100000,1,1,SG,87900000,88200001,300001,879,882,3.00001,1486200000.0
5,2017-02-04--10-14-06--00-16-35--BC_92600000_93...,2017-02-04--10-14-06--00-16-35--BC.wav,100000,1,1,SG,92600000,93000001,400001,926,930,4.00001,1486200000.0
6,2017-02-04--10-14-06--00-16-35--BC_95200000_95...,2017-02-04--10-14-06--00-16-35--BC.wav,100000,1,1,SG,95200000,95600001,400001,952,956,4.00001,1486200000.0
7,2017-02-04--10-14-06--00-16-35--BC_97100000_97...,2017-02-04--10-14-06--00-16-35--BC.wav,100000,1,1,SG,97100000,97500001,400001,971,975,4.00001,1486200000.0


### Create metadata files

In [16]:
def write_metadata(row, output_dir):
    
    #Prepare filename
    metafilename=row.wav_origin[:-4] + '/'+ row.segment_file[:-3]  + 'yml'
    
    #Convert row to dictionary
    d = {k: (v if type(v).__module__ != 'numpy' else v.item()) for k,v in row.to_dict().items()}
    
    
    with open(output_dir + metafilename, 'w') as outfile:
        yaml.dump(d, outfile)

In [17]:
output_dir='./call_segments/' 
meta_df.apply(lambda x: write_metadata(x, output_dir), axis =1)

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
dtype: object

### Save csv under new name

In [18]:
time_stamps.to_csv('call_data/call_data_' + recording_name + '.csv')

### Read in metadata

In [None]:
metafile=''

with open(metafile) as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    metadata = yaml.load(file, Loader=yaml.FullLoader)
    
print(metadata)