### Import packages

In [1]:
import os
import glob
import numpy  as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import ipywidgets as widgets
import soundfile as sf
import yaml
import math
import datetime
import time

### Choose file to inspect

In [2]:
directory='/Users/saroltagabulya/git/Orca/'
wavs=glob.glob('timestamp_csv/*.csv')
file=[]

dropdown=widgets.Dropdown(
    options=wavs,
    description='CSV files:',
    disabled=False,
)

def dropdown_eventhandler(change):
    file.clear()
    file.append(change.new)
    print(change.new)
dropdown.observe(dropdown_eventhandler, names='value')
display(dropdown)

Dropdown(description='CSV files:', options=('timestamp_csv/time_stamps_2020-12-15--23-21-29--00-40-00--A.csv',…

timestamp_csv/time_stamps_2017-02-04--10-25-15--00-05-25--C.csv


### Read csv file as pandas

In [3]:
filename = directory + file[0]
time_stamps=pd.read_csv(filename, index_col=0)

# Also extract recording name for later use
recording_name=file[0][26:-4]

In [4]:
recording_name= input('Is this the recoridng you want to use? If not enter new recoridng name below\n{} \n'.format(recording_name))

Is this the recoridng you want to use? If not enter new recoridng name below
2017-02-04--10-25-15--00-05-25--C 
2017-02-04--10-25-15--00-05-25--C


### Read metadata

In [5]:
# Set sample rate
try:
    wave_file=wave.open(file[0], "rb")
    sample_rate = wave_file.getframerate()
except:
    sample_rate=int(input('Please check sampling rate manually in the metadata file and set below in Hz! \n'))
    
# Set raw unit of signal
raw_signal_unit=input('Please check the raw unit of the signal and enter below \n ')
    
# Set reference value
ref_value=int(input('Please check reference value in the metadata files and set below!  \n'))

# Calibration value
cal_value=int(input('If available, please check calibration value of hydrophons in the metadata files and set below! \n what does the fullscale voltage correspond to in Pa? \n '))


# Set researcher
researcher= input('Please indicate the researcher who performs the crop by initials [SG, JR] \n ')

Please check sampling rate manually in the metadata file and set below in Hz! 
100000
Please check the raw unit of the signal and enter below 
 V
Please check reference value in the metadata files and set below!  
1
If available, please check calibration value of hydrophons in the metadata files and set below! 
 what does the fullscale voltage correspond to in Pa? 
 1
Please indicate the researcher who performs the crop by initials [SG, JR] 
 SG


### Read in wav

In [6]:
y_all, sr=librosa.load('wav_files/' + recording_name + '.wav', sr=sample_rate, mono=False)

In [7]:
#Select first track
y=y_all[0] #only first track

In [8]:
y

array([ 0.02192914,  0.00633857, -0.01353262, ..., -0.04321598,
       -0.04496078,  0.        ], dtype=float32)

### Start time in seconds since epoch

In [10]:
start_datetime_str=recording_name[:20]
start_datetime_abs=datetime.datetime.strptime(start_datetime_str, '%Y-%m-%d--%H-%M-%S').timestamp()

### Onsets & offsets in csv are in seconds => Convert to samples by multiplying them with the fs

second * fs = samples


In [11]:
time_stamps.onset=time_stamps.onset.apply(lambda x: math.floor(x)) # Round down
time_stamps.offset=time_stamps.offset.apply(lambda x: math.ceil(x)) # Round up 
time_stamps['onset_samples']=time_stamps.onset.apply(lambda x: x*sr) 
time_stamps['offset_samples']=time_stamps.offset.apply(lambda x: x*sr) 

### Function for cropping wav and saving

In [12]:
try:
    os.mkdir('./call_segments/' + recording_name)
    
except Exception as e:
    print(e)

[Errno 17] File exists: './call_segments/2017-02-04--10-25-15--00-05-25--C'


In [85]:
def segment_onset_utc(row, start_datetime_abs):
    # segment_onset_utc
    onset_segment_utc=start_datetime_abs+row.onset
    onset_utc_str=datetime.datetime.fromtimestamp(onset_segment_utc).strftime("%Y-%m-%d--%H-%M-%S")
    
    return onset_utc_str
    
def segment_duration(row):
    #duration_seconds
    duration_segment=row.offset-row.onset
    duration_utc_str=time.strftime('--%H-%M-%S', time.gmtime(duration_segment))
    
    return duration_utc_str
    
        
def recording_file_name(row, recording_name):
    
    call_filename= row.onset_utc_str + row.duration_utc_str
    
    #Extract and add if recorder info exists
    try:
        recorder=recording_name[30:]
        call_filename=call_filename+recorder + '.wav'
    except:
        print('recorder not specified')
        call_filename=call_filename  + '.wav'
        
    return call_filename


def crop_call(row, y, sr, recording_name, start_datetime_abs):
    call=y[row.onset_samples: row.offset_samples]  # Interval closed at the beginning and open at the end
    call_path='./call_segments/' + recording_name +'/' +  row.recording_file_name 
    
    sf.write(call_path, call, sr, format='WAV') 

    return call_path

### Apply functions

In [87]:
time_stamps['onset_utc_str']=time_stamps.apply(lambda x: segment_onset_utc(x, start_datetime_abs), axis=1)
time_stamps['duration_utc_str']=time_stamps.apply(lambda x: segment_duration(x), axis=1)
time_stamps['recording_file_name']=time_stamps.apply(lambda x: recording_file_name(x, recording_name), axis=1)
time_stamps['call_wav_path']= time_stamps.apply(lambda x: crop_call(x, y, sr, recording_name, start_datetime_abs), axis=1)

### Prepare a metadata dataframe

In [88]:
def prep_metadata(time_stamps, sample_rate, ref_value, cal_value, researcher, raw_signal_unit, recording_name):
    meta_df=pd.DataFrame()
    
    meta_df['segment_file']=time_stamps.apply(lambda x: x.recording_file_name, axis=1)
    meta_df['wav_origin']=pd.Series([recording_name + '.wav']*len(time_stamps))
    
    meta_df['sample_rate']=pd.Series([sample_rate]*len(time_stamps))
    meta_df['raw_signal_unit']=pd.Series([raw_signal_unit]*len(time_stamps))
    meta_df['reference_value']=pd.Series([ref_value]*len(time_stamps))
    meta_df['calibration_value']=pd.Series([cal_value]*len(time_stamps))
    meta_df['researcher']=pd.Series(['SG']*len(time_stamps))
    
    meta_df['start_samples']=time_stamps.onset_samples
    meta_df['end_samples']=time_stamps.offset_samples
    meta_df['number_of_samples']=time_stamps.offset_samples-time_stamps.onset_samples
    
    meta_df['start_seconds']=time_stamps.onset
    meta_df['end_seconds']=time_stamps.offset
    meta_df['duration_seconds']=meta_df.number_of_samples/meta_df.sample_rate
    
    meta_df['start_utc']=time_stamps.onset_utc_str
    
    return meta_df

In [89]:
researcher='SG'
meta_df=prep_metadata(time_stamps, sample_rate, ref_value, cal_value, researcher, raw_signal_unit, recording_name)

In [90]:
meta_df

Unnamed: 0,segment_file,wav_origin,sample_rate,raw_signal_unit,reference_value,calibration_value,researcher,start_samples,end_samples,number_of_samples,start_seconds,end_seconds,duration_seconds,start_utc
0,2017-02-04--10-28-03--00-00-01--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,16800000,16900000,100000,168,169,1.0,2017-02-04--10-28-03
1,2017-02-04--10-28-08--00-00-04--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,17300000,17700000,400000,173,177,4.0,2017-02-04--10-28-08
2,2017-02-04--10-28-15--00-00-03--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,18000000,18300000,300000,180,183,3.0,2017-02-04--10-28-15
3,2017-02-04--10-28-22--00-00-04--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,18700000,19100000,400000,187,191,4.0,2017-02-04--10-28-22
4,2017-02-04--10-28-36--00-00-03--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,20100000,20400000,300000,201,204,3.0,2017-02-04--10-28-36
5,2017-02-04--10-28-45--00-00-03--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,21000000,21300000,300000,210,213,3.0,2017-02-04--10-28-45
6,2017-02-04--10-29-32--00-00-04--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,25700000,26100000,400000,257,261,4.0,2017-02-04--10-29-32
7,2017-02-04--10-29-52--00-00-03--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,27700000,28000000,300000,277,280,3.0,2017-02-04--10-29-52
8,2017-02-04--10-29-58--00-00-03--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,28300000,28600000,300000,283,286,3.0,2017-02-04--10-29-58
9,2017-02-04--10-30-18--00-00-04--C.wav,2017-02-04--10-25-15--00-05-25--C.wav,100000,V,1,1,SG,30300000,30700000,400000,303,307,4.0,2017-02-04--10-30-18


### Create metadata files

In [91]:
def write_metadata(row, output_dir):
    
    #Prepare filename
    metafilename=row.wav_origin[:-4] + '/' +row.segment_file[:-3]  + 'yml'
    
    #Convert row to dictionary
    d = {k: (v if type(v).__module__ != 'numpy' else v.item()) for k,v in row.to_dict().items()}
    
    
    with open(output_dir + metafilename, 'w') as outfile:
        yaml.dump(d, outfile)

In [92]:
output_dir='./call_segments/' 
meta_df.apply(lambda x: write_metadata(x, output_dir), axis =1)

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
9    None
dtype: object

### Save csv under new name

In [19]:
time_stamps.to_csv('call_data/call_data_' + recording_name + '.csv')

### Read in metadata