In [3]:
import glob
import os
import shutil

import numpy as np
import pandas as pd
from pydub import AudioSegment


In [4]:
def sort_usa(dataframe):
    """Tidy an annotation table of the USA data.

    The columns that are used later on lose the leading blank in their name and
    the columns that are not needed are removed. The frame that is passed in is
    not modified; the cleaned frame is returned.
    """
    #column name with leading blank -> column name without blank
    clean_names = {
        " id": "id",
        " channel": "channel",
        " start_sec_file_seq": "start_sec_file_seq",
        ' duration': 'duration',
        ' min_freq': 'min_freq',
        ' max_freq': 'max_freq',
        ' bandwidth': 'bandwidth',
        ' tags': 'tags',
        ' file': 'file',
    }
    #columns that are not needed for the further processing
    unused_columns = [' score', ' rating', ' notes', 'log', 'start_sec_in_file']

    renamed = dataframe.rename(columns=clean_names)
    return renamed.drop(columns=unused_columns)

In [5]:
def _split_bracket_pair(column):
    "Turns strings of the form '[a,b]' into a frame with two float columns (0 and 1), rounded to 4 decimals"
    #regex=False: "[" and "]" are meant literally. A lone "[" is not a valid regular expression and
    #the default of the regex argument differs between pandas versions.
    stripped = column.str.replace("[", "", regex=False).str.replace("]", "", regex=False)
    return stripped.str.split(",", expand=True).astype(float).round(decimals=4)


def sort_noise(dataframe):
    """Sorts and renames the data columns for further use. This is for the USA data which contain only noise.

    Expects the string columns "time" and "freq", each holding a bracketed pair
    "[first,second]". The returned frame has the columns id, channel,
    start_sec_file_seq, stop_sec_file_seq, duration, min_freq, max_freq,
    bandwidth and tags, in this order. Columns that are missing in the input
    are created by the reindex and filled with NaN.

    The frame that is passed in is not modified; a new frame is returned.
    """
    #work on a copy so the frame of the caller keeps its "time" and "freq" columns
    dataframe = dataframe.copy()

    #gets the start and end time of an event
    dataframe[["start_sec_file_seq", "stop_sec_file_seq"]] = _split_bracket_pair(dataframe["time"])

    #gets the min and max frequency of an event
    dataframe[["min_freq", "max_freq"]] = _split_bracket_pair(dataframe["freq"])

    #gets the duration and bandwidth of an event
    dataframe["duration"] = dataframe["stop_sec_file_seq"] - dataframe["start_sec_file_seq"]
    dataframe["bandwidth"] = dataframe["max_freq"] - dataframe["min_freq"]

    #because all the events are noise the column tags gets the value Ano_1000 (noise)
    dataframe["tags"] = 'Ano_1000'

    #deletes the columns that are not needed any more
    dataframe = dataframe.drop(columns=["time", "freq"])

    #brings the columns into a fixed order (columns that are not listed here are dropped)
    dataframe = dataframe.reindex(columns=['id', 'channel', 'start_sec_file_seq', 'stop_sec_file_seq', 'duration',
                                           'min_freq', 'max_freq', 'bandwidth', 'tags'])

    return dataframe

In [13]:
def sort_scotland(dataframe):
    """Bring an annotation table of the Scotland data into the layout of the USA tables.

    Renames the columns, drops every event without a label, rescales the start
    and end time, adds the duration and an empty file column, removes the
    columns that are not needed and returns the events ordered by start time.
    The frame that is passed in is not modified.
    """
    #names of the Scotland export -> names used in the USA dataframe
    usa_names = {'Begin Time (s)': "start_sec_file_seq", 'End Time (s)': "stop_sec_file_seq",
                 "tags": "detector_tags", 'notes': "tags"}

    #after the renaming "tags" holds the former notes column; events without a value there are unlabeled
    labelled = dataframe.rename(columns=usa_names).dropna(subset=['tags'])

    #makes the start and end time readable
    #NOTE(review): dividing by 1000000000 looks like nanoseconds -> seconds, confirm against the export format
    begin = (labelled["start_sec_file_seq"] / 1000000000).round(decimals=4)
    end = (labelled["stop_sec_file_seq"] / 1000000000).round(decimals=4)

    #duration and the (still empty) file column are appended behind the existing columns
    labelled = labelled.assign(start_sec_file_seq=begin,
                               stop_sec_file_seq=end,
                               duration=end - begin,
                               file=np.nan)

    #removes all columns that the USA dataframe does not have
    labelled = labelled.drop(columns=['View', 'rating', 'score', 'detector_tags'])

    return labelled.sort_values(by="start_sec_file_seq")


In [9]:
#Loads the log information of all five USA sets and cleans every set
datasets_usa = []

#ids of events in set 1 which drop_duplicates does not catch; they are removed by hand
del_list_1 = [2121,788,1938]

for number in range(1, 6):
    #two different kinds of log information are used, each one is cleaned by its own function
    dataframe = sort_usa(pd.read_csv("Path/to/USA_annonations"))
    noise_path = "MasterSoundLibrary_logs_DR/del_events_"+str(number)+".xlsx"
    dataframe_noise = sort_noise(pd.read_excel(noise_path, engine='openpyxl'))

    #merges both dataframes together
    dataframe = dataframe.merge(dataframe_noise, how = "outer")

    #duplicates which drop_duplicates does not sort out are sorted out manually
    if number == 1:
        dataframe = dataframe[~dataframe["id"].isin(del_list_1)]
    elif number == 5:
        dataframe = dataframe[dataframe["id"]!= 434]

    #drops all the remaining duplicates (the last one is kept) and sorts by start time
    dataframe = dataframe.drop_duplicates(subset=['start_sec_file_seq'], keep = "last")
    dataframe = dataframe.sort_values(by= "start_sec_file_seq")
    datasets_usa.append(dataframe)

 

In [14]:
#identifiers of the Scottish sets (presumably recording dates as YYYYMMDD, TODO confirm)
dirs = [20190502, 20190614, 20190616, 20190702]
data_scotland = []
for number in dirs:
    #reads the raw annotation sheet first and cleans it afterwards
    dataframe = pd.read_excel("Path/to/Scotland_annonations", engine='openpyxl')
    dataframe = sort_scotland(dataframe)
    data_scotland.append(dataframe)


In [11]:
#numbers of the test sets; the test data is also from Scotland
dirs_test = [1, 2, 3, 4, 5]
data_test = []
for number in dirs_test:
    #reads the raw annotation sheet first and cleans it with the Scotland function afterwards
    dataframe = pd.read_excel("Path/to/Test_annonations", engine='openpyxl')
    dataframe = sort_scotland(dataframe)
    data_test.append(dataframe)

In [9]:
def rename_file( iD,logs,starttime,endtime,usa):
    """Build the file name of one exported event following the name template of Orca-Spot.

    The name has the form <type>_<iD>_<year>_<tapename>_<start>_<end>.wav.

    iD        -- running number of the clip
    logs      -- one event; must provide the entries "file" and "tags"
    starttime -- start of the event, only the part in front of the decimal point is used
    endtime   -- end of the event, only the part in front of the decimal point is used
    usa       -- truthy for recordings from the USA (year 2008), otherwise Scotland (year 2018)
    """
    #only the part in front of the decimal point goes into the name
    start_part = str(starttime).split(".")[0]
    end_part = str(endtime).split(".")[0]

    #tape name: file name up to the first dot, underscores replaced because "_" separates the name fields
    tapename = logs["file"].split(".")[0].replace("_","-")

    #USA recordings get the year 2008 assigned, Scottish recordings the year 2018
    year = str(2008) if usa else str(2018)

    #the tag decides whether the event is a call or noise
    call_tags = (" Bac_3100", ' minke_pt', 'Minke')
    type_of_call = "call-Minkwhale" if logs["tags"] in call_tags else "noise"

    #puts all fields together
    name_fields = [type_of_call, str(iD), year, tapename, start_part, end_part]
    return "_".join(name_fields) + ".wav"

In [10]:
def audio_split(audio,iD,logs,duration,file_name,usa,
                folder_dest="Path/to/destination", min_duration=2.5):
    """Splits the audio file so only the events get exported.

    audio        -- the merged recording of one set; it is sliced in milliseconds and every
                    slice must offer duration_seconds and export() (a pydub AudioSegment)
    iD           -- number that is given to the first exported clip
    logs         -- dataframe of the events, sorted by start time, with the columns
                    "start_sec_file_seq", "stop_sec_file_seq" (seconds inside the merged
                    recording) and "tags". The column "file" is filled in by this function.
    duration     -- cumulated duration in seconds at the end of every single file
    file_name    -- names of the single files, same order as duration
    usa          -- passed on to rename_file (selects the year in the file name)
    folder_dest  -- folder the clips are written to
    min_duration -- only clips that are longer than this many seconds are exported

    Returns the next free ID so every exported file has a unique ID.

    Every event, including the last one of the set, is handled the same way:
      * if the event reaches into the next event it is cut at the start of the next event
      * the source file is found by stepping through duration, which also works when a
        file contains no event at all
      * the times in the file name are relative to the start of the source file and use
        the annotated (uncut) end time
    """
    #the file names are strings, an all-NaN float column can not hold them
    if "file" not in logs.columns:
        logs["file"] = None
    logs["file"] = logs["file"].astype(object)
    file_col = logs.columns.get_loc("file")

    n_events = len(logs)
    last_file = len(file_name) - 1
    step = 0            #index of the single file the current event comes from
    duration_pre = 0    #cumulated duration of all files in front of file_name[step]

    #goes through all the events
    for i in range(n_events):

        #gets the start and stop of the event
        start = logs["start_sec_file_seq"].iloc[i]
        stop = logs["stop_sec_file_seq"].iloc[i]

        #checks if the next start is smaller than the current end time, if so the events overlap.
        #To handle the overlap the next start is the new end time. The last event has no successor.
        if i < n_events - 1:
            start_next = logs["start_sec_file_seq"].iloc[i + 1]
            if stop > start_next:
                stop = start_next

        #checks from which file the event is; never steps behind the last file
        while step < last_file and start >= duration[step]:
            duration_pre = duration[step]
            step += 1
        #positional write on the frame itself, no chained assignment
        logs.iloc[i, file_col] = file_name[step]

        #extracts the event from the merged recording (slices are in milliseconds)
        starttime = round(float(start) * 1000, 4)
        endtime = round(float(stop) * 1000, 4)
        split_audio = audio[starttime:endtime]

        #start and end time relative to the start of the file the event came from
        startname = round(float(start - duration_pre) * 1000, 4)
        endname = round(float(logs["stop_sec_file_seq"].iloc[i] - duration_pre) * 1000, 4)

        #only events which are long enough get exported
        if split_audio.duration_seconds > min_duration:
            #gets the name template
            filename = rename_file(iD, logs.iloc[i], startname, endname, usa)
            #exports the audio data into the destination folder
            split_audio.export(os.path.join(folder_dest, filename), format="wav")
            iD += 1

    #returns the ID so every file has a unique ID
    return iD

In [1]:
def merge_files(file_1,file_2):
    """Appends a wav file to audio that is already loaded.

    file_1 -- the audio loaded so far (it is combined with an AudioSegment by +)
    file_2 -- wav file that is read with AudioSegment.from_wav and appended

    Returns the combined audio.
    """
    appended = AudioSegment.from_wav(file_2)
    merged = file_1 + appended
    return merged

In [None]:
folder_path = glob.glob("Path/to/orginal_data/*")

files = []
#stays 0 when the folder contains no file
file_1 = 0
#merges the data of one set together, in the order glob returns the files
for i, file in enumerate(folder_path):
    #the first file starts the merged recording, every further file is appended to it
    file_1 = AudioSegment.from_wav(file) if i == 0 else merge_files(file_1, file)
files.append(file_1)

In [None]:
iD= 1
#goes through all sets, creates the audio clips and exports them to the destination folder
for n in range(5):

    folder_path = glob.glob("Path/to/orginal_data/*")
    #the USA sets are stored in datasets_usa (usa=True below); the name df is kept as a global
    df = datasets_usa[n]

    #for storing the cumulated duration at the end of every single file and the filename
    duration =[]
    file_name=[]
    dur = 0

    for file in folder_path:
        #gets the duration of a single file and the filename
        file_1 = AudioSegment.from_wav(file)
        dur += np.float32(file_1.duration_seconds)

        #os.path.basename works with both "/" and "\\" as separator on Windows and with "/" elsewhere
        file_name.append(os.path.basename(file))
        duration.append(dur)

    #the merged recording of set n; files must hold one merged recording per set
    file_1 =files[n]
    iD = audio_split(file_1,iD,df,duration,file_name, usa=True)