In [1]:
# import the dependencies
import csv

import cv2
import h5py
import numpy as np
import scipy
import scipy.signal
import soundfile as sf

In [3]:
# Google Colab only: run this cell to mount Google Drive and make it the
# working directory. Skip it when running outside Colab.
from google.colab import drive
import os
# Colab might not have the soundfile package installed in its environment.
# If the soundfile import in the first cell fails, run the following line to
# install it and then re-run the imports:
# !pip install soundfile

# Give Colab permission to access your Drive files; the Drive is mounted at
# /content/drive.
drive.mount('/content/drive')
# Change the working directory to the mount point so that the relative
# "My Drive/..." paths used in the cells below resolve inside the mounted Drive.
os.chdir('/content/drive/')

Mounted at /content/drive


# Parse the label file <br>

This is our approach to parsing the label file. You do not have to do it this way: the parsing below depends on how the data is stored in our label file, so adapt it to the layout of your own label file.

The main idea is to retrieve, for each label, all the information needed to plot and crop the spectrogram: the path to the wav file, the start and end time, the minimum and maximum frequency, the sonotype, and the taxonomic group.

In [11]:
# path to the label file
labelDir = "My Drive/Stethoscope/" # path to the folder storing label files
labelFile = "sample_labels.txt" # label file name
soundDir = "My Drive/Stethoscope/" # path to the folder storing wav files

# weights that turn [hours, minutes, seconds] into seconds of the day.
# The trailing 0.0001 is the historical scale for a 4-digit fraction; it is
# kept for reference only, the fraction is now scaled by its own digit count
# (see clock_to_seconds).
timeWeight = [3600, 60, 1, 0.0001]

# lists to store data; entry i of every list describes the same label
times = [] # times in the wav file
freqs = []
files = []
sonotypes = []
actTimes = [] # time of the day
groups = [] # taxonomic group
selection = [] # this is not required and only used to trace the labels


def clock_to_seconds(clock):
  """Convert a clock string to seconds since midnight.

  The string holds hours, minutes and seconds, optionally followed by a
  fractional part; fields may be separated by ":" or "."
  (e.g. "06:12:34.5678"). The fraction is scaled by its own number of
  digits, so ".5" is 0.5 s and ".5678" is 0.5678 s.

  Raises ValueError if a field is not an integer.
  """
  fields = clock.strip().replace('.', ':').split(':')
  # zip stops at the shorter sequence, so missing trailing fields count as 0
  seconds = sum(w * int(v) for w, v in zip(timeWeight, fields[:3]))
  if len(fields) > 3:
    fraction = fields[3].strip()
    seconds += int(fraction) / 10 ** len(fraction)
  return seconds


def parse_label_row(row):
  """Extract everything needed to crop one label from a label-file row.

  Returns a dict with the keys "sonotype", "time", "actTime", "freq",
  "file", "group" and "selection".

  Raises ValueError or IndexError when a column is missing or malformed, so
  the caller can skip the whole row and keep the result lists aligned.
  """
  # in our label file, the begin and end time are not related
  # to the actual position in the sound file,
  # so we only use them to retrieve the duration of the sound
  duration = float(row[4]) - float(row[3])

  # column 10 holds the actual start time as a clock string;
  # convert it to seconds of the day
  actSta = clock_to_seconds(row[10])
  # use the duration to retrieve the end time in seconds of the day
  actEnd = actSta + duration

  # column 7 holds a Windows-style path; keep only the wav file name
  wavName = row[7].split("\\")[-1]

  # the recording start time (HHMMSS) is stored in the wav file name.
  # NOTE(review): the original note says it may sit at index 1 or 2 of the
  # "_"-separated name; only index 1 (as in the sample file) is handled here.
  strTime = wavName.split("_")[1]
  if len(strTime) < 6 or not strTime[:6].isdecimal():
    raise ValueError("no HHMMSS start time in file name: " + wavName)
  # convert to seconds of the day
  recSta = (int(strTime[0:2]) * timeWeight[0]
            + int(strTime[2:4]) * timeWeight[1]
            + int(strTime[4:6]) * timeWeight[2])

  # start, end time (seconds from the start of the wav file) used to crop the specs
  start = actSta - recSta
  end = start + duration

  return {
      "sonotype": int(row[11]),
      "time": [start, end],
      "actTime": [actSta, actEnd],
      "freq": [float(row[5]), float(row[6])],
      "file": soundDir + wavName,
      "group": row[12],
      "selection": int(row[0]),
  }


labelFilePath = labelDir + labelFile
# newline='' lets the csv module handle line endings itself
with open(labelFilePath, newline='') as file:
  # read file, info in each row is separated by tabs
  label_reader = csv.reader(file, delimiter='\t')

  for row in label_reader:
    # skip blank rows and the header row: the first row contains attributes,
    # every data row starts with the selection number
    if not row or not row[0].isnumeric():
      continue

    # parse the complete row before touching any list, so that a bad column
    # can never leave the lists with different lengths
    try:
      label = parse_label_row(row)
    except (ValueError, IndexError) as err:
      # exception: print file and selection number, then skip this row
      print(soundDir + row[7].split("\\")[-1] if len(row) > 7 else "<unknown file>")
      print(row[0])
      print(err)
      continue

    sonotypes.append(label["sonotype"])
    times.append(label["time"])
    actTimes.append(label["actTime"])
    freqs.append(label["freq"])
    files.append(label["file"])
    groups.append(label["group"])
    selection.append(label["selection"])

    # uncomment the following lines to print data related to this label
    # print("filename: %s" % (label["file"]))
    # print("Start time: %f, End time: %f" % tuple(label["time"]))
    # print("Actual Start time: %f, Actual End time: %f" % tuple(label["actTime"]))
    # print("minimum frequency: %f, maximum frequency: %f \n" % tuple(label["freq"]))

# some stats
print("dataset size:  %i" % len(times))
if files:
  print("filename sample: " + files[0])

dataset size:  35
filename sample: My Drive/Stethoscope/20180908_060000_13A_24H.wav


# Create H5 data storage


In [13]:
h5Path = 'My Drive/Stethoscope/sample_data.hdf5'

# Create the HDF5 file with one empty, resizable dataset per label attribute.
# Mode 'w' creates the file and overwrites an existing file at h5Path, so
# re-running this cell starts from an empty store.
# Every dataset starts with 0 rows; maxshape=None on axis 0 lets the
# processing cell grow it as labels are appended.
# The `with` block closes the file even if one create_dataset call fails.
with h5py.File(h5Path, 'w') as f:
  # cropped spectrogram images, 224 x 224 x 3 per label
  f.create_dataset("specs", (0,224,224,3,), maxshape=(None,None,None,None,), chunks=True)
  # NOTE(review): no dtype is given for the numeric datasets, so h5py's default
  # dtype applies (a floating-point type per the h5py docs) even for the
  # integer sonotype / selection ids -- confirm this is intended.
  f.create_dataset("sonotypes", (0,),  maxshape=(None,), chunks=True)
  # [start, end] per label
  f.create_dataset("times", (0,2,), maxshape=(None,None,), chunks=True)
  # [minimum, maximum] frequency per label
  f.create_dataset("freqs", (0,2,), maxshape=(None,None,), chunks=True)
  # fixed-width byte strings: "S10" holds at most 10 bytes per group name
  f.create_dataset("groups", (0,),  maxshape=(None,), chunks=True, dtype="S10")
  f.create_dataset("selections", (0,),  maxshape=(None,), chunks=True)

# Get the data according to the labels and store it in the database


In [20]:
# path to all the sound files
fileDirPath = ""
# fileDirNames = ["13AB/", "Control sites/"]

# width and height of the stored spectrogram images
specSize = (224, 224)


def box_to_slices(freq, t, boxFreq, boxTime):
  """Return the (frequency, time) slices of the spectrogram for one label box.

  freq and t are the frequency and time axes returned by the spectrogram,
  boxFreq is [minimum, maximum] frequency and boxTime is [start, end] time
  of the label. Each edge is first snapped to the nearest bin and then moved
  one bin outwards when the nearest bin lies inside the box.
  """
  # nearest bin for every edge of the box
  low_freq = int(np.argmin(np.abs(freq - boxFreq[0])))
  high_freq = int(np.argmin(np.abs(freq - boxFreq[1])))
  start = int(np.argmin(np.abs(t - boxTime[0])))
  end = int(np.argmin(np.abs(t - boxTime[1])))

  # adjust the params to cover the whole box.
  # Lower edges are clamped at 0; upper edges may run one past the last
  # index, which is safe because they are only used as slice bounds.
  if freq[low_freq] > boxFreq[0]:
    low_freq = max(low_freq - 1, 0)
  if freq[high_freq] < boxFreq[1]:
    high_freq = high_freq + 1
  if t[start] > boxTime[0]:
    start = max(start - 1, 0)
  if t[end] < boxTime[1]:
    # extend the END of the box (the original code moved `start` here, which
    # cut off the beginning of the sound instead of covering its end)
    end = end + 1

  return slice(low_freq, high_freq), slice(start, end)


def append_rows(dataset, rows):
  """Grow an HDF5 dataset along axis 0 and write `rows` into the new tail."""
  count = len(rows)
  dataset.resize(dataset.shape[0] + count, axis=0)
  dataset[-count:] = rows


# all the files contained in the label file
# In the sample, we only have one file
usedFiles = np.unique(files)
# print(usedFiles)

# NOTE: this cell appends to the HDF5 file at h5Path; re-run the cell that
# creates the file first if you want to avoid storing the labels twice.
for curFile in usedFiles:
  print("Processing " + curFile)

  filePath = fileDirPath + curFile

  # read the wav file; skip files that cannot be read, but let
  # KeyboardInterrupt through so a long run can still be stopped
  try:
    audio, rate = sf.read(filePath)
  except Exception as err:
    print("ERROR READING FILE")
    print(err)
    continue

  # multi-channel audio comes back as (frames, channels); the spectrogram
  # works along the last axis, so mix down to mono first
  if audio.ndim > 1:
    audio = audio.mean(axis=1)

  # used to store info after resizing the image
  specs_resized = []
  freqs_updated = []
  times_updated = []
  sonotypes_updated = []
  groups_updated = []
  selections_updated = []

  # compute the spectrogram
  # default setting of scipy, tukey window and 0.25 for shape parameter
  freq, t, spec = scipy.signal.spectrogram(audio, rate)

  # An alternative is using the plt spectrogram, but we do not use this
  # spec,freq, t,im = plt.specgram(audio,Fs=rate)
  # im = None
  # plt.clf() # clean the plt backend to clear up ram

  for i in range(len(files)):
    # only crop the labels that belong to the current wav file
    if files[i] != curFile:
      continue

    # label time and frequency -> index ranges in the spectrogram
    freq_slice, time_slice = box_to_slices(freq, t, freqs[i], times[i])

    # slicing gives a view; astype below makes the (small) copy, so the
    # original spectrogram is never altered and never copied as a whole
    cur_spec = spec[freq_slice, time_slice]

    if cur_spec.size == 0: # just in case
      continue

    # resize the crop to 224*224 and add a trailing channel axis
    spec_resized = cv2.resize(cur_spec.astype('float32'), specSize)
    spec_resized = np.expand_dims(spec_resized, 2)
    # flip the frequency axis (axis 0) upside down
    spec_resized = np.flip(spec_resized, 0)

    # put to the list to add to the HDF5 file
    specs_resized.append(spec_resized)
    freqs_updated.append(freqs[i])
    times_updated.append(actTimes[i])
    sonotypes_updated.append(sonotypes[i])
    groups_updated.append(groups[i])
    selections_updated.append(selection[i])

  # store to H5 data storage
  if len(specs_resized) != 0:
    # all the *_updated lists have the same length, one entry per label;
    # the `with` block closes the file even if a write fails
    with h5py.File(h5Path, "a") as f:
      append_rows(f["times"], times_updated)
      append_rows(f["freqs"], freqs_updated)
      # NOTE(review): each image is (224, 224, 1) while the dataset's last
      # axis has length 3; as in the original code this relies on h5py
      # broadcasting the single channel on write -- confirm.
      append_rows(f["specs"], specs_resized)
      append_rows(f["sonotypes"], np.array(sonotypes_updated))
      append_rows(f["groups"], np.array(groups_updated, dtype="S"))
      append_rows(f["selections"], np.array(selections_updated))
  else:
    # no spectrogram found for cur file, should be an exception
    # just print out the name
    print("No label found: ", curFile)


Processing My Drive/Stethoscope/20180908_060000_13A_24H.wav
