In [1]:
import numpy as np
from plots import *
import scipy.io
import pandas as pd
from pathlib import Path

def plot_mags(path, t_win=1, sf=False):
    """Plot the magnitude spectrum of a waveform stored in a text file.

    Parameters
    ----------
    path : str or Path
        Text file readable by ``np.loadtxt`` containing the waveform.
    t_win : number, optional
        Forwarded unchanged to ``get_mags`` as its ``t_win`` argument.
    sf : bool, optional
        If True, also save the figure as ``<name>.png`` in the current working
        directory, where ``<name>`` is the file name up to its first ``.``.
    """
    wf = np.loadtxt(path)
    # get_mags comes from the star import of `plots`; the sample rate is fixed at 44.1 kHz here
    m = get_mags(wf, sr=44100, t_win=t_win, dict=True)
    mags = m['mags']
    freq_ax = m['freq_ax']
    plt.plot(freq_ax, np.log10(mags)*10)
    # Take the bare file name whichever separator the path uses. Splitting on "\\" alone
    # left the whole path in the title (and in the save name) for "/"-separated paths.
    fname = str(path).replace("\\", "/").split("/")[-1]
    plt.title(fname)
    if sf:
        plt.savefig(fname.split(".")[0] + ".png")
    plt.show()
def plot_supp(path, save=False):
    """Plot a two-column (frequency, magnitude) spectrum stored in a text file.

    Parameters
    ----------
    path : str or Path
        Text file readable by ``np.loadtxt``; column 0 is plotted on the x axis
        and column 1 on the y axis.
    save : bool, optional
        If True, also save the figure as ``<name>.png`` in the current working
        directory, where ``<name>`` is the file name up to its first ``.``.
    """
    data = np.loadtxt(path)
    freqs = data[:, 0]
    mags = data[:, 1]
    plt.plot(freqs, mags)
    # Take the bare file name whichever separator the path uses. Splitting on "\\" alone
    # left the whole path in the title (and in the save name) for "/"-separated paths.
    fname = str(path).replace("\\", "/").split("/")[-1]
    plt.title(fname)
    if save:
        plt.savefig(fname.split(".")[0] + ".png")
    plt.show()

def has_supptone(data, cutoff=300, threshold=20):
    """Report whether a spectrum has a strong component above a cutoff frequency.

    Parameters
    ----------
    data : 2-D array
        Column 0 holds frequencies and column 1 the matching magnitudes.
    cutoff : number, optional
        Only rows from the first frequency strictly greater than this value
        onward are inspected.
    threshold : number, optional
        Magnitude that must be exceeded for the result to be True (default 20,
        the value that used to be hard-coded).

    Returns
    -------
    bool
        True if any inspected magnitude is greater than ``threshold``. False
        otherwise, including when no frequency exceeds ``cutoff`` (this case
        used to raise IndexError).
    """
    freqs = data[:, 0]
    mags = data[:, 1]
    above = np.flatnonzero(freqs > cutoff)
    if above.size == 0:
        # nothing above the cutoff, so there is nothing to test
        return False
    # Slice from the first index above the cutoff to the end of the spectrum
    i_cutoff = above[0]
    return bool(np.max(mags[i_cutoff:]) > threshold)
    
    
# Root directory that holds every SOAE data folder on this machine.
# Every backslash is now doubled: the previous "\O" was an invalid escape sequence,
# which Python only tolerates with a warning. The resulting string is unchanged.
# The trailing separator matters because later cells build paths as `main_path_str + folder`.
main_path_str = "C:\\Users\\Owner\\OneDrive\\Desktop\\SOAE Data\\"
main_path = Path(main_path_str)
# we'll process each subfolder separately since each is likely to have its own quirks

In [None]:
# Walk every file under "UWO Data", classify it by keywords in its name, and collect the
# usable spectra / waveforms into one table that is written to "<folder>.parquet".
folder = "UWO Data"

# We'll build our dataframe by making a dictionary of lists and appending to them.
# Every kept file must add exactly one entry to each list, otherwise pd.DataFrame() fails.
dataframe = {
    'filepath': [],
    'freqs': [],
    'spectrum': [],
    'wf': [],
    'species': [],
    'sr': [],
}

# First navigate to our directory
main_path = Path(main_path_str)
directory_path = Path(main_path_str + folder)

# total number of files in the folder (including ones we end up skipping)
n_files = sum(1 for _ in directory_path.rglob('*') if _.is_file())

# track how many of each filetype we have
n_current = 0
n_readme = 0
n_tube = 0
n_oral = 0
n_earsoae = 0
n_supptone = 0
n_suppgood = 0
n_wf = 0

unknownspecies = []
earsoae = []

# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Make sure it's a file
    if not fp.is_file():
        continue

    # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
    rel_path = fp.relative_to(main_path)
    fps = str(rel_path)

    # Also get subfolder (if applicable)
    subfolder = rel_path.parts[1] if len(rel_path.parts) > 1 else "NA"

    # Get the filename itself (without its containing folders), extension, and uppercase version
    fn = fp.name
    ext = fp.suffix
    fnU = fn.upper()

    # Treat different files differently based on keywords in filename

    # NOTE: this must be a real tuple. ('.txt' '.mat') is the single string '.txt.mat',
    # which turned the test into a substring check that let '' and '.t' through.
    if "README" in fnU or ext not in ('.txt', '.mat'):
        # print(f"Skipping {fps} -- README or wrong extension")
        n_readme += 1
        continue

    elif "TUBE" in fnU:
        # print(f"Skipping {fps} -- Tube file")
        n_tube += 1
        continue

    elif "SUPP" in fnU and "SOAE" in fnU:
        if ext != '.txt':
            raise ValueError(f"Supp file from {fps} isn't .txt!")
        data = np.loadtxt(fp)
        # Validate the shape explicitly. The old try/except re-raised a plain string,
        # which is itself a TypeError and hid the real problem.
        if data.ndim != 2:
            raise RuntimeError(f"Supp file from {fps} isn't 2D!")
        if data.shape[1] != 2:
            raise RuntimeError(f"Supp file from {fps} isn't two columns!")
        if has_supptone(data):
            # plot_supp(fp)
            print(f"Skipping {fps} -- true suppression tone!")
            n_supptone += 1
            if "NOSUPP" in fnU:
                raise RuntimeError(f"Our suppression tone detector is wrong! {fps} shouldn't have a suppression tone...")
            continue
        # pull out frequency axis and spectrum and add to dataframe dictionary
        dataframe['freqs'].append(data[:, 0])
        dataframe['spectrum'].append(data[:, 1])
        # add a samplerate of 0 and an empty value to the waveform to fill the space
        dataframe['sr'].append(0)
        dataframe['wf'].append(None)
        # record we got a good one
        n_suppgood += 1

    elif ("EAR" in fnU and "SOAE" in fnU) and ("WF" not in fnU and "WAVEFORM" not in fnU and "SUPP" not in fnU):
        print(f"Skipping {fps} -- Chris can't say if good or not")
        earsoae.append(fps)
        n_earsoae += 1
        continue

    elif "WF" in fnU or "WAVEFORM" in fnU:
        # we must have a waveform if we got here
        # Any failure skips the file entirely so the column lists stay the same length and
        # a waveform left over from the previous iteration is never stored under this name.
        try:
            if ext == '.mat':
                mat = scipy.io.loadmat(fp)
                if 'wf' not in mat:
                    print(f"Not sure how to process {fps}")
                    continue
                wf = np.squeeze(mat['wf'])
            else:
                # ext is '.txt' here thanks to the extension filter above
                wf = np.loadtxt(fp)
            # Let's make sure this waveform is a 1D array
            if wf.ndim > 1:
                raise RuntimeError(f"Waveform from {fps} isn't 1D!")
        except Exception as err:
            print(f"Uh oh! Issue when loading {fps}: {err}")
            continue
        # add to the dataframe
        dataframe['sr'].append(44100)
        dataframe['wf'].append(wf)
        # add empty values to the spectrum and freqs to fill the space
        dataframe['freqs'].append(None)
        dataframe['spectrum'].append(None)
        n_wf += 1
    else:
        raise RuntimeError(f"UH OH {fps} didn't fall into any categories:")

    # Get species from the filename prefix. startswith() is used so prefixes of any length
    # work; the old two-character slice could never equal "RDR".
    if fnU.startswith(("TH", "AP")):
        species = "Human"
    # Confirmed human
    elif fnU.startswith(("VE", "LN", "RDR", "PERU")):
        species = "Human"
    # Not sure
    elif fnU.startswith("RS"):
        species = "Unknown"
    else:
        species = "Unknown"
        unknownspecies.append(fps)
        # raise ValueError(f"Couldn't find species for {fps}!")

    dataframe['filepath'].append(fps)
    dataframe['species'].append(species)

    # track which file we're on
    n_current += 1
    print(f"Processed file {n_current}/{n_files}: {fps}")

print(f"FP: {len(dataframe['filepath'])}, SR: {len(dataframe['sr'])}, Spectrum: {len(dataframe['spectrum'])}, wf: {len(dataframe['wf'])}, freqs: {len(dataframe['freqs'])}, Species: {len(dataframe['species'])}")
print(f"Finished! Ignored {n_oral} mouth/oral files, {n_readme} README files, {n_tube} tube files, {n_earsoae} earsoae files, {n_supptone} suppression tone files")
print(f"We kept {n_suppgood} good suppression files and {n_wf} waveform files.")

print()

print(f"Here's all the ignored Ear + SOAE files:")
for f in earsoae:
    print(f)


print()
print()
print("Unknown (and unknown that unknown) species:")
for f in unknownspecies:
    print(f)

# turn this into a pandas dataframe
df = pd.DataFrame(dataframe)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'wf' column has different length lists)
df.to_parquet(f'{folder}.parquet', engine='pyarrow')

In [None]:
# Walk every file under "York Data", classify it by keywords in its name, and collect the
# usable spectra / waveforms into one table that is written to "<folder>.parquet".
folder = "York Data"

# We'll build our dataframe by making a dictionary of lists and appending to them.
# Every kept entry must add exactly one item to each list, otherwise pd.DataFrame() fails.
dataframe = {
    'filepath': [],
    'freqs': [],
    'spectrum': [],
    'wf': [],
    'species': [],
    'sr': [],
}

# First navigate to our directory
main_path = Path(main_path_str)
directory_path = Path(main_path_str + folder)

# total number of files in the folder (including ones we end up skipping)
n_files = sum(1 for _ in directory_path.rglob('*') if _.is_file())

# NOTE(review): the loop stops once more than this many files have been processed, so the
# saved parquet may be truncated. Looks like a debugging cap -- set to None to process all.
max_files = 500

# track how many of each filetype we have
n_current = 0
n_readme = 0
n_tube = 0
n_oral = 0
n_earsoae = 0
n_supptone = 0
n_suppgood = 0
n_dual = 0
n_wf = 0

unknownspecies = []
earsoae = []

# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Make sure it's a file
    if not fp.is_file():
        continue

    if max_files is not None and n_current > max_files:
        break

    # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
    rel_path = fp.relative_to(main_path)
    fps = str(rel_path)

    # Also get subfolder (if applicable)
    subfolder = rel_path.parts[1] if len(rel_path.parts) > 1 else "NA"

    # Get the filename itself (without its containing folders), extension, and uppercase version
    fn = fp.name
    ext = fp.suffix
    fnU = fn.upper()

    # Treat different files differently based on keywords in filename

    # NOTE: this must be a real tuple. ('.txt' '.mat') is the single string '.txt.mat',
    # which turned the test into a substring check that let '' and '.t' through.
    if "README" in fnU or ext not in ('.txt', '.mat'):
        # print(f"Skipping {fps} -- README or wrong extension")
        n_readme += 1
        continue

    elif "TUBE" in fnU:
        # print(f"Skipping {fps} -- Tube file")
        n_tube += 1
        continue

    elif "ORAL" in fnU or "MOUTH" in fnU:
        # print(f"Skipping {fps} -- Oral file")
        n_oral += 1
        continue

    elif "DUAL" in fnU and "LEFT" not in fnU and "RIGHT" not in fnU:
        # These files have two columns, one for each ear. We'll process them as separate waveforms.
        if ext != '.txt':
            raise ValueError(f"Waveform from {fps} isn't .txt!")
        wfs = np.loadtxt(fp)
        for i in (0, 1):
            # add both to the dataframe
            dataframe['sr'].append(44100)
            dataframe['wf'].append(wfs[:, i])
            # add empty values to the spectrum and freqs to fill the space
            dataframe['freqs'].append(None)
            dataframe['spectrum'].append(None)
            # now we also must add the species and filepath
            dataframe['species'].append("Anolis")
            dataframe['filepath'].append(fps + f" Column {i + 1}")
            n_wf += 1
        n_dual += 1
        # then wrap things up and continue so this doesn't happen twice
        n_current += 1
        print(f"Processed file {n_current}/{n_files}: {fps}")
        continue

    elif "SUPP" in fnU and "SOAE" in fnU:
        if ext != '.txt':
            raise ValueError(f"Supp file from {fps} isn't .txt!")
        data = np.loadtxt(fp)
        # Validate the shape explicitly. The old try/except re-raised a plain string,
        # which is itself a TypeError and hid the real problem.
        if data.ndim != 2:
            raise RuntimeError(f"Supp file from {fps} isn't 2D!")
        if data.shape[1] != 2:
            raise RuntimeError(f"Supp file from {fps} isn't two columns!")
        if has_supptone(data):
            # plot_supp(fp)
            print(f"Skipping {fps} -- true suppression tone!")
            n_supptone += 1
            if "NOSUPP" in fnU:
                raise RuntimeError(f"Our suppression tone detector is wrong! {fps} shouldn't have a suppression tone...")
            continue
        # pull out frequency axis and spectrum and add to dataframe dictionary
        dataframe['freqs'].append(data[:, 0])
        dataframe['spectrum'].append(data[:, 1])
        # add a samplerate of 0 and an empty value to the waveform to fill the space
        dataframe['sr'].append(0)
        dataframe['wf'].append(None)
        # record we got a good one
        n_suppgood += 1

    elif ("EAR" in fnU and "SOAE" in fnU) and ("WF" not in fnU and "WAVEFORM" not in fnU and "SUPP" not in fnU):
        print(f"Skipping {fps} -- Chris can't say if good or not")
        n_earsoae += 1
        earsoae.append(fps)
        continue

    elif "WF" in fnU or "WAVEFORM" in fnU:
        # we must have a waveform if we got here
        try:
            if ext == '.mat':
                mat = scipy.io.loadmat(fp)
                if 'wf' not in mat:
                    # Skip instead of falling through: otherwise the waveform left over from
                    # the previous file would be stored under this file's name.
                    print(f"Not sure how to process {fps}")
                    continue
                wf = np.squeeze(mat['wf'])
            else:
                # ext is '.txt' here thanks to the extension filter above
                wf = np.loadtxt(fp)
            # Let's make sure this waveform is a 1D array
            if wf.ndim > 1:
                raise RuntimeError(f"Waveform from {fps} isn't 1D!")
        except Exception as err:
            # loading problems are fatal in this cell; keep the original cause attached
            raise RuntimeError(f"Uh oh! Issue when loading {fps}") from err
        # add to the dataframe
        dataframe['sr'].append(44100)
        dataframe['wf'].append(wf)
        # add empty values to the spectrum and freqs to fill the space
        dataframe['freqs'].append(None)
        dataframe['spectrum'].append(None)
        n_wf += 1
    else:
        raise RuntimeError(f"UH OH {fps} didn't fall into any categories:")

    # Get species from the (case-sensitive) filename prefix. startswith() is used so prefixes
    # of any length work; the old two-character slice could never equal "EWB".
    if fn.startswith("AC") or "ACsb" in fn:
        species = "Anolis"
    # these ones definitely look like humans
    elif fn.startswith(("EWB", "ED")):
        species = "Human"
    # these ones Chris said are very likely humans
    elif fn.startswith(("AL", "NT", "EA", "CB", "JL", "LS", "KH", "LM", "JI")):
        species = "Human"
    elif fn.startswith(("CPB", "CVR")):
        species = "Human"
    # these ones I can't tell
    elif fn.startswith(("AA", "SR", "AZ")):
        species = "Unknown"
    else:
        print(f"Couldn't find species for {fps}!")
        species = "Unknown"
        unknownspecies.append(fps)

    dataframe['filepath'].append(fps)
    dataframe['species'].append(species)

    # track which file we're on
    n_current += 1
    print(f"Processed file {n_current}/{n_files}: {fps}")

print(f"FP: {len(dataframe['filepath'])}, SR: {len(dataframe['sr'])}, Spectrum: {len(dataframe['spectrum'])}, wf: {len(dataframe['wf'])}, freqs: {len(dataframe['freqs'])}, Species: {len(dataframe['species'])}")
print(f"Finished! Ignored {n_oral} mouth/oral files, {n_readme} README files, {n_tube} tube files, {n_earsoae} earsoae files, {n_supptone} suppression tone files")
print(f"We kept {n_suppgood} good suppression files and {n_wf} waveform files.")

print()

print(f"Here's all the ignored Ear + SOAE files:")
for f in earsoae:
    print(f)


print()
print()
print("Unknown (and unknown that unknown) species:")
for f in unknownspecies:
    print(f)


# turn this into a pandas dataframe
df = pd.DataFrame(dataframe)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'wf' column has different length lists)
df.to_parquet(f'{folder}.parquet', engine='pyarrow')

In [None]:
folder = "Pre-2014 Data"
    
# We'll build our dataframe by making a dictionary of lists and appending to them
dataframe = {
    'filepath': [],
    'freqs': [],
    'spectrum': [],
    'wf': [],
    'species': [],
    'sr': [],
}

# First navigate to our directory
directory_path = Path(main_path_str + folder)

# track which file we're on
n_files = sum(1 for _ in directory_path.rglob('*') if _.is_file())

# track how many of each filetype we have
n_current = 0
n_readme = 0
n_tube = 0
n_earsoae = 0
n_supptone = 0
n_suppgood = 0
n_wf = 0
unknownspecies = []
# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Make sure it's a file
    if fp.is_file() == False:  
        continue
    
    # Get various versions of the filepath/filename
    
    # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
    main_path = Path(main_path_str)
    fps = str(fp.relative_to(main_path))
    
    # Also get subfolder (if applicable)
    if len(fps.split("\\")) > 1:
        subfolder = fps.split("\\")[1]
    else:
        subfolder = "NA"
        
    # Get the filename itself (without its containing folders), extension, and uppercase version
    fn = fp.name
    ext = fp.suffix
    fnU = fn.upper()

    # Treat different files differently based on keywords in filename
    
    if "README" in fnU or ext not in ('.txt' '.mat'):
        # print(f"Skipping {fps} -- README or wrong extension")
        n_readme += 1
        continue
    
    elif "TUBE" in fnU:
        # print(f"Skipping {fps} -- Tube file")
        n_tube += 1
        continue
    
    elif "SUPP" in fnU and "SOAE" in fnU:
        if ext == '.txt':
            data = np.loadtxt(fp)
        else:
            raise ValueError(f"Supp file from {fps} isn't .txt!")
        try: 
            if data.shape[1] != 2:
                raise RuntimeError(f"Supp file from {fps} isn't two columns!")
        except:
            raise(f"Supp file from {fps} isn't 2D!")
        if has_supptone(data):
            # plot_supp(fp)
            print(f"Skipping {fps} -- true suppression tone!")
            n_supptone += 1
            if "NOSUPP" in fnU:
                raise RuntimeError(f"Our suppression tone detector is wrong! {fps} shouldn't have a suppression tone...")
            continue
        else:
            # pull out frequency axis and spectrum and add to dataframe dictionary
            freqs = data[:, 0]
            spectrum = data[:, 1]
            dataframe['freqs'].append(freqs)
            dataframe['spectrum'].append(spectrum)
            # add a samplerate of 0 and an empty value to the waveform to fill the space
            dataframe['sr'].append(0)
            dataframe['wf'].append(None)
            # record we got a good one
            n_suppgood += 1
    
    elif ("EAR" in fnU and "SOAE" in fnU) and ("WF" not in fnU and "WAVEFORM" not in fnU and "SUPP" not in fnU):
        print(f"Skipping {fps} -- Chris can't say if good or not")
        n_earsoae += 1
        continue
    
    elif "WF" in fnU or "WAVEFORM" in fnU:
    # we must have a waveform if we got here
        # Check if it's a .txt or .mat file
        try:
            if ext == '.mat':
                mat = scipy.io.loadmat(fp)
                if 'wf' in mat:
                    wf = np.squeeze(mat['wf'])
                else: 
                    print(f"Not sure how to process {fps}")
            if ext == '.txt':
                wf = np.loadtxt(fp)
            # Let's make sure this waveform is a 1D array
            if len(wf.shape) > 1:
                raise RuntimeError(f"Waveform from {fps} isn't 1D!")
            # add to the dataframe 
            dataframe['sr'].append(44100)
            dataframe['wf'].append(wf)
            # add empty values to the spectrum and freqs to fill the space
            dataframe['freqs'].append(None)
            dataframe['spectrum'].append(None)
            n_wf += 1
        except:
            f"Uh oh! Issue when loading {fps}"
    else:
        raise RuntimeError(f"UH OH {fps} didn't fall into any categories:")
        
            
            
    # Get species
    subfolder_species = subfolder.split(" ")[0]
    match subfolder_species:
        case 'Geckos' | 'Lizards':
            species = "Lizard"
        case 'Tigers':
            species = "Tiger"
        case _:
            species = subfolder_species

    dataframe['filepath'].append(fps)
    dataframe['species'].append(species)
    
    # track which file we're on
    n_current += 1
    print(f"Processed file {n_current}/{n_files}: {fps}")

print(f"FP: {len(dataframe['filepath'])}, SR: {len(dataframe['sr'])}, Spectrum: {len(dataframe['spectrum'])}, wf: {len(dataframe['wf'])}, freqs: {len(dataframe['freqs'])}, Species: {len(dataframe['species'])}"), 
print(f"Finished! Ignored {n_oral} mouth/oral files, {n_readme} README files, {n_tube} tube files, {n_earsoae} earsoae files, {n_supptone} suppression tone files")
print(f"We kept {n_suppgood} good suppression files and {n_wf} waveform files.")

print()

print(f"Here's all the ignored Ear + SOAE files:")
for f in earsoae:
    print(f)
    

print()
print()
print("Unknown species:")
for f in unknownspecies:
    print(f)

# turn this into a pandas dataframe
df = pd.DataFrame(dataframe)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'wf' column has different length lists)
df.to_parquet(f'{folder}.parquet', engine='pyarrow')

In [None]:
# Load every waveform under "Extra Owl" and write them, with species and sample rate,
# to "<folder>.parquet". The sample rate is determined by which subfolder a file lives in.
folder = "Extra Owl"

# We'll build our dataframe by making a dictionary of lists and appending to them
dataframe = {
    'filepath': [],
    'wf': [],
    'species': [],
    'sr': [],
}

# First navigate to our directory
main_path = Path(main_path_str)
directory_path = Path(main_path_str + folder)

# track which file we're on
n_files = sum(1 for _ in directory_path.rglob('*') if _.is_file())
n_current=0

# sample rate for each known subfolder (everything in here is an owl recording)
sr_by_subfolder = {
    'Oldenberg Data (2013) (44.1kHz)': 44100,
    'Pim owl files (48 kHz)': 48000,
}

# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Check if it's a file
    if not fp.is_file():
        continue

    # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
    rel_path = fp.relative_to(main_path)
    fps = str(rel_path)

    # Get the filename itself (without its containing folders)
    fn = fp.name

    n_current += 1
    print(f"Processing file {n_current}/{n_files}")

    # now we actually open the waveform here
    # Any file we cannot load is skipped. Previously a failure left `wf` holding the previous
    # file's data (or undefined), and that stale value was appended under this file's path.
    try:
        if fp.suffix == '.mat':
            mat = scipy.io.loadmat(fp)
            if 'wf' not in mat:
                print(f"Not sure how to process {fp}")
                continue
            wf = np.squeeze(mat['wf'])
        elif fp.suffix == '.txt':
            wf = np.loadtxt(fp)
        else:
            print(f"Not sure how to process {fp}")
            continue
    except Exception as err:
        print(f"Uh oh! Issue when loading {fp}: {err}")
        continue

    # Let's make sure this waveform is a 1D array (skip otherwise, as the Curated Data cell does)
    if wf.ndim > 1:
        print(f"Waveform from {fps} isn't 1D!")
        continue

    # first path component below the "Extra Owl" folder
    owl_subfolder = rel_path.parts[1] if len(rel_path.parts) > 1 else "NA"
    if owl_subfolder not in sr_by_subfolder:
        # unknown location: skip rather than reuse the previous file's sr / species
        print("UH OH WHERE ARE WE")
        continue
    sr = sr_by_subfolder[owl_subfolder]
    species = "Owl"

    # add everything to our df dict
    dataframe['filepath'].append(fps)
    dataframe['wf'].append(wf)
    dataframe['species'].append(species)
    dataframe['sr'].append(sr)

# turn this into a pandas dataframe
df = pd.DataFrame(dataframe)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'wf' column has different length lists)
df.to_parquet(f'{folder}.parquet', engine='pyarrow')

In [None]:
# Load every waveform under "Lots of Data" and write them, with species and sample rate,
# to "<folder>.parquet". The species is the 4th dot-separated field of the subfolder name.
folder = "Lots of Data"

# We'll build our dataframe by making a dictionary of lists and appending to them
dataframe = {
    'filepath': [],
    'wf': [],
    'species': [],
    'sr': [],
}

# First navigate to our directory
main_path = Path(main_path_str)
directory_path = Path(main_path_str + folder)

# track which file we're on
n_files = sum(1 for _ in directory_path.rglob('*') if _.is_file())
n_current=0

# subfolder species tag -> species label stored in the table
species_by_tag = {
    'tokay': "Tokay",
    'tegu': "Tegu",
    'human': "Human",
    'skink': "Skink",
    'owl': "Owl",
    'anolis': "Anolis",
    'ACsb42': "Anolis",
}

# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Check if it's a file
    if not fp.is_file():
        continue

    # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
    rel_path = fp.relative_to(main_path)
    fps = str(rel_path)

    # Also get subfolder (if applicable)
    subfolder = rel_path.parts[1] if len(rel_path.parts) > 1 else "NA"

    # Get the filename itself (without its containing folders)
    fn = fp.name

    n_current += 1
    print(f"Processing file {n_current}/{n_files}")

    # now we actually open the waveform here
    # Any file we cannot load is skipped. Previously a failure left `wf` holding the previous
    # file's data (or undefined), and that stale value was appended under this file's path.
    try:
        if fp.suffix == '.mat':
            mat = scipy.io.loadmat(fp)
            if 'wf' not in mat:
                print(f"Not sure how to process {fp}")
                continue
            wf = np.squeeze(mat['wf'])
        elif fp.suffix == '.txt':
            wf = np.loadtxt(fp)
        else:
            print(f"Not sure how to process {fp}")
            continue
    except Exception as err:
        print(f"Uh oh! Issue when loading {fp}: {err}")
        continue

    # Let's make sure this waveform is a 1D array (skip otherwise, as the Curated Data cell does)
    if wf.ndim > 1:
        print(f"Waveform from {fps} isn't 1D!")
        continue

    # Get species. Guard the index so an unexpected subfolder name is reported, not an IndexError.
    subfolder_fields = subfolder.split(".")
    subfolder_species = subfolder_fields[3] if len(subfolder_fields) > 3 else ""
    if subfolder_species in species_by_tag:
        species = species_by_tag[subfolder_species]
    else:
        print(f"Couldn't find the species of {fn}")
        # label explicitly instead of silently reusing the previous file's species
        species = "Unknown"

    # These all should have the standard sample rate
    sr = 44100

    # add everything to our df dict
    dataframe['filepath'].append(fps)
    dataframe['wf'].append(wf)
    dataframe['species'].append(species)
    dataframe['sr'].append(sr)

# turn this into a pandas dataframe
df = pd.DataFrame(dataframe)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'wf' column has different length lists)
df.to_parquet(f'{folder}.parquet', engine='pyarrow')

In [None]:
subfolder = "Curated Data"
    
# We'll build our dataframe by making a dictionary of lists and appending to them
dataframe = {
    'filepath': [],
    'wf': [],  
    'species': [],
    'sr': [],
}

# First navigate to our directory
directory_path = Path(main_path_str + subfolder)

# track which file we're on
n_files = sum(1 for _ in directory_path.rglob('*') if _.is_file())
i=0

# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Check if it's a file
    if fp.is_file():  
        # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
        main_path = Path(main_path_str)
        fps = str(fp.relative_to(main_path))
        
        # Get the filename itself (without its containing folders)
        
        fn = fp.name
        # print out which file we're on
        i += 1
        print(f"Processing file {i}/{n_files}")
        
        # now we actually open the waveform here
        # Check if it's a .txt or .mat file
        try:
            if fp.suffix == '.mat':
                mat = scipy.io.loadmat(fp)
                if 'wf' in mat:
                    wf = np.squeeze(mat['wf'])
                else: 
                    print(f"Not sure how to process {fp}")
            if fp.suffix == '.txt':
                wf = np.loadtxt(fp)
            # Let's make sure this waveform is a 1D array
            if len(wf.shape) > 1:
                print(f"Waveform from {fps} isn't 1D!")
                continue
        except:
            f"Uh oh! Issue when loading {fp}"
            
            
        # try and get the species name
        fn_species = fn.split("_")[0]
        
        match fn_species:
            case 'anole':
                species = "Anolis"
            case 'cricket':
                species = "Cricket"
            case 'human':
                species = "Human"
            case 'owl':
                species = "Owl"
            case _:
                species = ""
        
        # do some manual processing
        if len(fn.split("_")) > 1 and fn.split("_")[1][0:3] == "TAG":
            sr = 48000
            species = "Owl"
        else:
            sr = 44100
        
        if len(fps.split("/")) > 1 and fps.split("/")[1] == "Other":
            species = "Unknown"
            sr = 0
        
        match fn:
            case 'TT1learSOAEwf5.mat':
                species = "Tegu"
                sr = 44100
            case 'TT3li.mat':
                species = "Tegu"
                sr = 44100
            case 've10re01.mat':
                species = "Varanid"
            
                
        # add everything to our df dict
        dataframe['filepath'].append(fps)
        dataframe['wf'].append(wf)
        dataframe['species'].append(species)
        dataframe['sr'].append(sr)

# turn this into a pandas dataframe
df = pd.DataFrame(dataframe)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'wf' column has different length lists)
df.to_parquet(f'{subfolder}.parquet', engine='pyarrow')