In [3]:
import numpy as np
from plots import *
import scipy.io
import pandas as pd
from pathlib import Path

def plot_mags(path, t_win=1, sf=False, sr=44100):
    """Load a waveform from a text file and plot its magnitude spectrum in dB.

    Parameters
    ----------
    path : str or pathlib.Path
        Text file readable by ``np.loadtxt`` holding the waveform samples.
    t_win : float, optional
        Window length forwarded to ``get_mags``
        (presumably in seconds -- confirm against ``plots.get_mags``).
    sf : bool, optional
        When True, also save the figure as ``<file stem>.png`` in the
        current working directory.
    sr : int, optional
        Sample rate forwarded to ``get_mags``. Defaults to 44100, the value
        that used to be hard-coded here.
    """
    # Path.name / Path.stem work with either separator and keep dotted file
    # names intact (the old split(".")[0] cut "a.b.txt" down to "a").
    path = Path(path)
    wf = np.loadtxt(path)
    m = get_mags(wf, sr=sr, t_win=t_win, dict=True)
    mags = m['mags']
    freq_ax = m['freq_ax']
    # 10*log10 converts the magnitudes returned by get_mags to decibels
    plt.plot(freq_ax, np.log10(mags)*10)
    plt.title(path.name)
    if sf:
        plt.savefig(path.stem + ".png")
    plt.show()
def plot_supp(path, sf=False):
    """Plot a two-column text file: column 0 on the x axis, column 1 on the y axis.

    Parameters
    ----------
    path : str or pathlib.Path
        Text file readable by ``np.loadtxt`` with at least two columns. The
        first column is treated as frequencies and the second as magnitudes.
    sf : bool, optional
        When True, also save the figure as ``<file stem>.png`` in the
        current working directory.

    Raises
    ------
    IndexError
        If the file loads as a 1-D array (e.g. a raw waveform); callers in
        this notebook rely on that to fall back to ``plot_mags``.
    """
    # Path.name / Path.stem work with either separator and keep dotted file
    # names intact (the old split(".")[0] cut "a.b.txt" down to "a").
    path = Path(path)
    data = np.loadtxt(path)
    freqs = data[:, 0]
    mags = data[:, 1]
    plt.plot(freqs, mags)
    plt.title(path.name)
    if sf:
        plt.savefig(path.stem + ".png")
    plt.show()

def has_supptone(mags):
    """Placeholder suppression-tone detector: reports True for every input.

    The planned logic is to find a cutoff frequency and check whether anything
    above it exceeds 20 dB. None of that is implemented yet, so callers
    currently treat every file as containing a suppression tone.
    """
    # TODO: find cutoff freq, check if anything above this has > 20 dB
    _first_entry = mags[0]  # read but not used yet; still raises on empty input
    return True
    
    
# Root directory of the SOAE data on this computer (Windows path, trailing separator included).
# Every backslash is escaped explicitly: "\O" is not a valid escape sequence and
# triggers a SyntaxWarning on recent Python versions. The resulting string is unchanged.
main_path_str = "C:\\Users\\Owner\\OneDrive\\Desktop\\SOAE Data\\"
# we'll process each subfolder separately since each is likely to have its own quirks

In [None]:
# Quick visual browse of one dated session folder.
# Pick one (path1, path2) pair; the alternatives stay commented out for fast switching.
# path1 = "C:\\Users\\Owner\\OneDrive\\Desktop\\SOAE Data\\Pre-2014 Data\\Geckos et al MIT\\"
# path2 = "01.26.05\\"
# path1 = r"C:\Users\Owner\OneDrive\Desktop\SOAE Data\Pre-2014 Data\Human (UofA S&A via Wiggio)"
# path2 = "\\07.02.09"
path1 = r"C:\Users\Owner\OneDrive\Desktop\SOAE Data\Pre-2014 Data\Lizards CUMC2011"
path2 = r"\05.16.11"
# path1 = r"C:\Users\Owner\OneDrive\Desktop\SOAE Data\York Data"
# path2 = r"\04.12.17"
sf = False
# NOTE: path2 starts with a separator, so the two pieces are joined as plain
# strings; Path(path1) / path2 would discard path1 for a rooted right-hand side.
for fp in Path(path1+path2).rglob('*'):
    # rglob('*') also yields directories; only regular files can be plotted
    if not fp.is_file():
        continue
    fn = fp.name
    ext = fp.suffix.lower()
    # The comma matters: ('.rtf' '.pdf') is the single string '.rtf.pdf', which
    # turned this membership test into a substring test.
    if "README" in fn or ext in ('.rtf', '.pdf'):
        print(f"Skipping {fn}")
        continue
    try:
        # Two-column (freq, mag) files plot directly ...
        plot_supp(fp, sf=sf)
    except Exception:
        # ... anything else is assumed to be a raw waveform. `except Exception`
        # keeps the fallback while letting Ctrl-C interrupt the loop.
        print("Waveform detecting, calculating mags")
        plot_mags(fp, t_win=1, sf=sf)
        

In [None]:
folder = "Pre-2014 Data"
    
# We'll build our dataframe by making a dictionary of lists and appending to them
data = {
    'filepath': [],
    'data': [],  
    'species': [],
    'sr': [],
}

# First navigate to our directory
directory_path = Path(main_path_str + folder)

# track which file we're on
n_files = sum(1 for _ in directory_path.rglob('*') if _.is_file())
n_current=0
n_readme = 0
n_tube = 0
n_earsoae = 0
n_supptone = 0
n_suppgood = 0
n_wf = 0
# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Make sure it's a file
    if fp.is_file() == False:  
        continue
    
    # track which file we're on
    n_current += 1
    print(f"Processing file {n_current}/{n_files}")
    # Get various versions of the filepath/filename
    # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
    main_path = Path(main_path_str)
    fps = str(fp.relative_to(main_path))
    # Also get subfolder (if applicable)
    if len(fps.split("\\")) > 1:
        subfolder = fps.split("\\")[1]
    else:
        subfolder = "NA"
    # Get the filename itself (without its containing folders), extension, and uppercase version
    fn = fp.name
    ext = fp.suffix
    fnU = fn.upper()
    
    if "README" in fnU or ext not in ('.txt' '.mat'):
        # print(f"Skipping {fps} -- README or wrong extension")
        n_readme += 1
        continue
    
    elif "TUBE" in fnU:
        # print(f"Skipping {fps} -- Tube file")
        n_tube += 1
        continue
    
    elif "SUPP" in fnU:
        if ext == '.txt':
            mags = np.loadtxt(fp)
        try: 
            if mags.shape[1] != 2:
                raise RuntimeError(f"Supp file from {fps} isn't two columns!")
        except:
            raise(f"Supp file from {fps} isn't 2D!")
        if has_supptone(mags):
            print(f"Skipping {fps} -- true suppression tone!")
            n_supptone += 1
            if "NOSUPP" in fnU:
                raise RuntimeError(f"Our suppression tone detector is wrong! {fps} shouldn't have a suppression tone...")
        else:
            # add a samplerate of 0 to indicate this has been pre-fft'd
            data['sr'].append(0)
            data['data'].append(mags)
            n_suppgood += 1
    
    elif ("EAR" in fnU and "SOAE" in fnU) and ("WF" not in fnU and "WAVEFORM" not in fnU and "SUPP" not in fnU):
        print(f"Skipping {fps} -- Chris can't say if good or not")
        n_readme += 1
    
    elif "WF" in fnU or "WAVEFORM" in fnU:
    # we must have a waveform if we got here
        # Check if it's a .txt or .mat file
        try:
            if ext == '.mat':
                mat = scipy.io.loadmat(fp)
                if 'wf' in mat:
                    wf = np.squeeze(mat['wf'])
                else: 
                    print(f"Not sure how to process {fps}")
            if ext == '.txt':
                wf = np.loadtxt(fp)
            # Let's make sure this waveform is a 1D array
            if len(wf.shape) > 1:
                raise RuntimeError(f"Waveform from {fps} isn't 1D!")
            
            # add to the dataframe 
            data['sr'].append(44100)
            data['data'].append(wf)
            n_wf += 1
        except:
            f"Uh oh! Issue when loading {fps}"
    else:
        raise RuntimeError(f"UH OH {fps} didn't fall into any categories:")
        
            
            

            
            
    # Get species
    subfolder_species = subfolder.split(" ")[0]
    match subfolder_species:
        case 'Geckos' | 'Lizards':
            species = 'Lizard'
        case 'Tigers':
            species = 'Tiger'
        case _:
            species = subfolder_species

    data['filepath'].append(fps)
    data['species'].append(species)

# turn this into a pandas dataframe
df = pd.DataFrame(data)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'wf' column has different length lists)
df.to_parquet(f'{folder}.parquet', engine='pyarrow')

Processing file 1/4247
Processing file 2/4247
Processing file 3/4247
Processing file 4/4247
Skipping Pre-2014 Data\Ferrets (Nottingham 2012)\06.25.12\MP1learSOAEsupp1.txt -- true suppression tone!
Processing file 5/4247
Skipping Pre-2014 Data\Ferrets (Nottingham 2012)\06.25.12\MP1learSOAEsupp2.txt -- true suppression tone!
Processing file 6/4247
Skipping Pre-2014 Data\Ferrets (Nottingham 2012)\06.25.12test\CBrearSOAEsupp1.txt -- true suppression tone!
Processing file 7/4247
Skipping Pre-2014 Data\Ferrets (Nottingham 2012)\06.25.12testB\AOrearSOAEsupp1.txt -- true suppression tone!
Processing file 8/4247
Skipping Pre-2014 Data\Ferrets (Nottingham 2012)\06.26.12\MP2learSOAEsupp1.txt -- true suppression tone!
Processing file 9/4247
Skipping Pre-2014 Data\Ferrets (Nottingham 2012)\06.26.12\MP2learSOAEsupp2.txt -- true suppression tone!
Processing file 10/4247
Skipping Pre-2014 Data\Ferrets (Nottingham 2012)\06.26.12\MP3learSOAEsupp1.txt -- true suppression tone!
Processing file 11/4247
Ski

RuntimeError: UH OH Pre-2014 Data\Geckos et al MIT\01.26.05\CrearBT2soae1.txt didn't fall into any categories:

Processing file 1/26


KeyError: 'data'

In [None]:
folder = "Extra Owl"

# We'll build our dataframe by making a dictionary of lists and appending to them.
# The waveform column is keyed 'data' to match the append below
# (the old 'wf' key made data['data'].append raise KeyError).
data = {
    'filepath': [],
    'data': [],
    'species': [],
    'sr': [],
}

# First navigate to our directory
main_path = Path(main_path_str)
directory_path = Path(main_path_str + folder)

# Sample rate for each known subfolder; every recording in these folders is labelled "Owl"
subfolder_sr = {
    'Oldenberg Data (2013) (44.1kHz)': 44100,
    'Pim owl files (48 kHz)': 48000,
}

# track which file we're on
n_files = sum(1 for p in directory_path.rglob('*') if p.is_file())
n_current = 0

# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Check if it's a file
    if not fp.is_file():
        continue

    n_current += 1
    print(f"Processing file {n_current}/{n_files}")
    # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
    rel = fp.relative_to(main_path)
    fps = str(rel)
    ext = fp.suffix.lower()

    # Work out sample rate and species from the subfolder. Files that sit directly
    # in `folder` or in an unknown subfolder are skipped, so no stale or undefined
    # `sr`/`species` can end up in the table.
    subfolder = rel.parts[1] if len(rel.parts) > 1 else None
    if subfolder not in subfolder_sr:
        print("UH OH WHERE ARE WE")
        print(f"Skipping {fps}")
        continue
    sr = subfolder_sr[subfolder]
    species = "Owl"

    # now we actually open the waveform here
    # Check if it's a .txt or .mat file
    try:
        if ext == '.mat':
            mat = scipy.io.loadmat(fp)
            if 'wf' not in mat:
                print(f"Not sure how to process {fp}")
                continue
            wf = np.squeeze(mat['wf'])
        elif ext == '.txt':
            wf = np.loadtxt(fp)
        else:
            print(f"Not sure how to process {fp}")
            continue
    except Exception as err:
        # Report the failure (the old bare f-string was never printed) and skip the
        # file instead of appending the previous iteration's waveform.
        print(f"Uh oh! Issue when loading {fp}: {err}")
        continue

    # Let's make sure this waveform is a 1D array
    if wf.ndim > 1:
        print(f"Waveform from {fps} isn't 1D!")
        continue

    # add everything to our df dict
    data['filepath'].append(fps)
    data['data'].append(wf)
    data['species'].append(species)
    data['sr'].append(sr)

# turn this into a pandas dataframe
df = pd.DataFrame(data)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'data' column has different length lists)
df.to_parquet(f'{folder}.parquet', engine='pyarrow')

In [None]:
folder = "Lots of Data"

# We'll build our dataframe by making a dictionary of lists and appending to them.
# The waveform column is keyed 'data' to match the append below
# (the old 'wf' key made data['data'].append raise KeyError).
data = {
    'filepath': [],
    'data': [],
    'species': [],
    'sr': [],
}

# First navigate to our directory
main_path = Path(main_path_str)
directory_path = Path(main_path_str + folder)

# Species label for each tag found in the 4th dot-separated field of the subfolder name
species_by_tag = {
    'tokay': "Tokay",
    'tegu': "Tegu",
    'human': "Human",
    'skink': "Skink",
    'owl': "Owl",
    'anolis': "Anolis",
    'ACsb42': "Anolis",
}

# track which file we're on
n_files = sum(1 for p in directory_path.rglob('*') if p.is_file())
n_current = 0

# now loop through all files in that collection
for fp in directory_path.rglob('*'):
    # Check if it's a file
    if not fp.is_file():
        continue

    n_current += 1
    print(f"Processing file {n_current}/{n_files}")
    # Cut off the beginning of the filepath since it's unnecessary for our dataframe (fps = file path shortened)
    rel = fp.relative_to(main_path)
    fps = str(rel)

    # Also get subfolder (if applicable); parts[0] is `folder` itself
    subfolder = rel.parts[1] if len(rel.parts) > 1 else "NA"

    # Get the filename itself (without its containing folders)
    fn = fp.name
    ext = fp.suffix.lower()

    # Get species. Names with fewer than four dot-separated fields (including
    # the "NA" placeholder) used to raise IndexError; unknown tags only printed
    # a message and then reused the previous file's species. Both are now skipped.
    tag_fields = subfolder.split(".")
    species = species_by_tag.get(tag_fields[3]) if len(tag_fields) > 3 else None
    if species is None:
        print(f"Couldn't find the species of {fn}")
        continue

    # now we actually open the waveform here
    # Check if it's a .txt or .mat file
    try:
        if ext == '.mat':
            mat = scipy.io.loadmat(fp)
            if 'wf' not in mat:
                print(f"Not sure how to process {fp}")
                continue
            wf = np.squeeze(mat['wf'])
        elif ext == '.txt':
            wf = np.loadtxt(fp)
        else:
            print(f"Not sure how to process {fp}")
            continue
    except Exception as err:
        # Report the failure (the old bare f-string was never printed) and skip the
        # file instead of appending the previous iteration's waveform.
        print(f"Uh oh! Issue when loading {fp}: {err}")
        continue

    # Let's make sure this waveform is a 1D array
    if wf.ndim > 1:
        print(f"Waveform from {fps} isn't 1D!")
        continue

    # These all should have the standard sample rate
    sr = 44100

    # add everything to our df dict
    data['filepath'].append(fps)
    data['data'].append(wf)
    data['species'].append(species)
    data['sr'].append(sr)

# turn this into a pandas dataframe
df = pd.DataFrame(data)
# save this as a parquet file for efficient dataframe storage (use pyarrow since the 'data' column has different length lists)
df.to_parquet(f'{folder}.parquet', engine='pyarrow')