# Code to create HDF5 datasets (at different SNRs) for vIHC-based PSD data 
Notebook outline: 
1. Load required packages 
2. Define a function to create dataset for a single SNR 
3. Loop through SNRs to create different hdf5 files

## 1. Load required packages

In [10]:
import os 
import numpy as np 
import h5py
import scipy.io
import matlab_helpers as mh 

## 2. Define a function to create dataset for a single SNR 
Steps are 
* Read SNR 
* Define the right input and output directories/files 
* Read all data 
* Save HDF5 (X, Y, and filenames)

In [13]:
def save_snr_data(snr_value, forceReDo):
    root_in_dir= '../Datasets/psd_gp_vIHC_mat/'
    root_out_dir= '../Datasets/psd_gp_vIHC_mat/PyData/'
    if not os.path.exists(root_out_dir):
        os.makedirs(root_out_dir)
    
    if np.isinf(snr_value):
        root_matdata_dir = root_in_dir + 'level65_dBspl_clean/'
        out_allfiles_txt_fname = root_out_dir + 'PSD_data_list_clean.txt'
        out_psd_data_file = root_out_dir + 'PSD_data_clean.h5'
        pre_search_str = '_clean/'
    else:
        root_matdata_dir = root_in_dir + 'level65_dBspl_SNR' + snr_value.astype('int').astype('str') + '_white/'
        out_allfiles_txt_fname = root_out_dir + 'PSD_data_list_SNR' + snr_value.astype('int').astype('str') + '.txt'
        out_psd_data_file = root_out_dir + 'PSD_data_SNR' + snr_value.astype('int').astype('str') + '.h5'
        pre_search_str = '_white/'

    post_search_str = '/psd_'

    if (not os.path.exists(out_psd_data_file)) or forceReDo:
        if os.path.exists(root_matdata_dir):
            print(f"Working on: {root_matdata_dir}")
        else:
            print(f"Nooooooooo: {root_matdata_dir}")

        valid_datadirs = ['Chut', 'HighWhistle', 'Rumble', 'Tchatter', 'Wheek', 'Whine']
        calls2use = ['Chut', 'Rumble', 'Wheek', 'Whine']

        all_files_nameonly= []
        psd_data_list= []

        for cur_call_dir in valid_datadirs:
            cur_call_path = root_matdata_dir + cur_call_dir + '/' 
            # print(cur_call_path)
            cur_dir_files = [f for f in os.listdir(cur_call_path) if os.path.isfile(os.path.join(cur_call_path, f))]
            all_files_nameonly = all_files_nameonly + cur_dir_files
            psd_data_list = psd_data_list + [cur_call_path + f for f in cur_dir_files]

        print(f"--> all_files_nameonly: len lines= {len(all_files_nameonly)}, first line = {all_files_nameonly[0]}")
        print(f"--> psd_data_list: len lines= {len(psd_data_list)}, first line = {psd_data_list[0]}")

        with open(out_allfiles_txt_fname, 'w') as f:
            for line in psd_data_list:
                f.write(f"{line}\n")

        data_psd_x = []
        for fName in psd_data_list:
            data = mh.loadmat(fName)
            data_psd_x.append(data["psd_data"]["psd"]) 
            
        data_label_name = [item[item.rfind(pre_search_str)+len(pre_search_str):item.rfind(post_search_str)] for item in psd_data_list]
        data_label_y = len(calls2use)*np.ones((len(data_psd_x),1))
        unq_vals, unq_counts = np.unique(data_label_y, return_counts=True)
        print(dict(zip(unq_vals,unq_counts)))

        for ind, cur_call in zip(np.arange(len(data_label_name)),data_label_name):
            if cur_call in calls2use: 
                data_label_y[ind,0] = calls2use.index(cur_call)

        data_psd_x = np.array(data_psd_x)
        data_label_y = np.array(data_label_y).astype(int)
        psd_data_list = np.array(psd_data_list)
        print(f"data_psd_x={type(data_psd_x)}&{len(data_psd_x)},data_label_y={type(data_label_y)}&{data_label_y.shape},\
            psd_data_list={type(psd_data_list)}&{psd_data_list.shape}")

        hf = h5py.File(out_psd_data_file, "w")
        try:
            hf.create_dataset('data_psd_x', data=data_psd_x)
            hf.create_dataset('data_label_y', data=data_label_y)
            # hf.create_dataset('data_filename', data=psd_data_list)
            hf.close()
            print("Saved file" + out_psd_data_file)

        except:
            print(f"Error trying save {out_psd_data_file}")
            hf.close()

    else: 
        print("File (" + out_psd_data_file + ") already exists")

## 3. Loop through SNRs to create different hdf5 files

In [12]:
all_snrs= np.arange(-20.0, 11, 5)
all_snrs= np.append(all_snrs, np.inf)
forceReDo = True
for snr_val in all_snrs:
    save_snr_data(snr_val,forceReDo)

Working on: ../Datasets/psd_gp_vIHC_mat/level65_dBspl_SNR-20_white/
--> all_files_nameonly: len lines= 1605, first line = psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
--> psd_data_list: len lines= 1605, first line = ../Datasets/psd_gp_vIHC_mat/level65_dBspl_SNR-20_white/Chut/psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
{4.0: 1605}
data_psd_x=<class 'numpy.ndarray'>&1605,data_label_y=<class 'numpy.ndarray'>&(1605, 1),            psd_data_list=<class 'numpy.ndarray'>&(1605,)
Saving file../Datasets/psd_gp_vIHC_mat/PyData/PSD_data_SNR-20.h5
Working on: ../Datasets/psd_gp_vIHC_mat/level65_dBspl_SNR-15_white/
--> all_files_nameonly: len lines= 1605, first line = psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
--> psd_data_list: len lines= 1605, first line = ../Datasets/psd_gp_vIHC_mat/level65_dBspl_SNR-15_white/Chut/psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
{4.0: 1605}
data_psd_x=<class 'numpy.ndarray'>&1605,data_label_y=<class 'numpy.ndarray'>&(1605, 1),     