In [4]:
import os 
import numpy as np 
import h5py
import scipy.io
import matlab_helpers as mh 
from sklearn.model_selection import train_test_split

List the unique call files, split them into training and testing sets, and create the training dataset

In [5]:
from numpy import int8

# --- Input / output locations -------------------------------------------------
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
root_in_dir= 'D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/vIHC_40ms/'
root_out_dir= 'D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/'
clean_call_in_dir= root_in_dir + 'level65_dBspl_clean/'
# exist_ok=True replaces the separate exists() check (no check-then-create race).
os.makedirs(root_out_dir, exist_ok=True)

out_sgRNN_data_train_file= root_out_dir + 'sgRNN_data_train_file.npz'
out_sgRNN_data_train_listfile= root_out_dir + 'sgRNN_train_list.txt'
out_sgRNN_data_test_listfile= root_out_dir + 'sgRNN_test_file.txt'

# Sub-directories (one per call type) that are scanned for files.
valid_datadirs = ['Chut', 'HighWhistle', 'Rumble', 'Tchatter', 'Wheek', 'Whine']
# Call types that get their own class label; position in this list is the label.
calls2use = ['Chut', 'Rumble', 'Wheek', 'Whine']

# Parallel lists: full path and bare file name of every clean-condition file.
files_all_calls_fullpath = []
files_all_calls_nameonly = []
for cur_call_dir in valid_datadirs:
    cur_call_path = clean_call_in_dir + cur_call_dir + '/'
    # os.listdir() returns entries in arbitrary order. Sort them so that the
    # seeded train/test split applied to this list picks the same files on
    # every machine and filesystem.
    cur_dir_files = sorted(
        f for f in os.listdir(cur_call_path)
        if os.path.isfile(os.path.join(cur_call_path, f))
    )
    files_all_calls_fullpath.extend(cur_call_path + item for item in cur_dir_files)
    files_all_calls_nameonly.extend(cur_dir_files)

# Fail with a clear message instead of an IndexError in the print below.
if not files_all_calls_fullpath:
    raise FileNotFoundError(f"No input files found under {clean_call_in_dir}")

print(f"{len(files_all_calls_fullpath)} total unique files: ex = {files_all_calls_fullpath[0]}")

# Split the file list 75% / 25% with a fixed seed. The positions of the files
# are split alongside the paths so each side can later be looked up by index.
all_file_positions = np.arange(len(files_all_calls_fullpath))
training_files, testing_files, training_inds, testing_inds = train_test_split(
    files_all_calls_fullpath,
    all_file_positions,
    test_size=0.25,
    random_state=1,
)

# Record which files landed on each side of the split, one path per line.
for list_path, file_group in (
    (out_sgRNN_data_train_listfile, training_files),
    (out_sgRNN_data_test_listfile, testing_files),
):
    with open(list_path, 'w') as list_handle:
        list_handle.writelines(f"{file_path}\n" for file_path in file_group)

# SNR conditions: -20 ... 10 in steps of 5, plus inf for the clean condition.
all_snrs= np.arange(-20, 11, 5)
all_snrs= np.append(all_snrs, np.inf)
# Fraction of the training files used at each SNR: 1/N for the first (lowest)
# SNR, rising linearly to N/N = 1 (all training files) for the clean condition.
snr_training_weights= np.arange(1, len(all_snrs) + 1)/len(all_snrs)

# Accumulators shared by all SNR conditions (one entry per loaded sample).
temp_data_sgRNN_x = []
data_label_y = []
data_filename = []

# The call type is the path component between these two markers in the
# clean-condition path, e.g. ".../level65_dBspl_clean/Chut/sgRNN_Chut_...mat".
pre_search_str = '_clean/'
post_search_str = '/sgRNN_'
for snr_value,cur_weight in zip(all_snrs, snr_training_weights):
    print(f"snr_value={snr_value} with weight = {cur_weight}")
    if np.isinf(snr_value):
        cur_snr_in_dir= root_in_dir + 'level65_dBspl_clean/'
    else:
        cur_snr_in_dir= root_in_dir + 'level65_dBspl_SNR' + str(int(snr_value)) + '_white/'

    if cur_weight<1:
        # Subsample the training file indices themselves. Subsampling
        # np.arange(len(training_inds)) would instead select positions
        # 0..N_train-1 of the full file list, which pulls held-out test files
        # into training and never uses files stored past position N_train.
        cur_snr_train_inds, _ = train_test_split(training_inds, test_size= 1-cur_weight, random_state=1)
    else:
        cur_snr_train_inds = training_inds

    for ind in cur_snr_train_inds:
        cur_file = files_all_calls_fullpath[ind]
        cur_call = cur_file[cur_file.rfind(pre_search_str)+len(pre_search_str):cur_file.rfind(post_search_str)]
        # Same call type and file name, but read from the current SNR directory.
        cur_filename= cur_snr_in_dir + cur_call + '/' + files_all_calls_nameonly[ind]

        # NOTE(review): mh.loadmat is a project helper; presumably it returns
        # nested dicts mirroring the .mat struct -- confirm.
        new_data = mh.loadmat(cur_filename)
        temp_data_sgRNN_x.append(new_data["sgRNN_data"]["pow_dB"])

        # Calls outside calls2use all share one extra label, len(calls2use).
        if cur_call in calls2use:
            data_label_y.append(calls2use.index(cur_call))
        else:
            data_label_y.append(len(calls2use))

        data_filename.append(cur_filename)

# Shape of every loaded training sample.
num_seg = [item.shape for item in temp_data_sgRNN_x]

# NOTE(review): an existing file is never overwritten, so a stale file from an
# earlier run (e.g. with a different split) is kept silently; delete it to rebuild.
if (not os.path.exists(out_sgRNN_data_train_file)):
    print(f"Saving file {out_sgRNN_data_train_file}")
    # Fill the object array one element at a time. A bulk slice assignment
    # (arr[:] = list_of_arrays) makes NumPy stack the list first, which raises
    # a broadcast error whenever the samples are not ragged (e.g. all the
    # same shape, or only one sample).
    data_sgRNN_x = np.empty(len(temp_data_sgRNN_x), object)
    for sample_idx, sample in enumerate(temp_data_sgRNN_x):
        data_sgRNN_x[sample_idx] = sample
    np.savez(out_sgRNN_data_train_file, data_sgRNN_x=data_sgRNN_x, data_label_y=data_label_y, data_filename=data_filename)
else:
    print("File (" + out_sgRNN_data_train_file + ") already exists")

1604 total unique files: ex = D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/vIHC_40ms/level65_dBspl_clean/Chut/sgRNN_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
snr_value=-20.0 with weight = 0.125
snr_value=-15.0 with weight = 0.25
snr_value=-10.0 with weight = 0.375
snr_value=-5.0 with weight = 0.5
snr_value=0.0 with weight = 0.625
snr_value=5.0 with weight = 0.75
snr_value=10.0 with weight = 0.875
snr_value=inf with weight = 1.0
Saving fileD:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_train_file.npz


Create the test datasets (one file per SNR condition)

In [6]:
# Build one test .npz file per SNR condition from the held-out test indices.
# A condition whose output file already exists is skipped, not rebuilt.
for snr_value in all_snrs:
    # Derive the input directory and output file from the same SNR tag.
    if np.isinf(snr_value):
        cur_snr_in_dir= root_in_dir + 'level65_dBspl_clean/'
        out_sgRNN_data_test_file= root_out_dir + 'sgRNN_data_test_snrInf.npz'
    else:
        snr_tag = str(int(snr_value))
        cur_snr_in_dir= root_in_dir + 'level65_dBspl_SNR' + snr_tag + '_white/'
        out_sgRNN_data_test_file= root_out_dir + 'sgRNN_data_test_snr' + snr_tag + '.npz'

    # Per-SNR accumulators (reset for every condition).
    temp_data_sgRNN_x = []
    data_label_y = []
    data_filename = []

    if os.path.exists(out_sgRNN_data_test_file):
        print("File (" + out_sgRNN_data_test_file + ") already exists")
        continue

    for ind in testing_inds:
        cur_file = files_all_calls_fullpath[ind]
        cur_call = cur_file[cur_file.rfind(pre_search_str)+len(pre_search_str):cur_file.rfind(post_search_str)]
        # Same call type and file name, but read from the current SNR directory.
        cur_filename= cur_snr_in_dir + cur_call + '/' + files_all_calls_nameonly[ind]

        # NOTE(review): mh.loadmat is a project helper; presumably it returns
        # nested dicts mirroring the .mat struct -- confirm.
        new_data = mh.loadmat(cur_filename)
        temp_data_sgRNN_x.append(new_data["sgRNN_data"]["pow_dB"])

        # Calls outside calls2use all share one extra label, len(calls2use).
        if cur_call in calls2use:
            data_label_y.append(calls2use.index(cur_call))
        else:
            data_label_y.append(len(calls2use))

        data_filename.append(cur_filename)

    # Fill the object array one element at a time. A bulk slice assignment
    # (arr[:] = list_of_arrays) makes NumPy stack the list first, which raises
    # a broadcast error whenever the samples are not ragged.
    data_sgRNN_x = np.empty(len(temp_data_sgRNN_x), object)
    for sample_idx, sample in enumerate(temp_data_sgRNN_x):
        data_sgRNN_x[sample_idx] = sample
    print(f"Saving file {out_sgRNN_data_test_file}, len={len(data_sgRNN_x)}")
    np.savez(out_sgRNN_data_test_file, data_sgRNN_x=data_sgRNN_x, data_label_y=data_label_y, data_filename=data_filename)

Saving file D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_test_snr-20.npz, len=401
Saving file D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_test_snr-15.npz, len=401
Saving file D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_test_snr-10.npz, len=401
Saving file D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_test_snr-5.npz, len=401
Saving file D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_test_snr0.npz, len=401
Saving file D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_test_snr5.npz, len=401
Saving file D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_test_snr10.npz, len=401
Saving file D:/Dropbox/Python/MLmodels/Datasets/sgRNN_vIHC/PyData/sgRNN_data_test_snrInf.npz, len=401
