# Notebook to create PSD (power spectral density) dataset in H5 format. 
Load required packages 

In [None]:
import os 
import numpy as np 
import h5py
import scipy.io
import matlab_helpers as mh 
import tensorflow as tf 

Read the test-list txt files (`inclass_test.txt` and `outclass_test.txt`) that were used to train/test the feature-based model, and collect the unique test file names.

In [None]:
# Gather the base names of every file that appears in a per-call test list.
# Each entry of `root_prevlist_dir` is expected to hold an `inclass_test.txt`
# and an `outclass_test.txt`, one path per line.
root_prevlist_dir = 'D:/Dropbox/Pitts_files/ShiTongShare/ANcode_GPlynx/GPlynx_list_cluster/'
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
cluster_datadir_name = '/ix/ssadagopan/sap245/GPfiles_MIFanalysis/cog_gp/vIHC/level65_dBspl_clean/'
all_call_dirnames = list(os.listdir(root_prevlist_dir))
print(all_call_dirnames)

files_in_test = []
for call_dirname in all_call_dirnames:
    # In-class list first, then out-class list, for every call type.
    for list_fname in ('/inclass_test.txt', '/outclass_test.txt'):
        with open(root_prevlist_dir + call_dirname + list_fname) as f_list:
            raw_lines = f_list.readlines()
        for raw_line in raw_lines:
            # Keep only what follows the last '/', then drop the newline.
            base_name = raw_line[1 + raw_line.rfind('/'):]
            files_in_test.append(base_name.replace('\n', ''))

# A file can be listed for several call types; keep each name once.
files_in_test = list(set(files_in_test))
print(f"len lines= {len(files_in_test)}, first five lines = \n {files_in_test[0:5]}")

Read the names of all mat-files, split them into train and test lists, and save both lists as txt files.

In [None]:
# List every PSD mat-file under the valid call directories, split the files
# into a test set (names that appear in `files_in_test`, prefixed with 'psd_')
# and a training set (everything else), and write both lists to txt files.
root_matdata_dir = 'D:/Dropbox/Python/MLmodels/Datasets/psd_gp_vIHC_mat/level65_dBspl_clean/'
valid_datadirs = ['Chut', 'HighWhistle', 'Rumble', 'Tchatter', 'Wheek', 'Whine']

all_files_nameonly = []
all_files_fullname = []

for cur_call_dir in valid_datadirs:
    cur_call_path = root_matdata_dir + cur_call_dir + '/'
    # Regular files only; sub-directories are skipped.
    cur_dir_files = [f for f in os.listdir(cur_call_path) if os.path.isfile(os.path.join(cur_call_path, f))]
    all_files_nameonly.extend(cur_dir_files)
    all_files_fullname.extend(cur_call_path + f for f in cur_dir_files)

files_in_test_prefixed = ['psd_' + item for item in files_in_test]

# The indices must refer to positions in `all_files_nameonly` /
# `all_files_fullname` (the two lists are parallel), because they are used
# below to index `all_files_fullname`. Enumerating `files_in_test_prefixed`
# instead would select unrelated files.
test_name_set = set(files_in_test_prefixed)
test_data_inds = [i for i, name in enumerate(all_files_nameonly) if name in test_name_set]
test_ind_set = set(test_data_inds)
train_data_inds = [i for i in range(len(all_files_fullname)) if i not in test_ind_set]

print(f"type={type(test_data_inds)},len={len(train_data_inds)} | first three elements = {train_data_inds[0:3]}")

train_data_list = [all_files_fullname[i] for i in train_data_inds]
test_data_list = [all_files_fullname[i] for i in test_data_inds]

print(f"--> all_files_fullname: len lines= {len(all_files_fullname)}, first line ={all_files_fullname[0]}")
print(f"--> all_files_nameonly: len lines= {len(all_files_nameonly)}, first line = {all_files_nameonly[0]}")
print(f"--> files_in_test_prefixed: len lines= {len(files_in_test_prefixed)}, first line = {files_in_test_prefixed[0]}")
print(f"--> train_data_list: len lines= {len(train_data_list)}, first line = {train_data_list[0]}")

# Persist both lists, one full path per line.
out_train_txt_fname = 'D:/Dropbox/Python/MLmodels/Datasets/train_data_list.txt'
with open(out_train_txt_fname, 'w') as f:
    f.writelines(f"{line}\n" for line in train_data_list)

out_test_txt_fname = 'D:/Dropbox/Python/MLmodels/Datasets/test_data_list.txt'
with open(out_test_txt_fname, 'w') as f:
    f.writelines(f"{line}\n" for line in test_data_list)



Read Mat files and then create .h5 files for training 

In [None]:
# Build the training set: load the PSD from every training mat-file, derive an
# integer class label from the directory name embedded in each path, and save
# both arrays to an HDF5 file (an existing file is left untouched).
out_trainfile = 'D:/Dropbox/Python/MLmodels/Datasets/train_data.h5'
data_psd_train_x = []
# The call name is the path segment between these two markers.
# NOTE(review): assumes every mat-file name starts with 'psd_' - confirm.
pre_search_str = 'clean/'
post_search_str = '/psd_'

for fName in train_data_list:
    data = mh.loadmat(fName)
    data_psd_train_x.append(data["psd_data"]["psd"])

data_label_train_name = [item[item.rfind(pre_search_str)+len(pre_search_str):item.rfind(post_search_str)] for item in train_data_list]
# Every label starts as len(all_call_dirnames), i.e. one past the last valid
# index; it keeps that value when the call name is not in all_call_dirnames.
data_label_train_y = len(all_call_dirnames)*np.ones((len(data_psd_train_x),1))

for ind, cur_call in enumerate(data_label_train_name):
    if cur_call in all_call_dirnames:
        data_label_train_y[ind,0] = all_call_dirnames.index(cur_call)

data_psd_train_x = np.array(data_psd_train_x)
data_label_train_y = np.array(data_label_train_y).astype(int)
print(f"data_psd_train_x={type(data_psd_train_x)}&{len(data_psd_train_x)},data_label_train_y={type(data_label_train_y)}&{data_label_train_y.shape},")

# Report the per-class counts once the labels are final (before the labels are
# assigned this would only show the placeholder value).
unq_vals, unq_counts = np.unique(data_label_train_y, return_counts=True)
print(dict(zip(unq_vals,unq_counts)))

# Do not open the file in "w" mode before this check: that would truncate an
# existing file and make the check always report that the file exists, so the
# datasets would never be written.
if not os.path.exists(out_trainfile):
    print("Saving file" + out_trainfile)
    # The context manager closes the file even if create_dataset raises.
    with h5py.File(out_trainfile, "w") as hf:
        hf.create_dataset('data_psd_train_x',data=data_psd_train_x)
        hf.create_dataset('data_label_train_y',data=data_label_train_y)
else:
    print("File (" + out_trainfile + ") already exists")




Read Mat files and then create .h5 files for testing  

In [None]:
# Build the test set: load the PSD from every test mat-file, derive an integer
# class label from the directory name embedded in each path, and save both
# arrays to an HDF5 file (an existing file is left untouched).
# Relies on `pre_search_str` / `post_search_str` from the training cell.
out_testfile = 'D:/Dropbox/Python/MLmodels/Datasets/test_data.h5'
data_psd_test_x = []

for fName in test_data_list:
    data = mh.loadmat(fName)
    data_psd_test_x.append(data["psd_data"]["psd"])

data_label_test_name = [item[item.rfind(pre_search_str)+len(pre_search_str):item.rfind(post_search_str)] for item in test_data_list]
# Every label starts as len(all_call_dirnames), i.e. one past the last valid
# index; it keeps that value when the call name is not in all_call_dirnames.
data_label_test_y = len(all_call_dirnames)*np.ones((len(data_psd_test_x),1))

for ind, cur_call in enumerate(data_label_test_name):
    if cur_call in all_call_dirnames:
        data_label_test_y[ind,0] = all_call_dirnames.index(cur_call)

data_psd_test_x = np.array(data_psd_test_x)
data_label_test_y = np.array(data_label_test_y).astype(int)
# The message names the test label array (it previously said "train").
print(f"data_psd_test_x={type(data_psd_test_x)}&{len(data_psd_test_x)},data_label_test_y={type(data_label_test_y)}&{data_label_test_y.shape},")

if not os.path.exists(out_testfile):
    print("Saving file" + out_testfile)
    # The context manager closes the file even if create_dataset raises.
    with h5py.File(out_testfile, "w") as hf:
        hf.create_dataset('data_psd_test_x',data=data_psd_test_x)
        hf.create_dataset('data_label_test_y',data=data_label_test_y)
else:
    print("File (" + out_testfile + ") already exists")