In [None]:
import os 
import numpy as np 
import h5py
import scipy.io
import matlab_helpers as mh 
from sklearn.model_selection import train_test_split

## Read unique files and create training dataset 

In [None]:
from numpy import int8

# Input root: one sub-directory per call type, each holding '<name>_snr<level>.mat' files.
root_in_dir = '../Datasets/mps_gp_sgram/level65_dBspl_ds_int8/'
# Output root: receives the HDF5 datasets and the train/test list files.
root_out_dir = '../Datasets/mps_gp_sgram/PyData/'
if not os.path.exists(root_out_dir):
    os.makedirs(root_out_dir)

# Output artefacts, all placed directly under root_out_dir.
# NOTE(review): the test list is named 'dsMPS_test_file.txt' while the training
# list is 'dsMPS_train_list.txt' -- confirm the asymmetry is intended.
out_dsMPS_data_train_file = f"{root_out_dir}dsMPS_data_train_file.h5"
out_dsMPS_data_train_listfile = f"{root_out_dir}dsMPS_train_list.txt"
out_dsMPS_data_test_listfile = f"{root_out_dir}dsMPS_test_file.txt"

# Sub-directories of root_in_dir that are scanned for recordings.
valid_datadirs = [
    'Chut',
    'HighWhistle',
    'Rumble',
    'Tchatter',
    'Wheek',
    'Whine',
]
# Call types that get their own class label (their index in this list);
# every other call type shares the extra label len(calls2use).
calls2use = [
    'Chut',
    'Rumble',
    'Wheek',
    'Whine',
]

# Build the list of unique recording names (file name with the trailing
# '_snr<level>...' part removed) across every call-type directory.
uniq_files_all_calls = []
for cur_call_dir in valid_datadirs:
    cur_call_path = root_in_dir + cur_call_dir + '/'
    cur_dir_files = [f for f in os.listdir(cur_call_path) if os.path.isfile(os.path.join(cur_call_path, f))]
    # Skip files without an '_snr' marker: rfind() would return -1 for them and
    # the slice would silently drop the last character instead of a suffix.
    filenames = [item[:item.rfind('_snr')] for item in cur_dir_files if '_snr' in item]
    # sorted() rather than list(): iteration order of a set of strings changes
    # between interpreter sessions (hash randomization), which would make the
    # seeded train/test split below differ from one kernel restart to the next.
    unq_filenames = sorted(set(filenames))
    uniq_files_all_calls = uniq_files_all_calls + unq_filenames
    # Average number of SNR variants per recording; guard against empty directories.
    files_per_unique = len(filenames) / len(unq_filenames) if unq_filenames else 0.0
    print(f"total={len(filenames)} | unique={len(unq_filenames)} | math = {files_per_unique}")

print(f"{len(uniq_files_all_calls)} total unique files")

# Hold out 25% of the unique recordings for testing. The index array is split
# alongside the names so later code can address uniq_files_all_calls by position.
training_files, testing_files, training_inds, testing_inds = train_test_split(
    uniq_files_all_calls,
    np.arange(len(uniq_files_all_calls)),
    test_size=0.25,
    random_state=1,
)

# Write each partition to its list file, one recording name per line.
with open(out_dsMPS_data_train_listfile, 'w') as f:
    f.writelines(f"{line}\n" for line in training_files)

with open(out_dsMPS_data_test_listfile, 'w') as f:
    f.writelines(f"{line}\n" for line in testing_files)

# SNR conditions: -20 to 10 in steps of 5, followed by inf (the '_snrInf' files).
all_snrs = np.concatenate((np.arange(-20, 11, 5), [np.inf]))
# Fraction of the training recordings to use at each SNR: rises linearly from
# 1/len(all_snrs) at the lowest SNR up to 1.0 at the last entry (inf).
snr_training_weights = np.arange(1, all_snrs.size + 1) / all_snrs.size

# Accumulate training examples across all SNRs. At each SNR only a fraction
# (cur_weight) of the training recordings is loaded.
data_dsMPS_x = []
data_label_y = []
data_filename = []

for snr_value, cur_weight in zip(all_snrs, snr_training_weights):
    print(f"snr_value={snr_value} with weight = {cur_weight}")
    if cur_weight < 1:
        # Subsample the training indices themselves. Splitting
        # np.arange(len(training_inds)) instead would return positions 0..N-1,
        # which, when used to index uniq_files_all_calls, select the first N
        # recordings overall and can include held-out test recordings.
        cur_snr_train_inds, _ = train_test_split(training_inds, test_size=1 - cur_weight, random_state=1)
    else:
        cur_snr_train_inds = training_inds

    for ind in cur_snr_train_inds:
        cur_file = uniq_files_all_calls[ind]
        # The call type is the part of the name before the first underscore;
        # it is also the name of the sub-directory holding the file.
        cur_call = cur_file[:cur_file.find('_')]
        if np.isinf(snr_value):
            cur_filename = root_in_dir + cur_call + '/' + cur_file + '_snrInf.mat'  # because matlab uses Inf not inf
        else:
            # Negative SNRs are encoded in file names with a leading 'm' instead of '-'.
            cur_filename = root_in_dir + cur_call + '/' + cur_file + '_snr' + str(np.int_(snr_value)).replace('-', 'm') + '.mat'

        new_data = mh.loadmat(cur_filename)
        data_dsMPS_x.append(new_data["mps_struct"]["mps_pow_dB"])

        # Calls listed in calls2use get their list index as label; all other
        # calls share the extra label len(calls2use).
        if cur_call in calls2use:
            data_label_y.append(calls2use.index(cur_call))
        else:
            data_label_y.append(len(calls2use))

        data_filename.append(cur_filename)

        print(cur_filename)

data_dsMPS_x = np.array(data_dsMPS_x).astype(np.int8)
data_label_y = np.array(data_label_y).astype(int)

# Write the training set to HDF5 once; an existing file is never overwritten.
if not os.path.exists(out_dsMPS_data_train_file):
    print("Saving file" + out_dsMPS_data_train_file)
    # The context manager closes the file even if a dataset write raises.
    with h5py.File(out_dsMPS_data_train_file, "w") as hf:
        hf.create_dataset('data_dsMPS_x', data=data_dsMPS_x)
        hf.create_dataset('data_label_y', data=data_label_y)
else:
    print("File (" + out_dsMPS_data_train_file + ") already exists")

## Create test dataset (for different SNRs)

In [None]:
# Build one HDF5 test file per SNR, each containing every held-out recording.
for snr_value in all_snrs:
    # No sampling weight is reported here: weights only apply to the training
    # set, and cur_weight would be a stale leftover from the training loop.
    print(f"snr_value={snr_value}")
    data_dsMPS_x = []
    data_label_y = []

    # NOTE(review): output names keep the '-' sign for negative SNRs, whereas
    # the input .mat names use 'm' -- left unchanged in case readers depend on it.
    if np.isinf(snr_value):
        out_dsMPS_data_test_file = root_out_dir + 'dsMPS_data_test_snrInf.h5'
    else:
        out_dsMPS_data_test_file = root_out_dir + 'dsMPS_data_test_snr' + str(np.int_(snr_value)) + '.h5'
    print(out_dsMPS_data_test_file)
    if not os.path.exists(out_dsMPS_data_test_file):

        for ind in testing_inds:
            cur_file = uniq_files_all_calls[ind]
            # The call type is the part of the name before the first underscore;
            # it is also the name of the sub-directory holding the file.
            cur_call = cur_file[:cur_file.find('_')]
            if np.isinf(snr_value):
                cur_filename = root_in_dir + cur_call + '/' + cur_file + '_snrInf.mat'  # because matlab uses Inf not inf
            else:
                # Negative SNRs are encoded in input file names with 'm' instead of '-'.
                cur_filename = root_in_dir + cur_call + '/' + cur_file + '_snr' + str(np.int_(snr_value)).replace('-', 'm') + '.mat'

            new_data = mh.loadmat(cur_filename)
            data_dsMPS_x.append(new_data["mps_struct"]["mps_pow_dB"])

            # Calls listed in calls2use get their list index as label; all
            # other calls share the extra label len(calls2use).
            if cur_call in calls2use:
                data_label_y.append(calls2use.index(cur_call))
            else:
                data_label_y.append(len(calls2use))

        data_dsMPS_x = np.array(data_dsMPS_x).astype(np.int8)
        data_label_y = np.array(data_label_y).astype(int)

        print("Saving file" + out_dsMPS_data_test_file)
        # The context manager closes the file even if a dataset write raises.
        with h5py.File(out_dsMPS_data_test_file, "w") as hf:
            hf.create_dataset('data_dsMPS_x', data=data_dsMPS_x)
            hf.create_dataset('data_label_y', data=data_label_y)
    else:
        print("File (" + out_dsMPS_data_test_file + ") already exists")