# Code to create HDF5 datasets (at different SNRs) for vIHC-based 2D-Carrier/Modulation power data 
Notebook outline: 
1. Load required packages 
2. Define a function to create dataset for a single SNR 
3. Loop through SNRs to create different hdf5 files

## 1. Load required packages

In [1]:
import os 
import numpy as np 
import h5py
import scipy.io
import matlab_helpers as mh 
from keras.utils import to_categorical

## 2. Define a function to create dataset for a single SNR 
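Steps:
* Read the SNR value (`np.inf` selects the clean, noise-free condition)
* Define the matching input and output directories/files
* Read all data
* Save the HDF5 file (X and Y) and a text file listing the source filenames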

In [2]:

def save_snr_data(snr_value):
    """Create the HDF5 dataset of carrier/modulation power data for one SNR.

    Reads every file found in the per-call sub-directories of the input
    directory for ``snr_value``, writes the list of source paths to a text
    file, and stores the data in an HDF5 file with two datasets:

    * ``data_CarMod_x`` -- the stacked ``CarMod_power`` arrays, one per file.
    * ``data_label_y`` -- integer class labels of shape ``(n_files, 1)``:
      the index into ``calls2use`` for the calls in that list, and
      ``len(calls2use)`` for every other call type.

    Row ``i`` of both datasets corresponds to line ``i`` of the text file.
    Nothing is done (beyond a message) if the output HDF5 file already exists.

    Args:
        snr_value: SNR in dB (Python or NumPy number). An infinite value
            selects the clean (noise-free) data.

    Raises:
        FileNotFoundError: if the input directory (or one of its call
            sub-directories) is missing, or contains no data files.
    """
    root_in_dir = '../Datasets/CarModPow/'
    root_out_dir = '../Datasets/CarModPow/PyData/'
    os.makedirs(root_out_dir, exist_ok=True)

    if np.isinf(snr_value):
        root_matdata_dir = root_in_dir + 'level65_dBspl_clean/'
        out_allfiles_txt_fname = root_out_dir + 'CarMod_data_list_clean.txt'
        out_CarMod_data_file = root_out_dir + 'CarMod_data_clean.h5'
    else:
        # str(int(...)) works for plain Python numbers as well as NumPy scalars.
        snr_tag = str(int(snr_value))
        root_matdata_dir = root_in_dir + 'level65_dBspl_SNR' + snr_tag + '_white/'
        out_allfiles_txt_fname = root_out_dir + 'CarMod_data_list_SNR' + snr_tag + '.txt'
        out_CarMod_data_file = root_out_dir + 'CarMod_data_SNR' + snr_tag + '.h5'

    if os.path.exists(out_CarMod_data_file):
        print("File (" + out_CarMod_data_file + ") already exists")
        return

    # Fail early with a clear message instead of crashing later in os.listdir.
    if not os.path.isdir(root_matdata_dir):
        raise FileNotFoundError(f"Input directory not found: {root_matdata_dir}")
    print(f"Working on: {root_matdata_dir}")

    valid_datadirs = ['Chut', 'HighWhistle', 'Rumble', 'Tchatter', 'Wheek', 'Whine']
    calls2use = ['Chut', 'Rumble', 'Wheek', 'Whine']

    all_files_nameonly = []
    CarMod_data_list = []
    data_label_name = []
    for cur_call_dir in valid_datadirs:
        cur_call_path = root_matdata_dir + cur_call_dir + '/'
        # os.listdir returns entries in arbitrary order; sort so that the row
        # order of the dataset is reproducible across runs and machines.
        cur_dir_files = sorted(
            f for f in os.listdir(cur_call_path)
            if os.path.isfile(os.path.join(cur_call_path, f))
        )
        all_files_nameonly.extend(cur_dir_files)
        CarMod_data_list.extend(cur_call_path + f for f in cur_dir_files)
        # The call type is the directory the file lives in, so record it
        # directly rather than parsing it back out of the full path.
        data_label_name.extend([cur_call_dir] * len(cur_dir_files))

    if not CarMod_data_list:
        raise FileNotFoundError(f"No data files found under: {root_matdata_dir}")

    print(f"--> all_files_nameonly: len lines= {len(all_files_nameonly)}, first line = {all_files_nameonly[0]}")
    print(f"--> CarMod_data_list: len lines= {len(CarMod_data_list)}, first line = {CarMod_data_list[0]}")

    with open(out_allfiles_txt_fname, 'w') as f:
        for line in CarMod_data_list:
            f.write(f"{line}\n")

    print(f"len: CarMod_data_list = {len(CarMod_data_list)} || data_label_name = {len(data_label_name)}")
    print(f"first ex: CarMod_data_list = {CarMod_data_list[0]} || data_label_name = {data_label_name[0]}")

    # Calls outside calls2use all share one extra "other" class.
    other_label = len(calls2use)
    data_label_y = np.array(
        [[calls2use.index(name) if name in calls2use else other_label]
         for name in data_label_name]
    ).astype(int)
    print(data_label_y.shape)

    unq_vals, unq_counts = np.unique(data_label_y, return_counts=True)
    print(dict(zip(unq_vals, unq_counts)))

    n_files = len(CarMod_data_list)
    print(n_files)

    # Files are loaded and appended a few at a time, so only one small chunk
    # of data is held in memory at once.
    chunk_size = 5
    n_chunks = -(-n_files // chunk_size)  # ceiling division
    print(f"n_chunks={n_chunks}")

    try:
        with h5py.File(out_CarMod_data_file, "a") as hf:
            print("Saving file" + out_CarMod_data_file)

            for chunkVar, start in enumerate(range(0, n_files, chunk_size)):
                # The final chunk may hold fewer than chunk_size files.
                stop = min(start + chunk_size, n_files)
                chunk_files = CarMod_data_list[start:stop]

                cur_chunk_CarMod_x = np.array([
                    np.array(mh.loadmat(fName)["CarMod_power"]["CarMod_power"])
                    for fName in chunk_files
                ])
                cur_chunk_label_y = data_label_y[start:stop]
                cur_chunk_fName = np.array(chunk_files)
                print(f"chunkVar={chunkVar} data_CarMod_x.shape={cur_chunk_CarMod_x.shape} | data_label_y.shape={cur_chunk_label_y.shape}, cur_chunk_fName={cur_chunk_fName.shape}")

                if chunkVar == 0:
                    # Create resizable datasets; the per-sample shape is taken
                    # from the data instead of being hard-coded.
                    hf.create_dataset(
                        'data_CarMod_x', data=cur_chunk_CarMod_x,
                        compression="gzip", chunks=True,
                        maxshape=(None,) + cur_chunk_CarMod_x.shape[1:],
                    )
                    hf.create_dataset(
                        'data_label_y', data=cur_chunk_label_y,
                        compression="gzip", chunks=True, maxshape=(None, 1),
                    )
                else:
                    # Grow along axis 0 by exactly the number of new rows.
                    n_new = len(chunk_files)
                    for dset_name, new_rows in (
                        ('data_CarMod_x', cur_chunk_CarMod_x),
                        ('data_label_y', cur_chunk_label_y),
                    ):
                        dset = hf[dset_name]
                        dset.resize(dset.shape[0] + n_new, axis=0)
                        dset[-n_new:] = new_rows
    except BaseException:
        # The output did not exist when we started, so anything on disk now is
        # a partial file from this run. Remove it; otherwise the existence
        # check above would treat it as complete on the next call.
        if os.path.exists(out_CarMod_data_file):
            os.remove(out_CarMod_data_file)
        raise

# all_snrs= np.arange(-20.0, 11, 5)
# all_snrs= np.append(all_snrs, np.inf)
# for snr_val in all_snrs:
#     save_snr_data(snr_val)


Working on: D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_SNR-20_white/
--> all_files_nameonly: len lines= 1605, first line = CarMod_psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
--> CarMod_data_list: len lines= 1605, first line = D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_SNR-20_white/Chut/CarMod_psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
len: CarMod_data_list = 1605 || data_label_name = 1605
first ex: CarMod_data_list = D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_SNR-20_white/Chut/CarMod_psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat || data_label_name = Chut
(1605, 1)
{0: 432, 1: 253, 2: 372, 3: 457, 4: 91}
1605
n_chunks=321
Saving fileD:/Dropbox/Python/MLmodels/Datasets/CarModPow/PyData/CarMod_data_SNR-20.h5
chunkVar=0 data_CarMod_x.shape=(5, 67, 9) | data_label_y.shape=(5, 1), cur_chunk_fName=(5,)
chunkVar=1 data_CarMod_x.shape=(5, 67, 9) | data_label_y.shape=(5, 1), cur_chunk_fName=(5,)
chunkVar=2 data_CarMod_x

## 3. Loop through SNRs to create different hdf5 files

In [3]:
# Build one HDF5 file per condition: SNRs from -20 to 10 dB in 5 dB steps,
# plus np.inf for the clean (noise-free) data. Every SNR is included:
# save_snr_data skips outputs that already exist, so re-running is harmless
# and a fresh run does not silently miss the -20 dB file.
all_snrs = np.append(np.arange(-20.0, 11, 5), np.inf)
for snr_val in all_snrs:
    save_snr_data(snr_val)