# Code for training an MLP to classify calls using [Carrier X Modultion] power 
Steps to follow 
* Load required packages 
* Read all mat-files 
* Add labels (call names)
* Split into train/test datasets
* Create batches for training 
* Define model 
* Train model 
* Test model and report accuracy 

In [57]:
import os 
import numpy as np 
import h5py
import scipy.io
import matlab_helpers as mh 
from keras.utils import to_categorical, Sequence
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Read the names of all mat-files 

In [58]:
root_matdata_dir = 'D:/Dropbox/Python/MLmodels/Datasets/CarModPow/'
out_ml_dir = root_matdata_dir + 'ml_data/'
if not os.path.exists(out_ml_dir):
    os.makedirs(out_ml_dir)

clean_in_data_dir = root_matdata_dir + 'level65_dBspl_clean/'
valid_datadirs = ['Chut', 'HighWhistle', 'Rumble', 'Tchatter', 'Wheek', 'Whine']
calls2use = ['Chut', 'Rumble', 'Wheek', 'Whine']

all_files_nameonly= []
CarModPow_data_list= []

for cur_call_dir in valid_datadirs:
    cur_call_path = clean_in_data_dir + cur_call_dir + '/' 
    # print(cur_call_path)
    cur_dir_files = [f for f in os.listdir(cur_call_path) if os.path.isfile(os.path.join(cur_call_path, f))]
    all_files_nameonly = all_files_nameonly + cur_dir_files
    CarModPow_data_list = CarModPow_data_list + [cur_call_path + f for f in cur_dir_files]
    

print(f"--> all_files_nameonly: len lines= {len(all_files_nameonly)}, first line = {all_files_nameonly[0]}")
print(f"--> CarModPow_data_list: len lines= {len(CarModPow_data_list)}, first line = {CarModPow_data_list[0]}")

out_allfiles_txt_fname = out_ml_dir + 'CarMod_data_list.npy'
np.save(out_allfiles_txt_fname, CarModPow_data_list)

with open(out_allfiles_txt_fname, 'w') as f:
    for line in CarModPow_data_list:
        f.write(f"{line}\n")

# qq = np.ravel(mh.loadmat(CarModPow_data_list[0])["CarMod_power"]["CarMod_power"])
# print(qq.shape)

--> all_files_nameonly: len lines= 1605, first line = CarMod_psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
--> CarModPow_data_list: len lines= 1605, first line = D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_clean/Chut/CarMod_psd_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat


This time, let's not read all mat-files because the files would be huge. Instead, we save training, validation filenames and load data in minibatches. 

In [59]:
pre_search_str = 'clean/'
post_search_str = '/CarMod'

    
data_label_name = [item[item.rfind(pre_search_str)+len(pre_search_str):item.rfind(post_search_str)] for item in CarModPow_data_list]
data_label_y = len(calls2use)*np.ones((len(data_label_name),1))
unq_vals, unq_counts = np.unique(data_label_y, return_counts=True)
print(dict(zip(unq_vals,unq_counts)))

for ind, cur_call in zip(np.arange(len(data_label_name)),data_label_name):
    if cur_call in calls2use: 
        data_label_y[ind,0] = calls2use.index(cur_call)

CarMod_data_label = np.array(data_label_y).astype(int)
CarMod_data_label_oh= to_categorical(CarMod_data_label)

out_allfiles_txt_label = out_ml_dir + 'CarMod_data_label.npy'
out_allfiles_txt_label_oh = out_ml_dir + 'CarMod_data_label_onehot.npy'
np.save(out_allfiles_txt_label, CarMod_data_label)
np.save(out_allfiles_txt_label_oh, CarMod_data_label_oh)


{4.0: 1605}


Let's create file/label lists with shuffeld indices. 

In [60]:
CarMod_data_list_shuffled, CarMod_data_label_shuffled = shuffle(CarModPow_data_list, CarMod_data_label)
out_allfiles_txt_label_shuffled = out_ml_dir + 'all_CarMod_data_label_shuffled.npy'
out_allfiles_txt_label_oh_shuffled = out_ml_dir + 'all_CarMod_data_label_onehot_shuffled.npy'
np.save(out_allfiles_txt_label_shuffled, CarMod_data_label_shuffled)
np.save(out_allfiles_txt_label_oh_shuffled, CarMod_data_list_shuffled)

print(CarMod_data_label_shuffled[0:4])
print(CarMod_data_list_shuffled[0:4])


[[0]
 [2]
 [4]
 [0]]
['D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_clean/Chut/CarMod_psd_Chut_3_Feb_11_2022_58942456_ms_37472_37803.mat', 'D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_clean/Wheek/CarMod_psd_Wheek_2_Mar_20_2022_55617305_ms_31113_32067.mat', 'D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_clean/HighWhistle/CarMod_psd_HighWhistle_2_Mar_16_2022_50426XXX_ms_12733_13508.mat', 'D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_clean/Chut/CarMod_psd_Chut_3_Mar_14_2022_54636590_ms_127_1340.mat']


Now let's split the data into train/"validation" (Note: test for now, will make it train/dev/test later)

In [61]:
CarMod_data_list_shuffled_numpy = np.array(CarMod_data_list_shuffled)
print(CarMod_data_list_shuffled_numpy[0])

X_train_filenames, X_val_filenames, y_train, y_val = train_test_split(
    CarMod_data_list_shuffled_numpy, CarMod_data_label_shuffled, test_size=0.2, random_state=1)

print(X_train_filenames.shape) # (1284,)
print(y_train.shape)           # (1284, 5)

print(X_val_filenames.shape)   # (321,)
print(y_val.shape)             # (321, 5)

# You can save these files as well. As you will be using them later for training and validation of your model.
out_dir = root_matdata_dir + 'ml_data/'
np.save(out_dir + 'X_train_filenames.npy', X_train_filenames)
np.save(out_dir + 'y_train.npy', y_train)

np.save(out_dir + 'X_val_filenames.npy', X_val_filenames)
np.save(out_dir + 'y_val.npy', y_val)

D:/Dropbox/Python/MLmodels/Datasets/CarModPow/level65_dBspl_clean/Chut/CarMod_psd_Chut_3_Feb_11_2022_58942456_ms_37472_37803.mat
(1284,)
(1284, 1)
(321,)
(321, 1)


Now let's create batches of data because all images are too big to fit in memory

In [62]:
class My_Custom_Generator(Sequence):
  
  def __init__(self, CarMod_filenames, labels, batch_size) :
    self.CarMod_filenames = CarMod_filenames
    self.labels = labels
    self.batch_size = batch_size
    
    
  def __len__(self) :
    return (np.ceil(len(self.CarMod_filenames) / float(self.batch_size))).astype(np.int)
  
  
  def __getitem__(self, idx) :
    batch_x = self.CarMod_filenames[idx * self.batch_size : (idx+1) * self.batch_size]
    batch_y = self.labels[idx * self.batch_size : (idx+1) * self.batch_size]
    
    return np.array( [np.ravel(mh.loadmat(X_train_filenames[0])["CarMod_power"]["CarMod_power"]) for file_name in batch_x]), np.array(batch_y)

batch_size = 32 
my_training_batch_generator = My_Custom_Generator(X_train_filenames, y_train, batch_size)
my_validation_batch_generator = My_Custom_Generator(X_val_filenames, y_val, batch_size)


Let's define the model! 

In [63]:
tf.random.set_seed(1234)  # applied to achieve consistent results
NNmodel = Sequential()
NNmodel.add(tf.keras.Input(shape=(603,)))
NNmodel.add(Dense(5, activation = 'relu',   name = "L1"))
NNmodel.add(Dense(5, activation = 'linear',   name = "L2"))

# Compile the model 
NNmodel.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(0.01),
)
print(NNmodel.output_shape)
NNmodel.summary()



(None, 5)
Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 L1 (Dense)                  (None, 5)                 3020      
                                                                 
 L2 (Dense)                  (None, 5)                 30        
                                                                 
Total params: 3,050
Trainable params: 3,050
Non-trainable params: 0
_________________________________________________________________


Let's train the model 

In [64]:
NNmodel.fit_generator(generator=my_training_batch_generator,
                   steps_per_epoch = int(len(y_train) // batch_size),
                   epochs = 10,
                   verbose = 1,
                   validation_data = my_validation_batch_generator,
                   validation_steps = int(len(y_val) // batch_size))

  NNmodel.fit_generator(generator=my_training_batch_generator,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return (np.ceil(len(self.CarMod_filenames) / float(self.batch_size))).astype(np.int)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2157f3641c0>

## look at accuracy 

In [None]:
y_train_pred_NN = np.argmax(NNmodel.predict(X_train), axis=1)
y_test_pred_NN = np.argmax(NNmodel.predict(X_test), axis=1)

print(f"Unique values in y_train={np.unique(y_train_pred_NN)}")
print(f"Unique values in y_test_pred={np.unique(y_test_pred_NN)}")