These code are based on https://medium.com/@mrgarg.rajat/training-on-large-datasets-that-dont-fit-in-memory-in-keras-60a974785d71
Load required packages 

In [66]:
import os 
import numpy as np 
import h5py
import scipy.io
import matlab_helpers as mh 
from keras.utils import to_categorical, Sequence
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from keras.models import Sequential, Model, load_model
from keras.layers import Input, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import BatchNormalization 
import tensorflow as tf 

Read the names of all mat-files 

In [67]:
root_matdata_dir = '../Datasets/mps_gp_sgram/'
clean_matdata_dir = root_matdata_dir + 'level65_dBspl_clean_ds/'
out_ml_dir= root_matdata_dir + 'ds_ml_data/'

if not os.path.exists(out_ml_dir):
    os.makedirs(out_ml_dir)

valid_datadirs = ['Chut', 'HighWhistle', 'Rumble', 'Tchatter', 'Wheek', 'Whine']
calls2use = ['Chut', 'Rumble', 'Wheek', 'Whine']

all_files_nameonly= []
mps_data_list= []

for cur_call_dir in valid_datadirs:
    cur_call_path = clean_matdata_dir + cur_call_dir + '/' 
    # print(cur_call_path)
    cur_dir_files = [f for f in os.listdir(cur_call_path) if os.path.isfile(os.path.join(cur_call_path, f))]
    all_files_nameonly = all_files_nameonly + cur_dir_files
    mps_data_list = mps_data_list + [cur_call_path + f for f in cur_dir_files]
    

print(f"--> all_files_nameonly: len lines= {len(all_files_nameonly)}, first line = {all_files_nameonly[0]}")
print(f"--> mps_data_list: len lines= {len(mps_data_list)}, first line = {mps_data_list[0]}")

out_allfiles_txt_fname = out_ml_dir + 'dsMPS_data_list.npy'
np.save(out_allfiles_txt_fname, mps_data_list)

with open(out_allfiles_txt_fname, 'w') as f:
    for line in mps_data_list:
        f.write(f"{line}\n")

--> all_files_nameonly: len lines= 1605, first line = ds_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat
--> mps_data_list: len lines= 1605, first line = D:/Dropbox/Python/MLmodels/Datasets/mps_gp_sgram/level65_dBspl_clean_ds/Chut/ds_Chut_2_Feb_07_2022_51861688_ms_101198_101787.mat


This time, let's not read all mat-files because the files would be huge. Instead, we save training, validation filenames and load data in minibatches. 

In [68]:
pre_search_str = 'clean_ds/'
post_search_str = '/ds_'
    
data_label_name = [item[item.rfind(pre_search_str)+len(pre_search_str):item.rfind(post_search_str)] for item in mps_data_list]
data_label_y = len(calls2use)*np.ones((len(data_label_name),1))
unq_vals, unq_counts = np.unique(data_label_y, return_counts=True)
print(dict(zip(unq_vals,unq_counts)))

for ind, cur_call in zip(np.arange(len(data_label_name)),data_label_name):
    if cur_call in calls2use: 
        data_label_y[ind,0] = calls2use.index(cur_call)

mps_data_label = np.array(data_label_y).astype(int)
mps_data_label_oh= to_categorical(mps_data_label)

print(np.unique(data_label_y, axis=0))
# print(np.unique(mps_data_label, axis=0))

out_allfiles_txt_label = out_ml_dir + 'dsMPS_data_label.npy'
out_allfiles_txt_label_oh = out_ml_dir + 'dsMPS_data_label_onehot.npy'
np.save(out_allfiles_txt_label, mps_data_label)
np.save(out_allfiles_txt_label_oh, mps_data_label_oh)


{4.0: 1605}
[[0.]
 [1.]
 [2.]
 [3.]
 [4.]]


Let's create file/label lists with shuffeld indices. 

In [69]:
mps_data_list_shuffled, mps_data_label_oh_shuffled = shuffle(mps_data_list, mps_data_label_oh)
out_allfiles_txt_label_shuffled = out_ml_dir + 'all_MPS_data_label_shuffled.npy'
out_allfiles_txt_label_oh_shuffled = out_ml_dir + 'all_MPS_data_label_onehot_shuffled.npy'
np.save(out_allfiles_txt_label_shuffled, mps_data_label_oh_shuffled)
np.save(out_allfiles_txt_label_oh_shuffled, mps_data_list_shuffled)


Now let's split the data into train/"validation" (Note: test for now, will make it train/dev/test later)

In [70]:
mps_data_list_shuffled_numpy = np.array(mps_data_list_shuffled)

X_train_filenames, X_val_filenames, y_train, y_val = train_test_split(
    mps_data_list_shuffled_numpy, mps_data_label_oh_shuffled, test_size=0.2, random_state=1)

print(X_train_filenames.shape) # (1284,)
print(y_train.shape)           # (1284, 5)

print(X_val_filenames.shape)   # (321,)
print(y_val.shape)             # (321, 5)

# You can save these files as well. As you will be using them later for training and validation of your model.
np.save(out_ml_dir + 'X_train_filenames.npy', X_train_filenames)
np.save(out_ml_dir + 'y_train.npy', y_train)

np.save(out_ml_dir + 'X_val_filenames.npy', X_val_filenames)
np.save(out_ml_dir + 'y_val.npy', y_val)

(1284,)
(1284, 5)
(321,)
(321, 5)


In [71]:
print(np.unique(y_val, axis=0))

[[0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]]


Now let's create batches of data because all images are too big to fit in memory

In [72]:
class My_Custom_Generator(Sequence):
  
  def __init__(self, mps_filenames, labels, batch_size) :
    self.mps_filenames = mps_filenames
    self.labels = labels
    self.batch_size = batch_size
    
    
  def __len__(self) :
    return (np.ceil(len(self.mps_filenames) / float(self.batch_size))).astype(np.int)
  
  
  def __getitem__(self, idx) :
    batch_x = self.mps_filenames[idx * self.batch_size : (idx+1) * self.batch_size]
    batch_y = self.labels[idx * self.batch_size : (idx+1) * self.batch_size]
    
    return np.array( [mh.loadmat(X_train_filenames[0])["mps_struct"]["mps_pow_dB"] for file_name in batch_x]), np.array(batch_y)

batch_size = 32 
my_training_batch_generator = My_Custom_Generator(X_train_filenames, y_train, batch_size)
my_validation_batch_generator = My_Custom_Generator(X_val_filenames, y_val, batch_size)



Let's define the model! 

In [73]:
# model = Sequential()

# model.add(Conv2D(filters = 16, kernel_size = (5,5), padding='same', activation ='relu',input_shape=(80,69,1)))
# model.add(BatchNormalization(axis=-1))
# model.add(Conv2D(filters = 16, kernel_size = (5,5), padding='same', activation ='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(BatchNormalization(axis=-1))
# model.add(Dropout(0.25))

# model.add(Conv2D(filters = 32, kernel_size = (5,5), padding='same', activation ='relu'))
# model.add(BatchNormalization(axis=-1))
# model.add(Conv2D(filters = 32, kernel_size = (5,5), padding='same', activation ='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(BatchNormalization(axis=-1))
# model.add(Dropout(0.25))

# model.add(Conv2D(filters = 64, kernel_size = (5,5), padding='same', activation ='relu'))
# model.add(BatchNormalization(axis=-1))
# model.add(Conv2D(filters = 64, kernel_size = (5,5), padding='same', activation ='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(BatchNormalization(axis=-1))
# model.add(Dropout(0.1))

# model.add(Flatten())

# model.add(Dense(20, activation = "relu")) #Fully connected layer
# model.add(BatchNormalization())
# model.add(Dropout(0.5))

# model.add(Dense(5, activation = "softmax")) #Classification layer or output layer

# model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])

# model.summary()

model = tf.keras.Sequential([
    ## ZeroPadding2D with padding 3, input shape of 80 x 69 x 1
    ## Conv2D with 32 5x5 filters and stride of 1
    ## BatchNormalization for axis 3
    ## ReLU
    ## Max Pooling 2D with default parameters
    ## Flatten layer
    ## Dense layer with 1 unit for output & 'sigmoid' activation            
        
    # YOUR CODE STARTS HERE
    tf.keras.layers.ZeroPadding2D(padding=3,input_shape=(80,69,1)),
    tf.keras.layers.Conv2D(filters=32, kernel_size=(5,5),padding='same', strides=(1,1)),
    tf.keras.layers.BatchNormalization(axis=-1),
    tf.keras.layers.ReLU(),
    tf.keras.layers.MaxPool2D(pool_size=(2,2)),
    tf.keras.layers.Flatten(), 
    tf.keras.layers.Dense(units=5, activation='softmax'),
    # YOUR CODE ENDS HERE
    ])

model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 zero_padding2d_6 (ZeroPaddi  (None, 86, 75, 1)        0         
 ng2D)                                                           
                                                                 
 conv2d_54 (Conv2D)          (None, 86, 75, 32)        832       
                                                                 
 batch_normalization_62 (Bat  (None, 86, 75, 32)       128       
 chNormalization)                                                
                                                                 
 re_lu_6 (ReLU)              (None, 86, 75, 32)        0         
                                                                 
 max_pooling2d_30 (MaxPoolin  (None, 43, 37, 32)       0         
 g2D)                                                            
                                                     

Let's train the model 

In [74]:
model.fit_generator(generator=my_training_batch_generator,
                   steps_per_epoch = int(len(y_train) // batch_size),
                   epochs = 10,
                   verbose = 1,
                   validation_data = my_validation_batch_generator,
                   validation_steps = int(len(y_val) // batch_size))

  model.fit_generator(generator=my_training_batch_generator,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return (np.ceil(len(self.mps_filenames) / float(self.batch_size))).astype(np.int)


Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 