# Data generators

The goal of this notebook will be to be able to create a data generators since the data is too big to be able to be stored in the RAM. 

In [1]:
#Let's first import the modules wee need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
from tensorflow.keras.models import load_model
import os
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

In [2]:
#Constants
PATH_FEATURES_FOLDER = './Features/'
PATH_MELSPEC_313_128_FOLDER = PATH_FEATURES_FOLDER + 'melspec_313_128/'

## Train, test, validation dataset

For the custom data generator we will need to separate the data into those three datasets.<br />
The spectrogram are stored in the folder Features/melspec_313_128/. Each machinery has its folder fan/, valve/ etc... <br />
Each audio sample has its own mespectrogram flattened stored as a .npy file. So the file is one line of 313*128 = 40064 features

In [3]:
#Get file paths and labels
path_files = []
labels = []

#Walk through melspectrogram folders
for subdirectory, directory, files in os.walk(PATH_MELSPEC_313_128_FOLDER):
    
    #Get label using directory folder name
    label = subdirectory.split('/')[-1]
    
    #Loop through files
    for file in files:
        path_file = subdirectory + '/' + file
        path_files.append(path_file)
        labels.append(label)

In [4]:
#Encoding labels
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(labels)

In [5]:
#Separate into three datasets
path_files_train, path_files_test, y_train, y_test = train_test_split(
    path_files, label_encoded, test_size=0.1, stratify=label_encoded)

path_files_train, path_files_valid, y_train, y_valid = train_test_split(
    path_files_train, y_train, test_size=0.1, stratify=y_train)

## Data Generator (Working)

Since the datasets are quite big, let's make a data generator.<br/>
- First we will need to get all filepaths and labels corresponding to those files
- Next separate train, validation and test datasets
- Create a custom class of data generator

In [6]:
class CustomDataGenerator(tf.keras.utils.Sequence):
    
    def __init__(self, list_path_files, labels, batch_size=300, dim=40064,
                 n_channels=1, shuffle=True, n_classes=7):
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_path_files = list_path_files
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.n_classes = n_classes
        self.on_epoch_end()
        
    def on_epoch_end(self):
        #Shuffle indexes for not having the same batches each epoch
        self.indexes = np.arange(len(self.list_path_files))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
            
    def __data_generation(self, indexes):
        
        # Initialization
        X = np.empty((self.batch_size, self.dim))
        y = np.empty((self.batch_size), dtype=int)
        
        # Generate data
        for i, index in enumerate(indexes):
            # Store sample
            X[i,] = np.load(self.list_path_files[index])
            
            # Store class
            y[i] = self.labels[index]
            
        return X, tf.keras.utils.to_categorical(y, num_classes=self.n_classes)
    
    def __len__(self):
        #Denotes the number of batches per epoch
        return int(np.floor(len(self.list_path_files) / self.batch_size))
    
    def __getitem__(self, index):
        
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        
        # Generate data
        X, y = self.__data_generation(indexes)
        return X, y
    
    
#Inspired by https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly

## Model

In [11]:
#Initializaiton of the layers
inputs = Input(shape=40064, name='Inputs')
first_layer = Dense(128, activation='relu', kernel_initializer ='normal', name='First_layer')
second_layer = Dense(128, activation='relu', kernel_initializer ='normal', name='Second_layer')
third_layer = Dense(64, activation='relu', kernel_initializer ='normal', name='Third_layer')
fourth_layer = Dense(7, activation='softmax', kernel_initializer ='normal', name='Output_layer')

#Construct the layer's order
x = first_layer(inputs)
x = second_layer(x)
x = third_layer(x)
outputs = fourth_layer(x)

#Initialize the model
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Inputs (InputLayer)         [(None, 40064)]           0         
                                                                 
 First_layer (Dense)         (None, 128)               5128320   
                                                                 
 Second_layer (Dense)        (None, 128)               16512     
                                                                 
 Third_layer (Dense)         (None, 64)                8256      
                                                                 
 Output_layer (Dense)        (None, 7)                 455       
                                                                 
Total params: 5,153,543
Trainable params: 5,153,543
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Generators
training_generator = CustomDataGenerator(path_files_train, y_train)
validation_generator = CustomDataGenerator(path_files_valid, y_valid)

In [None]:
# Train model on dataset
model.fit(
    training_generator, 
    epochs=2,
    #workers=6,
    #use_multiprocessing=True
) 

Epoch 1/2


If you use the multiprocessing parameters it directly goes into a deadlock and don't manage to process. <br />
Without multi processing, the training is laborious and takes too much time. <br />
Let's try to use a different way: tf.data.Dataset generators


# Sandbox test (Not working)

In [8]:
z = list(range(len(path_files_train)))

In [10]:
len(z)

20412

In [11]:
dataset = tf.data.Dataset.from_generator(lambda: z, tf.uint8)

In [12]:
dataset

<FlatMapDataset element_spec=TensorSpec(shape=<unknown>, dtype=tf.uint8, name=None)>

In [26]:
def get_X_y_from_index(idx, data, label):
    idx = idx.numpy() # Decoding from the EagerTensor object
    y = label[idx]
    x = np.load(data[idx])
    return tf.convert_to_tensor(x), tf.convert_to_tensor(y)

In [33]:
z = list(range(len(path_files_train))) # The index generator

dataset = tf.data.Dataset.from_generator(lambda: z, tf.uint8)

dataset = dataset.shuffle(buffer_size=len(z), seed=0,  
                          reshuffle_each_iteration=True)

dataset = dataset.map(lambda i: tf.py_function(func=get_X_y_from_index, 
                                               inp=[i, path_files_train, y_train], 
                                               Tout=[tf.float32,
                                                     tf.int64]
                                               ), 
                      num_parallel_calls=tf.data.AUTOTUNE)

dataset = dataset.batch(32)

In [34]:
dataset

<BatchDataset element_spec=(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=<unknown>, dtype=tf.int64, name=None))>

In [35]:
model.fit(dataset,
          epochs=10)

Epoch 1/10


TypeError: 'NoneType' object is not callable

# Let's try another thing (Not working)

In [72]:
def load(path):
    print(path)
    array = np.load(path)
    return tf.convert_to_tensor(array, dtype=tf.float64)

In [73]:
dataset = tf.data.Dataset.from_tensors(path_files_train)
print(dataset)

<TensorDataset element_spec=TensorSpec(shape=(20412,), dtype=tf.string, name=None)>


In [74]:
dataset = dataset.map(
    lambda x: tf.py_function(load, [x], [tf.float32]), 
    num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [75]:
for i in dataset.take(1):
    print('a')

tf.Tensor(
[b'./Features/melspec_313_128/ToyCar/section_02_source_train_normal_0605_v1pat_04.npy'
 b'./Features/melspec_313_128/slider/section_00_source_train_normal_0716_pat_00.npy'
 b'./Features/melspec_313_128/ToyTrain/section_00_source_train_normal_0143_pat_00.npy'
 ...
 b'./Features/melspec_313_128/ToyTrain/section_00_source_train_normal_0975_pat_01.npy'
 b'./Features/melspec_313_128/gearbox/section_00_source_train_normal_0929_pat_01.npy'
 b'./Features/melspec_313_128/ToyCar/section_02_source_train_normal_0471_v1pat_04.npy'], shape=(20412,), dtype=string)


InvalidArgumentError: TypeError: expected str, bytes or os.PathLike object, not tensorflow.python.framework.ops.EagerTensor
Traceback (most recent call last):

  File "C:\Users\Quentin\anaconda3\envs\tensorflow_keras\lib\site-packages\tensorflow\python\ops\script_ops.py", line 269, in __call__
    return func(device, token, args)

  File "C:\Users\Quentin\anaconda3\envs\tensorflow_keras\lib\site-packages\tensorflow\python\ops\script_ops.py", line 147, in __call__
    outputs = self._call(device, args)

  File "C:\Users\Quentin\anaconda3\envs\tensorflow_keras\lib\site-packages\tensorflow\python\ops\script_ops.py", line 154, in _call
    ret = self._func(*args)

  File "C:\Users\Quentin\anaconda3\envs\tensorflow_keras\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\Quentin\AppData\Local\Temp\ipykernel_5036\4581593.py", line 3, in load
    array = np.load(path)

  File "C:\Users\Quentin\anaconda3\envs\tensorflow_keras\lib\site-packages\numpy\lib\npyio.py", line 407, in load
    fid = stack.enter_context(open(os_fspath(file), "rb"))

TypeError: expected str, bytes or os.PathLike object, not tensorflow.python.framework.ops.EagerTensor


	 [[{{node EagerPyFunc}}]] [Op:IteratorGetNext]

In [None]:
# https://python.tutorialink.com/loading-a-large-dataset-from-csv-files-in-tensorflow/

### Let's try again another thing (Not working)

In [8]:
len(y_train)

20412

In [9]:
np.stack(y_valid)

array([2, 4, 1, ..., 4, 4, 5], dtype=int64)

In [17]:

    

def tf_parse_filename(filename_batch):
    
    def get_label(filename):
        if('bearing' in filename):
            return 0
        elif('fan' in filename):
            return 1
        elif('gearbox' in filename):
            return 2
        elif('slider' in filename):
            return 3
        elif('ToyCar' in filename):
            return 4
        elif('ToyTrain' in filename):
            return 5
        elif('valve' in filename):
            return 6

    def parse_filename(filename_batch):
        data = []
        labels = []
        for filename in filename_batch:
            # Read data
            filename_str = filename.numpy().decode()
            # Read .csv file 
            data_point= np.load(filename_str)

            # Create label
            current_label = get_label(filename)
            label = np.zeros(7, dtype=np.float32)
            label[current_label] = 1.0

            data.append(data_point)
            labels.append(label)

        return np.stack(data), np.stack(labels)


    x, y = tf.py_function(parse_filename, [filename_batch], [tf.float32, tf.float32])
    
    tf.ensure_shape(
        x, (len(filename_batch), 40064), name=None
    )
    
    tf.ensure_shape(
        y, (len(filename_batch), 7), name=None
    )
    
    return x, y




In [18]:
train_ds = tf.data.Dataset.from_tensor_slices(path_files_train)
train_ds = train_ds.batch(32)
train_ds = train_ds.map(tf_parse_filename, num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_ds = train_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

TypeError: in user code:

    File "C:\Users\Quentin\AppData\Local\Temp\ipykernel_6892\347323597.py", line 41, in tf_parse_filename  *
        tf.ensure_shape(
    File "<string>", line 3, in raise_from
        

    TypeError: Dimension value must be integer or None or have an __index__ method, got value '<tf.Tensor 'strided_slice:0' shape=() dtype=int32>' with type '<class 'tensorflow.python.framework.ops.Tensor'>'


In [None]:
model.fit(
    train_ds, 
    epochs=2,
    #workers=6,
    #use_multiprocessing=True
)

## Let's try once again a different approach (Working)

In [39]:
def data_generator(file_list, label_list, batch_size):
    
    #Index used too go over file list 
    index = 0
    
    #Infinite loop
    while True:
        
        #Case we looped over all the files
        if((index + 1) * batch_size >= len(file_list)):
            #Reinit variables for a next round
            index = 0
            np.random.shuffle(file_list)
            
        #Loop over files from index * batch size to (index + 1) * batch size
        else:
            #Get files paths
            file_chunk = file_list[index*batch_size:(index+1)*batch_size]
            label_chunk = label_list[index*batch_size:(index+1)*batch_size]
            
            #Init data and labels list
            data = []
            labels = []
            
            #Loop over batch files
            for file, label in zip(file_chunk, label_chunk):
                data.append(np.load(file).reshape(40064))
                labels.append(tf.keras.utils.to_categorical(label, num_classes=7))
                
            data = np.asarray(data)
            labels = np.asarray(labels)
            yield data, labels
            index = index + 1

In [19]:
y_train

array([1, 4, 4, ..., 0, 5, 4], dtype=int64)

In [24]:
np.load(path_files_train[0]).dtype

dtype('float32')

In [32]:
np.load(path_files_train[0]).reshape(40064)

array([ -9.363078 ,  -8.736167 ,  -7.9711294, ..., -39.11569  ,
       -39.8356   , -42.039524 ], dtype=float32)

In [30]:

tf.keras.utils.to_categorical(y_train[0], num_classes=7).shape

(7,)

In [28]:
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

In [41]:
generated_data = data_generator(path_files_train, y_train, batch_size = 10)
num = 0
for data, labels in generated_data:
    print(data.shape, labels.shape)
    print(data.dtype, labels.dtype)
    print(labels, "<--Labels")  # Just to see the lables
    print()
    num = num + 1
    if num > 5: break

(10, 40064) (10, 7)
float32 float32
[[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]] <--Labels

(10, 40064) (10, 7)
float32 float32
[[0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]] <--Labels

(10, 40064) (10, 7)
float32 float32
[[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]] <--Labels

(10, 40064) (10, 7)
float32 float32
[[1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0.

In [42]:
batch_size = 15
dataset = tf.data.Dataset.from_generator(
    data_generator,
    args= [path_files_train, y_train, batch_size],
    output_types = (tf.float32, tf.float32),
    output_shapes = ((batch_size, 40064),(batch_size, 7))
)

In [43]:
num = 0
for data, labels in dataset:
    print(data.shape, labels.shape)
    print(labels)
    print()
    num = num + 1
    if num > 7: break

(15, 40064) (15, 7)
tf.Tensor(
[[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0.]], shape=(15, 7), dtype=float32)

(15, 40064) (15, 7)
tf.Tensor(
[[0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]], shape=(15, 7), dtype=float32)

(15, 40064) (15, 7)
tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 

In [44]:
batch_size = 32
train_dataset = tf.data.Dataset.from_generator(
    data_generator,
    args= [path_files_train, y_train, batch_size],
    output_types = (tf.float32, tf.float32),
    output_shapes = ((batch_size, 40064),(batch_size, 7))
)

validation_dataset = tf.data.Dataset.from_generator(
    data_generator,
    args= [path_files_valid, y_valid, batch_size],
    output_types = (tf.float32, tf.float32),
    output_shapes = ((batch_size, 40064),(batch_size, 7))
)

test_dataset = tf.data.Dataset.from_generator(
    data_generator,
    args= [path_files_test, y_test, batch_size],
    output_types = (tf.float32, tf.float32),
    output_shapes = ((batch_size, 40064),(batch_size, 7))
)

In [45]:
#Initializaiton of the layers
inputs = Input(shape=40064, name='Inputs')
first_layer = Dense(128, activation='relu', kernel_initializer ='normal', name='First_layer')
second_layer = Dense(128, activation='relu', kernel_initializer ='normal', name='Second_layer')
third_layer = Dense(64, activation='relu', kernel_initializer ='normal', name='Third_layer')
fourth_layer = Dense(7, activation='softmax', kernel_initializer ='normal', name='Output_layer')

#Construct the layer's order
x = first_layer(inputs)
x = second_layer(x)
x = third_layer(x)
outputs = fourth_layer(x)

#Initialize the model
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Inputs (InputLayer)         [(None, 40064)]           0         
                                                                 
 First_layer (Dense)         (None, 128)               5128320   
                                                                 
 Second_layer (Dense)        (None, 128)               16512     
                                                                 
 Third_layer (Dense)         (None, 64)                8256      
                                                                 
 Output_layer (Dense)        (None, 7)                 455       
                                                                 
Total params: 5,153,543
Trainable params: 5,153,543
Non-trainable params: 0
_________________________________________________________________


In [47]:
steps_per_epoch = np.int32(np.ceil(len(path_files_train)/batch_size))
validation_steps = np.int32(np.ceil(len(path_files_valid)/batch_size))
print("steps_per_epoch = ", steps_per_epoch)
print("validation_steps = ", validation_steps)

steps_per_epoch =  638
validation_steps =  71


In [48]:
model.fit(train_dataset, validation_data = validation_dataset, steps_per_epoch = steps_per_epoch,
         validation_steps = validation_steps, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x22005b113d0>

In [None]:
#Inspired by https://www.kaggle.com/code/biswajitsahoo1111/reading-multiple-csv-files-in-tensorflow-2/notebook