<a href="https://colab.research.google.com/github/satvik-venkatesh/data-gen-keras/blob/master/data-gen-keras-train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Generators with Keras and TensorFlow on Google Colab


In [2]:
import numpy as np
import tensorflow as tf
import math
import glob
import pickle
import os

In [3]:
"""
Mount Google Drive into Colab.
"""
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Here, I clone the GitHub repository that contains the data. If you prefer, you can extract the zip files to a location in your personal Google Drive instead. The data has already been pre-processed for you. I will not go into the details of the data because that is not the objective of this tutorial.

In [180]:
# Clone the repository that contains the zipped train/val data used below.
# NOTE(review): re-running this cell after a successful clone presumably fails
# because the target directory already exists — confirm, or guard the clone.
!git clone "https://github.com/satvik-venkatesh/data-gen-keras.git"

Cloning into 'data-gen-keras'...
remote: Enumerating objects: 16, done.
remote: Counting objects: 100% (16/16), done.
remote: Compressing objects: 100% (15/15), done.
remote: Total 16 (delta 0), reused 13 (delta 0), pack-reused 0
Unpacking objects: 100% (16/16), done.
Checking out files: 100% (8/8), done.


In [181]:
"""
Extract train data
"""
from zipfile import ZipFile

blocks = glob.glob("/content/data-gen-keras/train/*")

for b in blocks:
  zip_name = b
  with ZipFile(zip_name, 'r') as zip:
    zip.extractall('train data')
    print("Extracted all sound files into the folder 'train data'")

Extracted all sound files into the folder 'train data'
Extracted all sound files into the folder 'train data'
Extracted all sound files into the folder 'train data'
Extracted all sound files into the folder 'train data'


In [182]:
"""
Extract validation data
"""
from zipfile import ZipFile

blocks = glob.glob("/content/data-gen-keras/val/*")

for b in blocks:
  zip_name = b
  with ZipFile(zip_name, 'r') as zip:
    zip.extractall('validation data')
    print("Extracted all sound files into the folder 'validation data'")

Extracted all sound files into the folder 'validation data'
Extracted all sound files into the folder 'validation data'
Extracted all sound files into the folder 'validation data'


In [183]:
import tensorflow as tf
import keras

class DataGenerator(tf.compat.v2.keras.utils.Sequence):
  """Keras Sequence that assembles batches from individual .npy files.

  Every example is a pair ``(feature_path, label_path)``. A batch is built by
  loading each file of the pair with ``np.load`` and copying it into a
  pre-allocated array.

  Args:
    list_examples: Indexable collection of ``(feature_path, label_path)``
      pairs.
    batch_size: Number of examples per batch.
    dim: ``(dim[0], dim[1])`` shape of one feature array.
    n_classes: Size of the last axis of one label array.
    shuffle: When truthy, the example order is re-drawn at construction time
      and on every ``on_epoch_end`` call.
  """

  def __init__(self, list_examples, batch_size=64, dim=(802, 80),
                n_classes=2, shuffle=True):
    # Constructor of the data generator.
    self.dim = dim
    self.batch_size = batch_size
    self.list_examples = list_examples
    self.n_classes = n_classes
    self.shuffle = shuffle
    # Build the initial index order (shuffled if requested).
    self.on_epoch_end()

  def __len__(self):
    """Return the number of batches per epoch.

    Only full batches are counted, so up to ``batch_size - 1`` trailing
    examples of the current ordering are left out of each epoch.
    """
    return len(self.list_examples) // self.batch_size

  def __getitem__(self, index):
    """Return batch number ``index`` as a tuple ``(X, y)``."""
    # Positions (into list_examples) that belong to this batch.
    indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

    # Resolve positions to (feature_path, label_path) pairs.
    list_IDs_temp = [self.list_examples[k] for k in indexes]

    return self.__data_generation(list_IDs_temp)

  def on_epoch_end(self):
    """Reset the index order; reshuffle it when shuffling is enabled."""
    self.indexes = np.arange(len(self.list_examples))
    if self.shuffle:
      np.random.shuffle(self.indexes)

  def __data_generation(self, list_IDs_temp):
    """Load the given examples from disk and stack them into batch arrays.

    Returns:
      ``X`` with shape ``(len(list_IDs_temp), dim[0], dim[1])`` (float32) and
      ``y`` with shape ``(len(list_IDs_temp), 1, n_classes)`` (int16).
    """
    # Size the buffers by the number of examples actually received. np.empty
    # does not initialise memory, so sizing by batch_size would hand back
    # garbage rows whenever fewer than batch_size examples are passed in.
    n_examples = len(list_IDs_temp)

    X = np.empty([n_examples, self.dim[0], self.dim[1]], dtype=np.float32)

    # y is a one-hot encoded vector.
    y = np.empty([n_examples, 1, self.n_classes], dtype=np.int16)

    for i, example in enumerate(list_IDs_temp):
      # example[0] is the feature file, example[1] the label file.
      X[i, :, :] = np.load(example[0])
      y[i, :, :] = np.load(example[1])

    return X, y

In [184]:
# Sanity check: load a single validation label file and display its shape.
nn = np.load("/content/validation data/content/Mel Files/block-id-1/id-label-127.npy")
nn.shape

(1, 2)

In [185]:
"""
Natural Sort
"""

import re

def tryint(s):
    """Return ``int(s)`` when the conversion succeeds, otherwise ``s`` itself.

    Only ``ValueError`` is treated as "not a number"; any other exception
    raised by ``int`` propagates to the caller.
    """
    try:
        converted = int(s)
    except ValueError:
        return s
    return converted
    
def alphanum_key(s):
    """Build a natural-sort key for the string ``s``.

    ``s`` is split on runs of the digits 0-9 (the runs themselves are kept),
    and every resulting piece is passed through ``tryint``.

        "z23a" -> ["z", 23, "a"]
    """
    pieces = re.split('([0-9]+)', s)
    return list(map(tryint, pieces))

def sort_nicely(l):
    """Reorder the list ``l`` in place into natural ("human") order.

    Elements are compared through ``alphanum_key``. Nothing is returned; the
    list object passed in is the one that gets reordered.
    """
    l[:] = sorted(l, key=alphanum_key)

In [186]:
"""
This loads data for the training set.
"""
import glob
import random
"""
Load the individual numpy arrays into partition
"""
data = glob.glob("/content/train data/**/id-[0-9]*.npy", recursive=True)
sort_nicely(data)

labels = glob.glob("/content/train data/**/id-label-[0-9]*.npy", recursive=True)
sort_nicely(labels)

train_examples = [(data[i], labels[i]) for i in range(len(data))]

random.seed(4)
random.shuffle(train_examples)

partition = {}
partition['train'] = train_examples

In [187]:
"""
This loads data for the validation set.
"""
import glob
import random

data = glob.glob("/content/validation data/**/id-[0-9]*.npy", recursive=True)
sort_nicely(data)

labels = glob.glob("/content/validation data/**/id-label-[0-9]*.npy", recursive=True)
sort_nicely(labels)

validation_examples = [(data[i], labels[i]) for i in range(len(data))]

random.seed(4)
random.shuffle(validation_examples)

partition['validation'] = validation_examples

In [188]:
# Settings shared by both generators
params = dict(
    dim=(802, 80),
    batch_size=32,
    n_classes=2,
    shuffle=True,
)

# One generator per split, both built from the same settings
training_generator = DataGenerator(partition['train'], **params)
validation_generator = DataGenerator(partition['validation'], **params)

In [192]:
"""
RNN to train on data
"""

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

mel_input = keras.Input(shape=(802, 80), name="mel_input")
X = mel_input

X = layers.Bidirectional(layers.GRU(40, return_sequences = True))(X)
X = layers.BatchNormalization(momentum=0.0)(X)

pred = layers.Dense(2, activation='sigmoid')(X)

model = keras.Model(inputs = [mel_input], outputs = [pred])

keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=[keras.losses.CategoricalCrossentropy()], metrics=['categorical_accuracy']
)

model.summary()

Model: "functional_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
mel_input (InputLayer)       [(None, 802, 80)]         0         
_________________________________________________________________
bidirectional_18 (Bidirectio (None, 802, 80)           29280     
_________________________________________________________________
batch_normalization_18 (Batc (None, 802, 80)           320       
_________________________________________________________________
dense_18 (Dense)             (None, 802, 2)            162       
Total params: 29,762
Trainable params: 29,602
Non-trainable params: 160
_________________________________________________________________


In [193]:
# Train for 5 epochs on batches from training_generator, passing
# validation_generator as the validation data.
model.fit(training_generator, validation_data=validation_generator, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f99bdb12cc0>

In [196]:
# Evaluate the trained model on batches from validation_generator.
# The displayed list is presumably [loss, categorical_accuracy], matching the
# loss and the single metric given to model.compile — confirm.
model.evaluate(validation_generator)



[0.03891988843679428, 0.993960440158844]