In [1]:
# Each TFRecord file holds the examples of a single class and is loaded into its own tf.data.Dataset.
# The per-class datasets are collected in a list and passed to tf.data.experimental.sample_from_datasets.

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import pathlib
import gc
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Mount Google Drive (Colab only) so that the paths under /content/drive
# used by the following cells can be resolved.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Audio, spectrogram and training configuration shared by the cells below.
N_CLASSES = 264  # Number of target classes (width of the final softmax layer)
SAMPLE_RATE = 30000 # Audio sample rate in Hz
MAX_DURATION = 5 # Clip duration in seconds 
FFT_SIZE = 1024 # STFT frame length in samples
HOP_SIZE = 512 # Number of samples between each successive FFT window
N_MEL_BINS = 128  # Number of mel bands in the spectrogram
N_SPECTROGRAM_BINS = (FFT_SIZE // 2) + 1  # Linear-frequency bins produced by an FFT of size FFT_SIZE
F_MIN = 20 # Min frequency cutoff in Hz
F_MAX = SAMPLE_RATE / 2  # Max frequency cutoff in Hz (half the sample rate)
BATCH_SIZE = 16  # Training batch size; also used to derive steps_per_epoch

In [None]:
# Training metadata table; the 'date' column is parsed as datetimes.
# In the cells shown here it is only used for its row count (steps_per_epoch).
train = pd.read_csv("/content/drive/My Drive/train.csv", parse_dates=['date'])

In [None]:
# Google Drive directories holding the train and test TFRecord files
# (264 files in each, i.e. one file per class).
train_dir = '/content/drive/My Drive/lala1/Data-5Seconds/Train'
test_dir = '/content/drive/My Drive/lala1/Data-5Seconds/Test'

In [None]:
# Each example processed by this notebook is a 5-second audio clip (see MAX_DURATION).

In [None]:
# Entry names (not full paths) found in each TFRecord directory; they are
# joined with the directory path when the datasets are built.
train_tfr = os.listdir(train_dir)
test_tfr = os.listdir(test_dir)

In [None]:
def read_tfrecord(serialized_example):
    """Decode one serialized ``tf.train.Example`` into a pair of tensors.

    Parameters
    ----------
    serialized_example : scalar string tf.Tensor
        A single record read from a TFRecord file.

    Returns
    -------
    (tf.Tensor, tf.Tensor)
        ``feature0`` deserialized as a float32 tensor, and ``feature1`` as a
        scalar int64 tensor (presumably the waveform and its class label --
        confirm against the code that wrote the records).
    """
    schema = {
        'feature0': tf.io.FixedLenFeature((), tf.string),
        'feature1': tf.io.FixedLenFeature((), tf.int64),
    }
    parsed = tf.io.parse_single_example(serialized_example, schema)

    # 'feature0' is stored as a serialized tensor and must be decoded
    # explicitly; 'feature1' is already a plain int64 scalar.
    return (
        tf.io.parse_tensor(parsed['feature0'], out_type=tf.float32),
        parsed['feature1'],
    )

In [None]:
# Build one tf.data.Dataset per training TFRecord file (one file per class).
# The list grows with the directory contents instead of being pre-sized to a
# hard-coded 264: a missing file can no longer leave an integer placeholder in
# the list, and an extra file can no longer raise IndexError.
lis = []
for d in train_tfr:
    d_path = os.path.join(train_dir, d)
    tfrecord_dataset_train = tf.data.TFRecordDataset([d_path], compression_type="GZIP")
    dataset = tfrecord_dataset_train.map(read_tfrecord)
    lis.append(dataset)
i = len(lis)  # number of datasets built; printed by the sanity-check cell below

# Interleave the per-class datasets by sampling among them at random
# (no weights are given, so the library default weighting applies).
train_ds = tf.data.experimental.sample_from_datasets(lis)

In [None]:
print(i)  # sanity check: number of training datasets built above, expected 264

In [None]:
# Build one tf.data.Dataset per test TFRecord file (one file per class).
# The list grows with the directory contents instead of being pre-sized to a
# hard-coded 264: a missing file can no longer leave an integer placeholder in
# the list, and an extra file can no longer raise IndexError.
lis = []
for d in test_tfr:
    d_path = os.path.join(test_dir, d)
    tfrecord_dataset_test = tf.data.TFRecordDataset([d_path], compression_type="GZIP")
    dataset = tfrecord_dataset_test.map(read_tfrecord)
    lis.append(dataset)
i = len(lis)  # number of datasets built; printed by the sanity-check cell below

# Interleave the per-class datasets by sampling among them at random
# (no weights are given, so the library default weighting applies).
test_ds = tf.data.experimental.sample_from_datasets(lis)

In [None]:
print(i)  # sanity check: number of test datasets built above, expected 264

In [None]:
def prepare_for_training(ds, shuffle_buffer_size=64, batch_size=24):
    """Turn a dataset of single examples into an endless stream of batches.

    Parameters
    ----------
    ds : tf.data.Dataset
        Dataset yielding one (x, y) example at a time.
    shuffle_buffer_size : int
        Size of the buffer used to shuffle individual examples.
    batch_size : int
        Number of examples per batch.

    Returns
    -------
    tf.data.Dataset
        Shuffled, batched, infinitely repeated dataset whose batches have been
        passed through ``setshape``, with two batches prefetched.
    """
    return (
        ds.shuffle(buffer_size=shuffle_buffer_size)  # shuffle examples, not batches
        .batch(batch_size)
        .repeat()        # never exhausts; the caller bounds epochs via step counts
        .map(setshape)   # applied to whole batches
        .prefetch(2)
    )

def setshape(x, y):
    """Attach static shapes to one batch of examples.

    The clip length is derived from SAMPLE_RATE and MAX_DURATION rather than
    being repeated as a literal, so it stays in step with the model input,
    which is sized from the same two constants.

    Parameters
    ----------
    x : tf.Tensor
        Batch of inputs; declared here as (batch, n_samples, 1).
    y : tf.Tensor
        Batch of per-example values; a trailing axis is added so the result
        is declared as (batch, 1).

    Returns
    -------
    (tf.Tensor, tf.Tensor)
        ``x`` with its static shape set, and the expanded ``y``.
    """
    n_samples = SAMPLE_RATE * MAX_DURATION  # 30000 Hz * 5 s = 150000 samples
    x.set_shape([None, n_samples, 1])
    y = tf.expand_dims(y, -1)
    y.set_shape([None, 1])
    return x, y

In [None]:
# Batch with the configured BATCH_SIZE instead of the function default (24):
# steps_per_epoch is computed from BATCH_SIZE, so the pipeline has to use the
# same value for one epoch to correspond to the intended number of examples.
train_final = prepare_for_training(train_ds, batch_size=BATCH_SIZE)
test_final = prepare_for_training(test_ds, batch_size=BATCH_SIZE)

In [None]:
# clearing RAM space by deleting the variables no longer needed, and calling the garbage collector

In [None]:
# Drop references that are no longer needed, then run the garbage collector.
# The names are popped from the notebook namespace instead of being `del`-ed,
# so the cell can be re-run (or run after a partial execution) without raising
# NameError for a name that is already gone.
for _name in ('train_dir', 'test_dir', 'd', 'i', 'train_ds', 'test_ds',
              'test_tfr', 'train_tfr', 'd_path', 'dataset'):
    globals().pop(_name, None)
gc.collect()

In [None]:
# The waveform tensors are fed to a layer that computes the log-mel spectrogram of each audio clip.
# The spectrogram is then treated like an image: 2D convolutions are applied and the result is passed to a pretrained ResNet.
# The ResNet weights are frozen; it is only used to extract features from the mel spectrogram.

In [None]:
class LogMelSpectrogram(tf.keras.layers.Layer):
    """Keras layer that converts raw waveforms to log-scaled mel spectrograms.

    The mel filterbank matrix is computed once in the constructor. The forward
    pass is: STFT -> squared magnitude -> mel projection -> decibel scaling ->
    trailing channel axis.
    """

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        """Store the spectrogram settings and precompute the mel filterbank.

        Parameters
        ----------
        sample_rate : sample rate passed to the mel weight matrix.
        fft_size : STFT frame length; also fixes the number of linear bins
            (``fft_size // 2 + 1``).
        hop_size : STFT frame step.
        n_mels : number of mel bins.
        f_min : lower edge of the mel filterbank.
        f_max : upper edge of the mel filterbank; any falsy value
            (``None`` or 0) selects ``sample_rate / 2``.
        **kwargs : forwarded to ``tf.keras.layers.Layer``.
        """
        super().__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max or sample_rate / 2
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2 + 1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        """Finish layer construction for the given input shape."""
        # NOTE(review): `non_trainable_weights` looks like a computed property
        # in TF2 Keras, in which case this append has no lasting effect --
        # confirm before relying on the filterbank being a tracked weight.
        self.non_trainable_weights.append(self.mel_filterbank)
        super().build(input_shape)

    @staticmethod
    def _log10(values):
        """Base-10 logarithm expressed through the natural logarithm."""
        natural_log = tf.math.log(values)
        log_of_ten = tf.math.log(tf.constant(10, dtype=natural_log.dtype))
        return natural_log / log_of_ten

    @staticmethod
    def _power_to_db(power, amin=1e-16, top_db=80.0):
        """Convert a power spectrogram to decibels.

        Modelled on
        https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html

        The reference level is the maximum over the whole input tensor (no
        axis is given to the reduction), and the result is clipped to
        ``top_db`` below its own maximum.
        """
        log10 = LogMelSpectrogram._log10
        peak = tf.reduce_max(power)
        decibels = 10.0 * log10(tf.maximum(amin, power))
        decibels = decibels - 10.0 * log10(tf.maximum(amin, peak))
        floor = tf.reduce_max(decibels) - top_db
        return tf.maximum(decibels, floor)

    def call(self, waveforms):
        """Forward pass.

        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A batch of mono waveforms.

        Returns
        -------
        log_mel_spectrograms : tf.Tensor, shape = (None, time, freq, ch)
            The corresponding batch of log-mel spectrograms.
        """
        stft = tf.signal.stft(waveforms,
                              frame_length=self.fft_size,
                              frame_step=self.hop_size,
                              pad_end=False)

        # Power spectrum projected onto the mel filterbank.
        power = tf.square(tf.abs(stft))
        mel_power = tf.matmul(power, self.mel_filterbank)

        # Decibel scaling, then a trailing channel axis for the 2D convolutions.
        return tf.expand_dims(self._power_to_db(mel_power), 3)

    def get_config(self):
        """Return the constructor arguments merged with the base layer config."""
        layer_config = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        layer_config.update(super().get_config())
        return layer_config

In [None]:
import tensorflow_hub as hub

feature_extractor_url = "https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/4"

# Declare the input shape that the spectrogram front-end actually produces
# instead of a hard-coded 311 frames. With pad_end=False the STFT yields
# 1 + (n_samples - frame_length) // frame_step frames, i.e. 291 for
# 150000 samples, FFT_SIZE=1024 and HOP_SIZE=512 (311 corresponds to 160000
# samples). The three channels come from the last Conv2D before this layer.
n_spectrogram_frames = 1 + (SAMPLE_RATE * MAX_DURATION - FFT_SIZE) // HOP_SIZE
feature_extractor_layer = hub.KerasLayer(
    feature_extractor_url,
    input_shape=(n_spectrogram_frames, N_MEL_BINS, 3))

# Freeze the pretrained weights: the network is used as a fixed feature extractor.
feature_extractor_layer.trainable = False

In [None]:
from tensorflow.keras.layers import (BatchNormalization, Conv2D, Dense, Dropout, Flatten, Input, MaxPool2D)
from tensorflow.keras.models import Model

def ConvModel(n_classes, sample_rate=SAMPLE_RATE, duration=MAX_DURATION,
              fft_size=FFT_SIZE, hop_size=HOP_SIZE, n_mels=N_MEL_BINS, fmin=F_MIN, fmax=F_MAX):
    """Build the waveform-to-class-probabilities Keras model.

    Pipeline: raw waveform -> LogMelSpectrogram -> two small Conv2D blocks
    ending in 3 channels -> the module-level ``feature_extractor_layer``
    (run in inference mode) -> dense classification head with softmax.

    Parameters
    ----------
    n_classes : int
        Width of the softmax output.
    sample_rate, duration : numbers
        Their product is the number of samples per input waveform.
    fft_size, hop_size, n_mels, fmin, fmax :
        Passed to ``LogMelSpectrogram``.

    Returns
    -------
    tensorflow.keras.models.Model
        Uncompiled model taking float32 inputs of shape
        ``(sample_rate * duration,)``.
    """
    waveform = Input(shape=(sample_rate * duration,), name='input', dtype='float32')

    # Spectrogram front-end; axis 2 is the frequency axis of the
    # (batch, time, freq, ch) spectrogram.
    net = LogMelSpectrogram(sample_rate, fft_size, hop_size, n_mels, fmin, fmax)(waveform)
    net = BatchNormalization(axis=2)(net)

    # Two conv blocks: 1 channel -> 6 -> 3, so the output has the three
    # channels the feature extractor is declared with.
    for n_filters in (6, 3):
        net = Conv2D(n_filters, (3, 3), padding='same')(net)
        net = Dropout(0.4)(net)
        net = BatchNormalization()(net)

    # Feature extractor is always called in inference mode.
    net = feature_extractor_layer(net, training=False)

    # Classification head.
    net = Dense(1024, activation='relu')(net)
    net = Dropout(0.4)(net)
    net = BatchNormalization()(net)

    net = Dense(512, activation='relu')(net)
    net = Dropout(0.4)(net)
    class_probs = Dense(n_classes, activation='softmax')(net)

    return Model(inputs=waveform, outputs=class_probs)

In [None]:
from tensorflow.keras.optimizers import SGD, schedules

# Build the model with one softmax output per class.
n_classes = N_CLASSES
model = ConvModel(n_classes)

# Exponentially decaying learning rate: starts at 0.05 and is multiplied by
# 0.96 every 1000 steps, applied continuously (staircase=False).
lr_schedule = schedules.ExponentialDecay(
    initial_learning_rate=0.05, decay_steps=1000, decay_rate=0.96, staircase=False
)
# NOTE(review): `sgd` (and therefore `lr_schedule`) is constructed but never
# passed to compile(); the model below is trained with the default-configured
# 'adam' optimizer. Confirm which optimizer is intended.
sgd = SGD(learning_rate=lr_schedule, momentum=0.85)
# Sparse loss/metric: targets are integer class ids, not one-hot vectors.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', 
              metrics=['sparse_categorical_accuracy'])

model.summary()


In [None]:
# Batches per epoch: rows in the training CSV divided by BATCH_SIZE (floored).
# The training dataset repeats forever, so this number is what ends an epoch.
# NOTE(review): assumes one training example per CSV row and that the dataset
# is batched with BATCH_SIZE -- confirm both.
steps_per_epoch = len(train)//BATCH_SIZE
steps_per_epoch

In [None]:
# Checkpoint path is relative to the current working directory.
# NOTE(review): presumably this lands on the Colab VM disk rather than on
# Google Drive -- confirm the checkpoints are persisted where intended.
checkpoint_filepath = 'Model-4'
# Save weights only, and only when the monitored quantity improves
# (`monitor` is not set, so the Keras default is used).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    save_best_only=True,
    verbose=2)

# Both datasets repeat indefinitely, so the step counts bound each epoch.
# validation_steps=2 means only two batches of test_final are evaluated per
# epoch, which is also what the "best" checkpoint decision is based on.
model.fit(train_final, 
          epochs=50, 
          steps_per_epoch=steps_per_epoch, 
          validation_data=test_final, 
          validation_steps=2, 
         callbacks=[model_checkpoint_callback])