In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
# import tensorflow_io as tfio  
import matplotlib.pyplot as plt
import time
import io
import sys


# Pipeline state shared by the functions in this notebook. Every name starts
# out as None and is filled in later:
#   * the three dataset splits and `label_names` are assigned by `run`
#     (the datasets are then rewrapped by `init`),
#   * `example_spectrograms` is assigned by `run` from the first training batch.
train_spectrogram_ds = val_spectrogram_ds = test_spectrogram_ds = None
example_spectrograms = label_names = None

# Training artefacts: `model` is assigned by `build_model`, `history` by `train`.
model = history = None

Add `Dataset.cache` and `Dataset.prefetch` operations to reduce read latency while training the model. The training set is additionally shuffled (buffer size 10,000) after caching:


In [2]:
def init():
    """Wrap the module-level spectrogram datasets with caching and prefetching.

    Each of the three global datasets is replaced by a pipeline that caches
    its elements and prefetches with ``tf.data.AUTOTUNE``. The training
    pipeline additionally shuffles with a buffer of 10000 elements between
    the cache and the prefetch step.
    """
    global train_spectrogram_ds, val_spectrogram_ds, test_spectrogram_ds

    # Training split: cache -> shuffle -> prefetch.
    cached_train = train_spectrogram_ds.cache()
    shuffled_train = cached_train.shuffle(10000)
    train_spectrogram_ds = shuffled_train.prefetch(tf.data.AUTOTUNE)

    # Validation and test splits: cache -> prefetch (no shuffling).
    cached_val = val_spectrogram_ds.cache()
    val_spectrogram_ds = cached_val.prefetch(tf.data.AUTOTUNE)

    cached_test = test_spectrogram_ds.cache()
    test_spectrogram_ds = cached_test.prefetch(tf.data.AUTOTUNE)

For the model, you'll use a simple convolutional neural network (CNN), since you have transformed the audio files into spectrogram images.

Your `tf.keras.Sequential` model will use the following Keras preprocessing layers:

- `tf.keras.layers.Resizing`: to downsample the input to enable the model to train faster.
- `tf.keras.layers.Normalization`: to normalize each pixel in the image based on its mean and standard deviation.

For the `Normalization` layer, its `adapt` method would first need to be called on the training data in order to compute aggregate statistics (that is, the mean and the standard deviation).


In [None]:
@tf.autograph.experimental.do_not_convert
def build_model():
    """Build the spectrogram CNN and store it in the module-level ``model``.

    Reads the module-level ``example_spectrograms`` (for the input shape),
    ``label_names`` (for the number of output units) and
    ``train_spectrogram_ds`` (to adapt the normalization layer). Prints the
    input shape, the number of labels and the model summary. Nothing is
    returned; the model is only published through the ``model`` global.
    """
    global model

    # Drop the leading (batch) axis of the example batch to get the per-sample shape.
    spectrogram_shape = example_spectrograms.shape[1:]
    print('Input shape:', spectrogram_shape)
    class_count = len(label_names)
    print(f"num_labels: {class_count}")

    # The Normalization layer learns its statistics from the training
    # spectrograms only, so the labels are stripped before calling `adapt`.
    normalizer = layers.Normalization()
    spectrograms_only = train_spectrogram_ds.map(map_func=lambda spec, label: spec)
    normalizer.adapt(data=spectrograms_only)

    # Alternatives kept out of the active stack: conv filter pairs (32, 64)
    # and (8, 16), a Dropout(0.25) after pooling, and a logits output
    # (Dense without softmax).
    conv_filters = (16, 32)
    layer_stack = [
        layers.Input(shape=spectrogram_shape),
        # Downsample the input.
        layers.Resizing(32, 32),
        # Normalize.
        normalizer,
        *(layers.Conv2D(filters, 3, activation='relu') for filters in conv_filters),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        # Softmax output: the model emits class probabilities, not logits.
        layers.Dense(class_count, activation='softmax'),
    ]
    model = models.Sequential(layer_stack)

    model.summary()

In [None]:
def build_dual_input_model(waveform_length=None):
    """Build an uncompiled two-input classifier (spectrogram + raw waveform).

    The spectrogram branch resizes its input to 32x32 and applies two 2-D
    convolutions followed by max pooling. The waveform branch reshapes its
    input to ``(waveform_length, 1)`` and applies two 1-D convolutions
    followed by max pooling. Both branches are flattened, concatenated and
    passed through a 128-unit dense layer, dropout (0.5) and a softmax layer
    with one unit per entry of the module-level ``label_names``.

    Args:
        waveform_length: Number of samples in each waveform input. When left
            as ``None``, a module-level ``waveform_length`` is used if one is
            defined.

    Returns:
        The Keras model taking ``[spectrogram_input, waveform_input]``. The
        model is only returned; the module-level ``model`` is not modified.

    Raises:
        RuntimeError: If ``example_spectrograms`` or ``label_names`` has not
            been set yet.
        ValueError: If no waveform length is available.
    """
    if example_spectrograms is None or label_names is None:
        raise RuntimeError(
            "example_spectrograms and label_names must be set before "
            "building the dual-input model"
        )

    # Fall back to a module-level `waveform_length` so that notebooks which
    # define it globally keep working without passing the argument.
    if waveform_length is None:
        waveform_length = globals().get("waveform_length")
    if waveform_length is None:
        raise ValueError(
            "waveform_length is required: pass it as an argument or define "
            "a module-level `waveform_length`"
        )

    # Same derivation as in `build_model`: one output unit per label.
    num_labels = len(label_names)
    input_shape_spectrogram = example_spectrograms.shape[1:]
    input_shape_waveform = (waveform_length,)

    # Spectrogram branch
    input_spectrogram = layers.Input(shape=input_shape_spectrogram, name="spectrogram_input")
    x_spec = layers.Resizing(32, 32)(input_spectrogram)
    x_spec = layers.Conv2D(16, 3, activation='relu')(x_spec)
    x_spec = layers.Conv2D(32, 3, activation='relu')(x_spec)
    x_spec = layers.MaxPooling2D()(x_spec)
    x_spec = layers.Flatten()(x_spec)

    # Waveform branch: add a trailing channel axis so Conv1D can consume it.
    input_waveform = layers.Input(shape=input_shape_waveform, name="waveform_input")
    x_wave = layers.Reshape((input_shape_waveform[0], 1))(input_waveform)
    x_wave = layers.Conv1D(16, 3, activation='relu')(x_wave)
    x_wave = layers.Conv1D(32, 3, activation='relu')(x_wave)
    x_wave = layers.MaxPooling1D()(x_wave)
    x_wave = layers.Flatten()(x_wave)

    # Combine the features of both branches
    combined = layers.concatenate([x_spec, x_wave])
    x = layers.Dense(128, activation='relu')(combined)
    x = layers.Dropout(0.5)(x)
    output = layers.Dense(num_labels, activation='softmax')(x)

    dual_model = models.Model(inputs=[input_spectrogram, input_waveform], outputs=output)
    dual_model.summary()

    return dual_model

Configure the Keras model with the Adam optimizer and the cross-entropy loss:


In [4]:
def compile():
    """Compile the module-level ``model`` for sparse-label classification.

    Uses the Adam optimizer with a learning rate of 1e-3, sparse categorical
    cross-entropy as the loss and ``accuracy`` as the only metric.
    """
    adam = tf.keras.optimizers.Adam(learning_rate=1e-3)

    # Important: from_logits=False, i.e. the loss is fed probabilities.
    # `build_model` ends in a softmax layer, so its outputs are not logits.
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

    model.compile(optimizer=adam, loss=cross_entropy, metrics=['accuracy'])

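Train the model for up to 5000 epochs per run, with early stopping on the validation loss (patience 200). Training is repeated for up to 5 runs until the validation loss is below 0.4 and the validation accuracy is above 0.8: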


In [None]:
def train(epochs=5000, max_runs=5, max_val_loss=0.4, min_val_accuracy=0.8, patience=200):
    """Fit the module-level ``model`` until it meets the validation targets.

    The function first counts and prints the number of training and
    validation elements (after ``unbatch``). It then calls ``model.fit`` up
    to ``max_runs`` times. After each run the model is evaluated on the
    validation set; the loop stops as soon as the validation loss is below
    ``max_val_loss`` and the validation accuracy is above
    ``min_val_accuracy``.

    Every run calls ``fit`` on the same ``model`` object: nothing in this
    function rebuilds or re-initialises the model between runs, so a
    "restart" continues from the weights the previous run left behind.

    Args:
        epochs: Maximum number of epochs per run.
        max_runs: Maximum number of ``fit`` runs.
        max_val_loss: A run succeeds only if ``val_loss`` is strictly below
            this value.
        min_val_accuracy: A run succeeds only if ``val_accuracy`` is strictly
            above this value.
        patience: ``patience`` passed to the ``EarlyStopping`` callback,
            which monitors ``val_loss`` with ``restore_best_weights=True``.

    Side effects:
        Sets the module-level ``history`` to the value returned by the most
        recent ``model.fit`` call.

    Raises:
        RuntimeError: If the module-level ``model`` has not been built yet.
    """
    global history

    if model is None:
        raise RuntimeError("model is not built; call build_model() and compile() first")

    num_train_files = sum(1 for _ in train_spectrogram_ds.unbatch())
    num_val_files = sum(1 for _ in val_spectrogram_ds.unbatch())

    print(f"Number of training files: {num_train_files}")
    print(f"Number of validation files: {num_val_files}")

    class TimeHistory(tf.keras.callbacks.Callback):
        """Prints an estimate of the remaining training time after each epoch."""

        def on_train_begin(self, logs=None):
            # Reset per run so the average only reflects the current fit call.
            self.epoch_times = []

        def on_epoch_begin(self, epoch, logs=None):
            self.epoch_start_time = time.time()

        def on_epoch_end(self, epoch, logs=None):
            self.epoch_times.append(time.time() - self.epoch_start_time)

            # ETA = mean epoch duration so far * epochs still to run.
            avg_epoch_time = sum(self.epoch_times) / len(self.epoch_times)
            remaining_epochs = self.params['epochs'] - (epoch + 1)
            estimated_remaining_time = remaining_epochs * avg_epoch_time

            hours, rem = divmod(estimated_remaining_time, 3600)
            minutes, seconds = divmod(rem, 60)

            print(f"\nEpoch {epoch + 1}/{self.params['epochs']} - Estimated time until finished: "
                f"{int(hours)} hours, {int(minutes)} minutes, {int(seconds)} seconds")

    time_callback = TimeHistory()
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=patience, restore_best_weights=True
    )
    # Note: the previous ReduceLROnPlateau callback (monitor='val_loss',
    # factor=0.2, patience=10, min_lr=1e-6) was created but never passed to
    # fit(), so it has been dropped here.

    for run_index in range(max_runs):
        run_number = run_index + 1
        print(f"Start of run {run_number}/{max_runs}")
        history = model.fit(
            train_spectrogram_ds,
            validation_data=val_spectrogram_ds,
            epochs=epochs,
            callbacks=[time_callback, early_stopping],
            verbose=1,
        )

        val_loss, val_accuracy = model.evaluate(val_spectrogram_ds, verbose=0)

        print(f"Evaluated model with best weights: val_loss={val_loss}, val_accuracy={val_accuracy}")

        if val_loss < max_val_loss and val_accuracy > min_val_accuracy:
            print(f"Run {run_number} successful with val_loss={val_loss} and val_accuracy={val_accuracy}")
            break

        print(f"Run {run_number} not successful. val_loss={val_loss}, val_accuracy={val_accuracy} - Restarting training...")
    else:
        # Reached only when no run satisfied both thresholds.
        print("Maximum number of runs reached. Best model from the last run will be used.")

In [None]:
def run(_train_spectrogram_ds,_val_spectrogram_ds,_test_spectrogram_ds, _label_names):
    """Run the whole pipeline: publish inputs, build, compile and train.

    The arguments are stored in the corresponding module-level globals, the
    spectrograms of the first training batch are stored in
    ``example_spectrograms``, and then ``init``, ``build_model``, ``compile``
    and ``train`` are called in that order.

    Args:
        _train_spectrogram_ds: Training dataset yielding
            ``(spectrograms, labels)`` pairs.
        _val_spectrogram_ds: Validation dataset.
        _test_spectrogram_ds: Test dataset.
        _label_names: Sequence of label names.

    Returns:
        A tuple ``(train_ds, val_ds, test_ds, model, history)`` holding the
        module-level values after the pipeline ran.

    Raises:
        ValueError: If the training dataset yields no element. This is
            checked before any global is modified, so a failed call leaves
            the previous state untouched.
    """
    global train_spectrogram_ds,val_spectrogram_ds,test_spectrogram_ds, example_spectrograms, label_names, model, history

    # Fetch the first batch explicitly. A `for ... break` loop over an empty
    # dataset would silently keep a stale (or None) `example_spectrograms`.
    first_batch = next(iter(_train_spectrogram_ds.take(1)), None)
    if first_batch is None:
        raise ValueError("training dataset is empty; cannot determine the input shape")

    train_spectrogram_ds = _train_spectrogram_ds
    val_spectrogram_ds = _val_spectrogram_ds
    test_spectrogram_ds = _test_spectrogram_ds
    label_names = _label_names
    example_spectrograms, _ = first_batch

    init()
    build_model()
    compile()
    train()

    return train_spectrogram_ds,val_spectrogram_ds,test_spectrogram_ds, model, history