In [None]:
import matplotlib.pyplot as plt 
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
import os
import librosa
# import tensorflow_io as tfio  


# Shared notebook state. The functions below read these module-level names
# and rebind them via `global`; each stays `None` until it is populated.

# Inputs handed to `run()` and the first batch taken from `train_ds`.
label_names = None
example_labels = None
example_audio = None
train_ds = None
val_ds = None
test_ds = None

# Last example visited by `play()`.
waveform = None
spectrogram = None
label = None

# Spectrogram datasets produced by `build_ds()`.
train_spectrogram_ds = None
val_spectrogram_ds = None
test_spectrogram_ds = None

# First batch of `train_spectrogram_ds`, stored by `build_ds()` and read by
# `plot()`. Declared here like the other globals so that all shared state is
# visible in one place.
example_spectrograms = None
example_spect_labels = None



## Convert waveforms to spectrograms

The waveforms in the dataset are represented in the time domain. Next, you'll transform the waveforms from time-domain signals into time-frequency-domain signals by computing the [short-time Fourier transform (STFT)](https://en.wikipedia.org/wiki/Short-time_Fourier_transform), which converts the waveforms into [spectrograms](https://en.wikipedia.org/wiki/Spectrogram) that show frequency changes over time and can be represented as 2D images. You will feed the spectrogram images into your neural network to train the model.

A Fourier transform (`tf.signal.fft`) converts a signal to its component frequencies, but loses all time information. In comparison, STFT (`tf.signal.stft`) splits the signal into windows of time and runs a Fourier transform on each window, preserving some time information, and returning a 2D tensor that you can run standard convolutions on.

Create a utility function for converting waveforms to spectrograms:

- The waveforms need to be of the same length, so that when you convert them to spectrograms, the results have similar dimensions. This can be done by simply zero-padding the audio clips that are shorter than one second (using `tf.zeros`).
- When calling `tf.signal.stft`, choose the `frame_length` and `frame_step` parameters such that the generated spectrogram "image" is almost square. For more information on the STFT parameters choice, refer to [this Coursera video](https://www.coursera.org/lecture/audio-signal-processing/stft-2-tjEQe) on audio signal processing and STFT.
- The STFT produces an array of complex numbers representing magnitude and phase. However, in this tutorial you'll only use the magnitude, which you can derive by applying `tf.abs` on the output of `tf.signal.stft`.


In [None]:
def get_spectrogram(waveform):
    """Return the STFT magnitude of `waveform` with a trailing channel axis.

    The STFT is computed by `tf.signal.stft` with `frame_length=255` and
    `frame_step=128`; `tf.abs` keeps only the magnitude. A new last axis is
    appended so the result can be used as image-like input for convolution
    layers, which expect (`batch_size`, `height`, `width`, `channels`).
    """
    stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    magnitude = tf.abs(stft)
    return magnitude[..., tf.newaxis]

In [None]:
def get_mel_spectrogram(waveform, sr=44100):
    """Compute a mel spectrogram of `waveform` with librosa.

    Args:
        waveform: Audio samples, passed to librosa as `y`.
        sr: Sample rate of `waveform` in Hz. Defaults to 44100, the value
            that used to be hard-coded, so existing one-argument calls
            behave exactly as before.

    Returns:
        Whatever `librosa.feature.melspectrogram` returns for these inputs.
    """
    return librosa.feature.melspectrogram(y=waveform, sr=sr)

Next, start exploring the data. Print the shapes of one example's tensorized waveform and the corresponding spectrogram, and play the original audio:


In [None]:
def play(num_examples=3, sample_rate=44100):
  """Print shapes for, and play back, the first clips of the example batch.

  Reads the module-level `label_names`, `example_labels` and `example_audio`.
  For every clip visited it rebinds the module-level `label`, `waveform` and
  `spectrogram`, so after the call they refer to the last clip visited.

  Args:
    num_examples: Maximum number of clips to show. Defaults to 3.
    sample_rate: Playback rate in Hz handed to `display.Audio`.
      Defaults to 44100.
  """
  global waveform
  global label
  global spectrogram
  # Never index past the end of the batch: a batch holding fewer than
  # `num_examples` clips previously raised an out-of-range error.
  count = min(num_examples, len(example_audio))
  for i in range(count):
    label = label_names[example_labels[i]]
    waveform = example_audio[i]
    spectrogram = get_spectrogram(waveform)
    # spectrogram = get_mel_spectrogram(waveform)

    print('Label:', label)
    print('Waveform shape:', waveform.shape)
    print('Spectrogram shape:', spectrogram.shape)
    print('Audio playback')
    display.display(display.Audio(waveform, rate=sample_rate))

Now, define a function for displaying a spectrogram:


In [None]:
# def plot_spectrogram(spectrogram, ax):
#   if len(spectrogram.shape) > 2:
#     assert len(spectrogram.shape) == 3
#     spectrogram = np.squeeze(spectrogram, axis=-1)
#   # Convert the frequencies to log scale and transpose, so that the time is
#   # represented on the x-axis (columns).
#   # Add an epsilon to avoid taking a log of zero.
#   log_spec = np.log(spectrogram.T + np.finfo(float).eps)
#   height = log_spec.shape[0]
#   width = log_spec.shape[1]
#   X = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
#   Y = range(height)
#   ax.pcolormesh(X, Y, log_spec)

def plot_spectrogram(spectrogram, ax, sr=44100, duration=None, hop_length=128):
    """Draw a log-scaled spectrogram on `ax` with time and frequency axes.

    Args:
        spectrogram: Array indexed as (time frame, frequency bin). An array
            with more than two dimensions has its last axis squeezed away.
        ax: Matplotlib axes to draw on.
        sr: Sample rate in Hz; the frequency axis spans 0 .. sr / 2.
        duration: Length of the plotted audio in seconds. When None it is
            derived from the frame count, `hop_length` and `sr`.
        hop_length: Number of samples between consecutive frames. Only used
            when `duration` is None. The default of 128 matches the
            `frame_step` used by `get_spectrogram`.
    """
    if len(spectrogram.shape) > 2:
        spectrogram = np.squeeze(spectrogram, axis=-1)

    # Log-scale the values and transpose so that time runs along the x-axis.
    # The epsilon avoids taking the log of zero.
    log_spec = np.log(spectrogram.T + np.finfo(float).eps)
    height, width = log_spec.shape

    if duration is None:
        # `width` counts STFT frames, not samples: consecutive frames are
        # `hop_length` samples apart, so the covered time is
        # frames * hop_length / sr (not frames / sr).
        duration = width * hop_length / sr

    # Time axis in seconds.
    X = np.linspace(0, duration, width)

    # Frequency axis in Hz, evenly spaced up to the Nyquist frequency.
    # NOTE(review): for mel-scaled input the bins are not evenly spaced in
    # Hz, so these tick values are presumably only approximate - confirm.
    Y = np.linspace(0, sr / 2, height)

    ax.pcolormesh(X, Y, log_spec, shading='auto')
    ax.set_xlabel("Time (seconds)")
    ax.set_ylabel("Frequency (Hz)")

    # Limit the y-axis to the Nyquist frequency.
    ax.set_ylim(0, sr / 2)

    # # Optional: add a colour bar
    # cbar = plt.colorbar(ax.collections[0], ax=ax)
    # cbar.set_label('Log Power')

Plot the example's waveform over time and the corresponding spectrogram (frequencies over time):

In [None]:
def small_plot():
    """Plot the current example's waveform above its spectrogram.

    Uses the module-level `waveform`, `spectrogram` and `label`; the figure
    title is `label.title()`.
    """
    fig, (wave_ax, spec_ax) = plt.subplots(2, figsize=(12, 8))

    # Top panel: raw samples against their sample index.
    sample_index = np.arange(waveform.shape[0])
    wave_ax.plot(sample_index, waveform.numpy())
    wave_ax.set_title('Waveform')
    wave_ax.set_xlim([0, 44100])

    # Bottom panel: spectrogram of the same example.
    plot_spectrogram(spectrogram.numpy(), spec_ax)
    spec_ax.set_title('Spectrogram')

    plt.suptitle(label.title())
    plt.show()

In [None]:
def _plot_waveform(ax, samples, duration, title, ylabel="Amplitude"):
    """Draw `samples` on `ax` against a time axis running from 0 to `duration` seconds."""
    time_axis = np.linspace(0, duration, num=len(samples))
    ax.plot(time_axis, samples)
    ax.set_title(title)
    ax.set_xlim([0, duration])
    ax.set_xlabel("Time (seconds)")
    ax.set_ylabel(ylabel)


def compare_waveforms_and_spectrograms(
    waveform_original, waveform_upsampled,
    spectrogram_original, spectrogram_upsampled,
    label, sr_original=44100, sr_upsampled=44100):
    """Show two recordings, their spectrograms and the differences in a 3x2 grid.

    Layout: row 0 holds the two waveforms, row 1 the waveform difference and
    the original spectrogram, row 2 the spectrogram difference and the
    upsampled spectrogram.

    Args:
        waveform_original: 1D samples of the original recording.
        waveform_upsampled: 1D samples of the upsampled recording.
        spectrogram_original: Spectrogram of the original recording; its
            first axis ends up on the vertical axis of the plot.
        spectrogram_upsampled: Spectrogram of the upsampled recording, laid
            out like `spectrogram_original`.
        label: Figure title.
        sr_original: Sample rate of the original recording in Hz.
        sr_upsampled: Sample rate of the upsampled recording in Hz.
    """
    fig, axes = plt.subplots(3, 2, figsize=(16, 16))
    duration_original = len(waveform_original) / sr_original
    duration_upsampled = len(waveform_upsampled) / sr_upsampled

    _plot_waveform(axes[0, 0], waveform_original, duration_original,
                   'Original Waveform')
    _plot_waveform(axes[0, 1], waveform_upsampled, duration_upsampled,
                   'Upsampled Waveform')

    # Sample-by-sample difference over the common length.
    # NOTE(review): this is only meaningful when both recordings share a
    # sample rate; the time axis uses `sr_original`.
    min_length = min(len(waveform_original), len(waveform_upsampled))
    waveform_diff = waveform_original[:min_length] - waveform_upsampled[:min_length]
    # The axis end is computed directly instead of read from the last time
    # stamp, so empty input no longer raises an IndexError.
    _plot_waveform(axes[1, 0], waveform_diff, min_length / sr_original,
                   'Waveform Difference', ylabel="Amplitude Difference")

    # `plot_spectrogram` transposes its input, so swap the first two axes
    # here to keep the caller's first axis vertical. This is exactly what the
    # former `np.flipud(np.rot90(...))` did.
    plot_spectrogram(np.swapaxes(spectrogram_original, 0, 1), axes[1, 1],
                     sr=sr_original, duration=duration_original)
    axes[1, 1].set_title('Original Spectrogram')

    plot_spectrogram(np.swapaxes(spectrogram_upsampled, 0, 1), axes[2, 1],
                     sr=sr_upsampled, duration=duration_upsampled)
    axes[2, 1].set_title('Upsampled Spectrogram')

    # Difference over the overlapping region. The plot is log-scaled, so the
    # magnitude of the difference is used: a signed difference turned every
    # negative entry into NaN.
    min_shape = np.minimum(spectrogram_original.shape, spectrogram_upsampled.shape)
    spectrogram_diff = np.abs(
        spectrogram_original[:min_shape[0], :min_shape[1]]
        - spectrogram_upsampled[:min_shape[0], :min_shape[1]]
    )
    plot_spectrogram(np.swapaxes(spectrogram_diff, 0, 1), axes[2, 0],
                     sr=sr_original, duration=duration_original)
    axes[2, 0].set_title('Spectrogram Difference')

    plt.suptitle(label)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()






def load_audio_files_and_compare(original_dir, upsampled_dir, n=3, s=1):
    """Load the first `n` files of two directories pairwise and plot a comparison.

    Files are paired by their position in the sorted directory listings, so
    both directories are expected to list matching recordings in the same
    order.

    Args:
        original_dir: Directory holding the original recordings.
        upsampled_dir: Directory holding the upsampled recordings.
        n: Number of file pairs to compare.
        s: Length in seconds of the excerpt taken from the start of each
            file. May be fractional.
    """
    original_files = sorted(os.listdir(original_dir))[:n]
    upsampled_files = sorted(os.listdir(upsampled_dir))[:n]

    for orig_file, up_file in zip(original_files, upsampled_files):
        orig_path = os.path.join(original_dir, orig_file)
        up_path = os.path.join(upsampled_dir, up_file)

        # sr=None keeps each file's native sample rate.
        waveform_original, sr_original = librosa.load(orig_path, sr=None)
        waveform_upsampled, sr_upsampled = librosa.load(up_path, sr=None)

        # Each file gets its own sample count so that both excerpts cover
        # the same `s` seconds even when the sample rates differ. The int()
        # keeps fractional `s` usable as a slice bound.
        samples_original = int(s * sr_original)
        samples_upsampled = int(s * sr_upsampled)
        waveform_original_extract = waveform_original[:samples_original]
        waveform_upsampled_extract = waveform_upsampled[:samples_upsampled]

        # Mel spectrograms of the two excerpts.
        spectrogram_original = librosa.feature.melspectrogram(
            y=waveform_original_extract, sr=sr_original)
        spectrogram_upsampled = librosa.feature.melspectrogram(
            y=waveform_upsampled_extract, sr=sr_upsampled)

        label = f'Comparison: {orig_file} vs {up_file}'

        compare_waveforms_and_spectrograms(
            waveform_original_extract, waveform_upsampled_extract,
            spectrogram_original, spectrogram_upsampled,
            label, sr_original=sr_original, sr_upsampled=sr_upsampled
        )

Now, create spectrogram datasets from the audio datasets:


In [None]:
# Regular named function used in place of a lambda.
def map_audio_label(audio, label):
    """Return `(get_spectrogram(audio), label)`; the label passes through unchanged."""
    features = get_spectrogram(audio)
    # Alternative feature extractor: get_mel_spectrogram(audio)
    return features, label

@tf.autograph.experimental.do_not_convert
def make_spec_ds(ds):
    """Return `ds` with every (audio, label) element mapped through `map_audio_label`.

    `num_parallel_calls=tf.data.AUTOTUNE` is passed to `ds.map`.
    """
    spectrogram_ds = ds.map(
        map_func=map_audio_label,
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    return spectrogram_ds

In [None]:

def plot():
    """Plot up to eight spectrograms of the example batch in a 4x2 grid.

    Reads the module-level `example_spectrograms` and `example_spect_labels`
    (stored by `build_ds`). Each panel is titled with the example's index and
    label value; panels without a matching example are hidden.
    """
    rows = 4
    cols = 2
    fig, axes = plt.subplots(rows, cols, figsize=(16, 9))

    # The batch may hold fewer examples than the grid has panels.
    shown = min(rows * cols, len(example_spectrograms))

    for i in range(rows * cols):
        r, c = divmod(i, cols)
        ax = axes[r][c]

        if i >= shown:
            ax.set_visible(False)
            continue

        plot_spectrogram(example_spectrograms[i].numpy(), ax)
        ax.set_title(f"{i}_Label: {example_spect_labels[i].numpy()}")

        # No y-ticks or y-limits are set here: `plot_spectrogram` already
        # scales the axis in Hz, and the former waveform-style limits of
        # [-1.1, 1.1] cut away practically the whole spectrogram.

    plt.tight_layout()
    plt.show()

In [None]:
def build_ds():
    """Build the spectrogram datasets and remember the first training batch.

    Applies `make_spec_ds` to the module-level `train_ds`, `val_ds` and
    `test_ds`, storing the results in `train_spectrogram_ds`,
    `val_spectrogram_ds` and `test_spectrogram_ds`. The first element of the
    training spectrogram dataset is stored in `example_spectrograms` and
    `example_spect_labels`.
    """
    global train_spectrogram_ds, val_spectrogram_ds, test_spectrogram_ds
    global example_spectrograms, example_spect_labels

    train_spectrogram_ds = make_spec_ds(train_ds)
    val_spectrogram_ds = make_spec_ds(val_ds)
    test_spectrogram_ds = make_spec_ds(test_ds)

    # Keep only the first element; an empty dataset leaves the example
    # globals untouched.
    first_batch = train_spectrogram_ds.take(1)
    for batch_spectrograms, batch_labels in first_batch:
        example_spectrograms = batch_spectrograms
        example_spect_labels = batch_labels
        break
    

In [None]:
def run(input_label_names, input_train_ds, input_val_ds, input_test_ds,
        original_dir=r"I:\Uni-Git\Master\Tutorial\data\large_train_ds\orig-16-44-mono",
        upsampled_dir=r"I:\Uni-Git\Master\Tutorial\data\large_train_ds\upscale-from-mp3-128"):
    """Store the inputs as notebook state, show examples and build the spectrogram datasets.

    Args:
        input_label_names: Label names, stored in the module-level `label_names`.
        input_train_ds: Training dataset; its first element is stored in
            `example_audio` / `example_labels`.
        input_val_ds: Validation dataset.
        input_test_ds: Test dataset.
        original_dir: Directory with the original recordings for the visual
            comparison. Defaults to the previously hard-coded location.
        upsampled_dir: Directory with the upsampled recordings for the visual
            comparison. Defaults to the previously hard-coded location.

    Returns:
        The tuple `(train_spectrogram_ds, val_spectrogram_ds,
        test_spectrogram_ds)` as left behind by `build_ds`.
    """
    global label_names, example_labels, example_audio
    global train_ds, val_ds, test_ds

    label_names = input_label_names
    train_ds = input_train_ds
    val_ds = input_val_ds
    test_ds = input_test_ds

    for example_audio, example_labels in train_ds.take(1):
        break

    print("Loaded label_names (before play):", label_names)

    play()

    # The comparison is purely visual. On a machine without the recordings
    # it is skipped with a message, so the datasets are still built instead
    # of the whole run failing on a missing directory.
    if os.path.isdir(original_dir) and os.path.isdir(upsampled_dir):
        # Compare the first 3 files.
        load_audio_files_and_compare(original_dir, upsampled_dir, n=3)
    else:
        print("Skipping original/upsampled comparison, directory not found:",
              original_dir, "or", upsampled_dir)
    # load_and_compare_from_dataset()
    # small_plot()
    build_ds()
    # plot()
    print("Loaded label_names (after play):", label_names)

    return train_spectrogram_ds, val_spectrogram_ds, test_spectrogram_ds

# Run this notebook on its own

In [12]:
# TRAIN_DIR = pathlib.Path('data/train_files')
# TEST_DIR = pathlib.Path('data/test_files')
# DATA_DIR = pathlib.Path('data')


# file_list = tf.data.Dataset.list_files(str(TRAIN_DIR / '**/*.wav'), shuffle=False)

# seconds=20
# train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
#     directory=TRAIN_DIR,
#     batch_size=64,
#     validation_split=0.2,
#     seed=0,
#     output_sequence_length=16000*seconds,
#     subset='both'
#     )

# label_names = np.array(train_ds.class_names)
# print()
# print("label names:", label_names)
# play()
# plot()