In [1]:
import os
import pathlib
import glob
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
import zipfile
import soundfile as sf
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
import pathlib
import sys
# import tensorflow_io as tfio  
import pickle



# from shared_vars import *

# Shared module-level state. Everything starts out unset (None) and is
# rebound later by the functions below through `global` statements.

# Inputs handed in by the caller; assigned in `run()`.
label_names = example_labels = example_audio = None
train_ds = val_ds = test_ds = None
example_filenames = file_list = None

# The example most recently processed by `play()`; read by `small_plot()`.
waveform = spectrogram = label = None

# Spectrogram datasets and one example batch; assigned in `plot()`.
train_spectrogram_ds = val_spectrogram_ds = test_spectrogram_ds = None
example_spectrograms = example_spect_labels = None



## Convert waveforms to spectrograms

The waveforms in the dataset are represented in the time domain. Next, you'll transform the waveforms from the time-domain signals into the time-frequency-domain signals by computing the [short-time Fourier transform (STFT)](https://en.wikipedia.org/wiki/Short-time_Fourier_transform) to convert the waveforms to [spectrograms](https://en.wikipedia.org/wiki/Spectrogram), which show frequency changes over time and can be represented as 2D images. You will feed the spectrogram images into your neural network to train the model.

A Fourier transform (`tf.signal.fft`) converts a signal to its component frequencies, but loses all time information. In comparison, STFT (`tf.signal.stft`) splits the signal into windows of time and runs a Fourier transform on each window, preserving some time information, and returning a 2D tensor that you can run standard convolutions on.

Create a utility function for converting waveforms to spectrograms:

- The waveforms need to be of the same length, so that when you convert them to spectrograms, the results have similar dimensions. This can be done by simply zero-padding the audio clips that are shorter than one second (using `tf.zeros`).
- When calling `tf.signal.stft`, choose the `frame_length` and `frame_step` parameters such that the generated spectrogram "image" is almost square. For more information on the choice of STFT parameters, refer to [this Coursera video](https://www.coursera.org/lecture/audio-signal-processing/stft-2-tjEQe) on audio signal processing and STFT.
- The STFT produces an array of complex numbers representing magnitude and phase. However, in this tutorial you'll only use the magnitude, which you can derive by applying `tf.abs` on the output of `tf.signal.stft`.


In [2]:
def get_spectrogram(waveform):
  """Turn a waveform into a magnitude spectrogram with a channel axis.

  Args:
    waveform: Tensor of audio samples; it is passed unchanged to
      `tf.signal.stft`.

  Returns:
    The absolute value of the STFT (frame_length=255, frame_step=128) with
    one extra trailing axis of size 1 appended.
  """
  # Short-time Fourier transform: complex-valued time/frequency representation.
  stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
  # Only the magnitude is kept; the phase is discarded.
  magnitude = tf.abs(stft)
  # The trailing `channels` axis makes the result image-like, matching what
  # convolution layers expect: (`batch_size`, `height`, `width`, `channels`).
  return magnitude[..., tf.newaxis]

Next, start exploring the data. Print the shapes of one example's tensorized waveform and the corresponding spectrogram, and play the original audio:


In [3]:


def play():
  """Print shapes of, and play back, the first three example clips.

  Reads the module-level `label_names`, `example_labels` and `example_audio`.
  On every iteration it rebinds the module-level `label`, `waveform` and
  `spectrogram`, so after the call they describe the third example.
  Audio is rendered through `IPython.display` at a rate of 16000.
  """
  global waveform, label, spectrogram

  for idx in range(3):
    label = label_names[example_labels[idx]]
    waveform = example_audio[idx]
    spectrogram = get_spectrogram(waveform)

    # Short textual summary of the current example.
    summary = (
        ('Label:', label),
        ('Waveform shape:', waveform.shape),
        ('Spectrogram shape:', spectrogram.shape),
    )
    for caption, value in summary:
      print(caption, value)

    print('Audio playback')
    player = display.Audio(waveform, rate=16000)
    display.display(player)

Now, define a function for displaying a spectrogram:


In [4]:
def plot_spectrogram(spectrogram, ax):
  """Draw the natural log of a spectrogram onto `ax` with `pcolormesh`.

  Args:
    spectrogram: Array with a `.shape`; either 2-D, or 3-D with a trailing
      axis of size 1 that is squeezed away. Higher ranks fail the assertion.
    ax: Object providing `pcolormesh(X, Y, C)`, e.g. a matplotlib Axes.
  """
  rank = len(spectrogram.shape)
  if rank > 2:
    assert rank == 3
    spectrogram = np.squeeze(spectrogram, axis=-1)

  # A tiny offset keeps the logarithm finite where a value is exactly zero.
  epsilon = np.finfo(float).eps
  # Transposing puts the input's first axis along x (columns) and its second
  # axis along y (rows).
  log_spec = np.log(spectrogram.T + epsilon)

  height, width = log_spec.shape[0], log_spec.shape[1]
  # x coordinates run from 0 to the total element count, one per column.
  x_coords = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
  y_coords = range(height)
  ax.pcolormesh(x_coords, y_coords, log_spec)

Plot the example's waveform over time and the corresponding spectrogram (frequencies over time):

In [5]:
def small_plot():
    """Show the current example's waveform above its spectrogram.

    Reads the module-level `waveform`, `spectrogram` and `label` (the values
    `play()` leaves behind). The waveform panel's x-axis is fixed to
    [0, 16000]; the figure title is `label.title()`.
    """
    _, (wave_ax, spec_ax) = plt.subplots(2, figsize=(12, 8))

    # Top panel: raw samples against their index.
    sample_index = np.arange(waveform.shape[0])
    wave_ax.plot(sample_index, waveform.numpy())
    wave_ax.set_title('Waveform')
    wave_ax.set_xlim([0, 16000])

    # Bottom panel: log-scaled spectrogram.
    plot_spectrogram(spectrogram.numpy(), spec_ax)
    spec_ax.set_title('Spectrogram')

    plt.suptitle(label.title())
    plt.show()

Now, create spectrogram datasets from the audio datasets:


In [6]:
# def make_spec_ds(ds):
#   return ds.map(
#       map_func=lambda audio,label: (get_spectrogram(audio), label),
#       num_parallel_calls=tf.data.AUTOTUNE)

In [7]:
# Named replacement for the former lambda used as the dataset map function.
def map_audio_label(audio, label):
    """Map an (audio, label) pair to a (spectrogram, label) pair.

    The audio is converted with `get_spectrogram`; the label is passed
    through untouched.
    """
    converted = get_spectrogram(audio)
    return converted, label

# NOTE(review): the decorator presumably keeps AutoGraph from converting the
# mapped function while `ds.map` runs inside this call; confirm before removing.
@tf.autograph.experimental.do_not_convert
def make_spec_ds(ds):
    """Return `ds` with every (audio, label) element mapped to (spectrogram, label).

    Args:
        ds: Object exposing a `tf.data.Dataset`-style `map` method.

    Returns:
        The result of `ds.map` using the regular function `map_audio_label`,
        with the parallelism level left to `tf.data.AUTOTUNE`.
    """
    parallelism = tf.data.AUTOTUNE
    return ds.map(map_audio_label, num_parallel_calls=parallelism)

In [8]:
# train_spectrogram_ds = make_spec_ds(train_ds)
# val_spectrogram_ds = make_spec_ds(val_ds)
# test_spectrogram_ds = make_spec_ds(test_ds)

Examine the spectrograms for different examples of the dataset:

In [9]:
# for example_spectrograms, example_spect_labels in train_spectrogram_ds.take(1):
#   break

In [10]:
def plot():
    """Build the spectrogram datasets and plot a grid of example spectrograms.

    Side effects:
        Rebinds the module-level `train_spectrogram_ds`, `val_spectrogram_ds`
        and `test_spectrogram_ds` (built from `train_ds`, `val_ds` and
        `test_ds` via `make_spec_ds`), and `example_spectrograms` /
        `example_spect_labels` (the first batch of the training spectrogram
        dataset). Then shows a 4x2 figure of spectrograms from that batch.

    Each subplot title carries the label of the spectrogram shown in it,
    looked up in `label_names`, followed by `example_filenames[i]`.
    """
    global train_spectrogram_ds
    global val_spectrogram_ds
    global test_spectrogram_ds
    global example_spectrograms
    global example_spect_labels
    train_spectrogram_ds = make_spec_ds(train_ds)
    val_spectrogram_ds = make_spec_ds(val_ds)
    test_spectrogram_ds = make_spec_ds(test_ds)

    # Keep the first training batch around as the example batch.
    for example_spectrograms, example_spect_labels in train_spectrogram_ds.take(1):
        break

    rows = 4
    cols = 2
    fig, axes = plt.subplots(rows, cols, figsize=(16, 9))

    # Never index past the end of the batch if it holds fewer examples than
    # there are grid cells.
    n = min(rows * cols, int(example_spectrograms.shape[0]))

    for i in range(n):
        r = i // cols
        c = i % cols
        ax = axes[r][c]
        plot_spectrogram(example_spectrograms[i].numpy(), ax)
        # Use the label that belongs to THIS spectrogram. The module-level
        # `label` only holds whatever `play()` processed last.
        spect_label = label_names[example_spect_labels[i].numpy()]
        ax.set_title(f"{i}_Label: {spect_label} __ {example_filenames[i]}")
        # No plt.yticks / plt.ylim here: those pyplot calls act on the
        # current axes (the last subplot) and would clip that spectrogram to
        # a waveform amplitude range of [-1.1, 1.1].

    # Hide grid cells that received no spectrogram.
    for unused_ax in axes.flat[n:]:
        unused_ax.set_axis_off()

    plt.tight_layout()
    plt.show()

In [11]:
def run(input_label_names, input_example_labels, input_example_audio, input_train_ds, input_val_ds, input_test_ds, input_example_filenames, input_file_list):
    """Store the inputs as module globals, then run `play`, `small_plot` and `plot`.

    Each `input_*` argument is assigned to the module-level variable of the
    same name without the `input_` prefix.

    Returns:
        A 14-tuple: (label_names, example_labels, example_audio, train_ds,
        val_ds, test_ds, example_filenames, waveform, file_list,
        train_spectrogram_ds, val_spectrogram_ds, test_spectrogram_ds,
        example_spectrograms, example_spect_labels), read from the module
        globals after the three steps have run.
    """
    # Gives write access to the module-level variables.
    global label_names, example_labels, example_audio
    global train_ds, val_ds, test_ds
    global example_filenames, file_list

    label_names, example_labels, example_audio = (
        input_label_names, input_example_labels, input_example_audio)
    train_ds, val_ds, test_ds = input_train_ds, input_val_ds, input_test_ds
    example_filenames, file_list = input_example_filenames, input_file_list

    print("Loaded label_names (before play):", label_names)

    for step in (play, small_plot, plot):
        step()

    print("Loaded label_names (after play):", label_names)

    results = (
        label_names, example_labels, example_audio,
        train_ds, val_ds, test_ds,
        example_filenames, waveform, file_list,
        train_spectrogram_ds, val_spectrogram_ds, test_spectrogram_ds,
        example_spectrograms, example_spect_labels,
    )
    return results

# Run the notebook individually

In [12]:
# TRAIN_DIR = pathlib.Path('data/train_files')
# TEST_DIR = pathlib.Path('data/test_files')
# DATA_DIR = pathlib.Path('data')


# file_list = tf.data.Dataset.list_files(str(TRAIN_DIR / '**/*.wav'), shuffle=False)

# seconds=20
# train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
#     directory=TRAIN_DIR,
#     batch_size=64,
#     validation_split=0.2,
#     seed=0,
#     output_sequence_length=16000*seconds,
#     subset='both'
#     )

# label_names = np.array(train_ds.class_names)
# print()
# print("label names:", label_names)
# play()
# plot()