# Mel Generative Adversarial Networks Discriminator
## Initialisation and dataset preparation

First, let us import the required libraries.

In [1]:
import tensorflow as tf
import numpy as np
from scipy.io import wavfile
from scipy.signal import spectrogram, stft, istft
import matplotlib.pyplot as plt
import librosa
import librosa.display
from misceallaneous import getWavFileAsNpArray, displaySpectrogram
from IPython.display import Audio

Then, let us include the dataset.

The dataset is made of two files: `clean/p1.wav`and `white/p1.wav` which are converted into arrays of `int32` and then split into segments of `samples_length`.

The goal of the CGAN here is to predict the clean sample, when fed with the white one.

In [33]:
samplerate = 12000
nperseg = 1024

clean = getWavFileAsNpArray("../dataset_2/clean/p1.wav")
white = getWavFileAsNpArray("../dataset_2/white/p1.wav")
clean = np.array(clean, dtype="float32")
white = np.array(white, dtype="float32")

clean_dataset = []
white_dataset = []

samples_length = nperseg*10

for i in range(0, clean.shape[0]-samples_length, samples_length):
    clean_dataset.append(clean[i:i+samples_length])
    white_dataset.append(white[i:i+samples_length])
clean_dataset = np.array(clean_dataset)
white_dataset = np.array(white_dataset)

In [34]:
spectrogram_clean_dataset = []
spectrogram_white_dataset = []

for sample in clean_dataset:
    spectrogram_clean_dataset.append(librosa.amplitude_to_db(np.abs(librosa.stft(sample))))
for sample in white_dataset:
    spectrogram_white_dataset.append(librosa.amplitude_to_db(np.abs(librosa.stft(sample))))

spectrogram_clean_dataset = np.array(spectrogram_clean_dataset)
spectrogram_white_dataset = np.array(spectrogram_white_dataset)

max_clean = np.max(spectrogram_clean_dataset)
min_clean = np.min(spectrogram_clean_dataset)
spectrogram_clean_dataset = (spectrogram_clean_dataset-min_clean)/max_clean

max_white = np.max(spectrogram_white_dataset)
min_white = np.min(spectrogram_white_dataset)
spectrogram_white_dataset = (spectrogram_white_dataset-min_white)/max_white

In [35]:
data_shape = (spectrogram_white_dataset.shape[1:])

In [36]:
print("Data shape:", data_shape)
print("Dataset shape:", spectrogram_white_dataset.shape)

Data shape: (1025, 21)
Dataset shape: (1065, 1025, 21)


In [37]:
print(np.max(np.abs(spectrogram_clean_dataset)), np.max(np.abs(spectrogram_white_dataset)))

1.7244133 0.8820283


### Discriminator

The discriminator here uses a layer to process the Short-Time Fourier Transform (https://en.wikipedia.org/wiki/Short-time_Fourier_transform) before reducing the problem dimension to one single boolean prediction layer.

Interestingly, adding a Dropout layer on the input seems to prevent the generator to adapt itself to the little flaws of detection (which then only produces noise unrecognized by the discriminator).

In [38]:
def discriminator(input_shape):
    inputs = tf.keras.Input(shape=(input_shape[1], input_shape[2]))
    x = tf.keras.layers.Dropout(0.3)(inputs)
    x1 = tf.keras.layers.Convolution1D(10, kernel_size=(4), activation="tanh", padding="same")(x)
    x2 = tf.keras.layers.MaxPooling1D()(x1)
    x3 = tf.keras.layers.Convolution1D(10, kernel_size=(4), activation="tanh", padding="same")(x2)
    x4 = tf.keras.layers.MaxPooling1D()(x3)
    x5 = tf.keras.layers.Convolution1D(10, kernel_size=(4), activation="tanh", padding="same")(x4)
    x6 = tf.keras.layers.MaxPooling1D()(x5)
    x7 = tf.keras.layers.Dense(1, activation="tanh")(x6)
    x8 = tf.keras.layers.Flatten()(x7)
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x8)
    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="discriminator")
    model.summary()
    model.compile(optimizer= 'adam', loss='mse', metrics=['accuracy'])
    return model
d = discriminator(spectrogram_white_dataset.shape)

Model: "discriminator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 1025, 21)]        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1025, 21)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 1025, 10)          850       
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 512, 10)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 512, 10)           410       
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 256, 10)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 256, 10)         

In [39]:
def train_on_batch(d, i, o, validation_split=0, batch_size=16, verbose=True):
    history = d.fit(i, o, batch_size=batch_size, validation_split=validation_split, verbose=verbose)
    return np.mean(history.history['accuracy'])

In [40]:
discriminator_train_size = (spectrogram_white_dataset.shape[0])
generator_train_size = spectrogram_white_dataset.shape[0]

### Pretraining the discriminator

In [41]:
d_accuracy = 0
while d_accuracy < 0.9:
    d_accuracy = train_on_batch(d, np.concatenate((spectrogram_white_dataset, spectrogram_clean_dataset)), np.concatenate((np.zeros(spectrogram_white_dataset.shape[0]), np.ones(spectrogram_clean_dataset.shape[0]))), verbose=True)

