In [None]:
import tensorflow as tf
import numpy as np
from scipy.io import wavfile
from scipy.signal import spectrogram, stft, istft
import matplotlib.pyplot as plt
import librosa
import librosa.display
from misceallaneous import getWavFileAsNpArray, displaySpectrogram
from IPython.display import Audio

# Sampling rate in Hz; passed as `fs` to stft/istft and as `rate` to the audio player.
# NOTE(review): presumably matches the rate of the WAV files loaded below — confirm.
samplerate = 12_000

In [None]:
# Load the reference recording and its degraded counterpart.
# NOTE(review): `vinyl` is read from the "white" folder — presumably the same
# take with added noise; confirm against the dataset layout.
clean, vinyl = (
    getWavFileAsNpArray(wav_path)
    for wav_path in ("../dataset_2/clean/p1.wav", "../dataset_2/white/p1.wav")
)

In [None]:
# Short-time Fourier transform of both recordings with identical settings.
nperseg = 1024  # samples per STFT window
stft_settings = {"fs": samplerate, "nperseg": nperseg}

# scipy's stft returns (frequencies, segment times, complex spectrogram).
c, t, Cxx = stft(np.array(clean), **stft_settings)
d, u, Vxx = stft(np.array(vinyl), **stft_settings)

In [None]:
# Visual sanity check: the same 32-frame window (frames 32..63) of the clean
# spectrogram first, then of the degraded one.
for spec in (Cxx, Vxx):
    displaySpectrogram(spec[:, 32:64])
    plt.show()

In [None]:
# Drop the first frequency row (the DC bin) of each spectrogram. With
# nperseg = 1024 this leaves 512 rows, which the three (2, 1) max-pools in the
# model can halve cleanly.
#
# The trim is guarded so that re-running this cell does not strip one more row
# each time: a row is removed only while the array still has the full
# one-sided STFT height of nperseg // 2 + 1 rows.
full_bins = nperseg // 2 + 1
if Cxx.shape[0] == full_bins:
    Cxx = Cxx[1:, :]
if Vxx.shape[0] == full_bins:
    Vxx = Vxx[1:, :]
print(Cxx.shape, Vxx.shape)

In [None]:
# Empty containers for the fixed-width spectrogram segments.
Cxx_split, Vxx_split = [], []

In [None]:
# Cut both spectrograms into consecutive, non-overlapping patches of
# `samples_length` frames, each shaped (freq_bins, samples_length, 1).
samples_length = 64

# Only use frames that exist in BOTH spectrograms so every clean patch has a
# degraded partner of the same width.
n_frames = min(Cxx.shape[1], Vxx.shape[1])

# "+ 1" keeps the final patch when n_frames is an exact multiple of
# samples_length (the previous bound silently dropped it).
segment_starts = range(0, n_frames - samples_length + 1, samples_length)
if len(segment_starts) == 0:
    raise ValueError(
        "Spectrogram has %d frames; at least %d are needed for one patch"
        % (n_frames, samples_length)
    )

# Build the stacks from scratch (no .append on a pre-existing list), so this
# cell can be re-run safely after Cxx_split / Vxx_split have become ndarrays.
Cxx_split = np.array(
    [Cxx[:, start:start + samples_length, np.newaxis] for start in segment_starts]
)
Vxx_split = np.array(
    [Vxx[:, start:start + samples_length, np.newaxis] for start in segment_starts]
)

In [None]:
# Expected layout: (n_patches, freq_bins, samples_length, 1).
print(np.shape(Cxx_split))

In [None]:
# Split the complex patches into the parts used later on.
# Note: despite the "_r" suffix these arrays hold magnitudes (np.abs), not real
# parts; the "_i" arrays hold the imaginary parts.
Cxx_r, Cxx_i = np.abs(Cxx_split), np.imag(Cxx_split)
Vxx_r, Vxx_i = np.abs(Vxx_split), np.imag(Vxx_split)
print(np.shape(Cxx_r))

In [None]:
# Convolution kernel sizes read by simple_autoencoder:
# ker1 for the encoder convolutions, ker2 for the decoder convolutions.
ker1, ker2 = (16, 4), (1, 4)
def simple_autoencoder(input_shape, V_shape_1):
    """Build and compile a small convolutional autoencoder for spectrogram patches.

    Encoder: four single-filter convolutions with the module-level kernel
    ``ker1`` and three (2, 1) max-pools, so the first axis is halved three
    times while the second axis keeps its size. Bottleneck: two dense layers.
    Decoder: four single-filter convolutions with the module-level kernel
    ``ker2`` and three (2, 1) up-samplings.

    The decoded patch is always ``(8 * V_shape_1, V_shape_1, 1)``, independent
    of ``input_shape``; it matches the input only when
    ``input_shape == (8 * V_shape_1, V_shape_1, 1)``.

    Args:
        input_shape: ``(rows, cols, channels)`` of one input patch.
        V_shape_1: Side length used for the dense/reshape bottleneck.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, MSE loss).
    """
    print(input_shape)
    layers = tf.keras.layers
    decoded_units = V_shape_1 * V_shape_1
    model = tf.keras.Sequential()

    # --- Encoder ---------------------------------------------------------
    # The first convolution uses ker1 like the rest of the encoder (the
    # previous code referenced an undefined name `ker`).
    model.add(layers.Conv2D(1, kernel_size=ker1, activation='relu', input_shape=input_shape, padding='same', data_format='channels_last'))
    model.add(layers.MaxPooling2D(pool_size=(2, 1)))

    model.add(layers.Conv2D(1, ker1, activation='relu', padding='same'))

    model.add(layers.Conv2D(1, ker1, activation='relu', padding='same'))
    model.add(layers.MaxPooling2D(pool_size=(2, 1)))

    model.add(layers.Conv2D(1, ker1, activation='relu', padding='same'))
    model.add(layers.MaxPooling2D(pool_size=(2, 1)))

    # --- Bottleneck ------------------------------------------------------
    model.add(layers.Flatten())

    # Dense `units` must be an integer, hence floor division rather than "/".
    model.add(layers.Dense(decoded_units // 8, activation='relu'))
    model.add(layers.Dense(decoded_units, activation='relu'))

    model.add(layers.Reshape((V_shape_1, V_shape_1, 1)))

    # --- Decoder ---------------------------------------------------------
    model.add(layers.Conv2D(1, ker2, activation='relu', padding='same'))
    model.add(layers.UpSampling2D((2, 1)))

    model.add(layers.Conv2D(1, ker2, activation='relu', padding='same'))

    model.add(layers.Conv2D(1, ker2, activation='relu', padding='same'))
    model.add(layers.UpSampling2D((2, 1)))

    model.add(layers.Conv2D(1, ker2, activation='relu', padding='same'))
    model.add(layers.UpSampling2D((2, 1)))

    model.summary()
    # This is a regression on magnitudes, so classification accuracy carries
    # no information; mean absolute error is reported instead.
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model
# One patch is (freq_bins, frames, 1); the bottleneck side is the frame count.
model = simple_autoencoder(Cxx_r.shape[1:3] + (1,), Cxx_r.shape[2])

In [None]:
# Train one epoch at a time and, after each epoch, show what the model makes
# of a single fixed patch so progress can be followed visually.
sample_number = 0

# Reference view: the degraded input patch as a 2-D (freq, frames) array.
inp = Vxx_r[sample_number, :, :, 0]
displaySpectrogram(inp)
plt.show()

for epoch in range(10):
    model.fit(Vxx_r, Cxx_r, batch_size=4, epochs=1)

    # Run only the previewed patch through the network (a batch of one)
    # instead of pushing the whole dataset through a backend function.
    preview_batch = Vxx_r[sample_number][np.newaxis]
    out = model.predict(preview_batch, verbose=0)[0, :, :, 0]

    displaySpectrogram(out)
    plt.show()

In [None]:
# Upper bound used when slicing arrays in the prediction, plotting and
# resynthesis cells that follow.
predict_length = 1_000

In [None]:
output = model.predict(Vxx_r.T[:predict_length])
output.shape

In [None]:
displaySpectrogram(Vxx_r[:, :predict_length])
plt.show()
displaySpectrogram(output.T[:, :predict_length])
plt.show()
displaySpectrogram(Cxx_r[:, :predict_length])
plt.show()

In [None]:
cleaned_output = istft(output.T + Vxx_i[:, :predict_length]*1j, fs=samplerate, nperseg=nperseg)[1]
clean_output = istft(Vxx_r[:, :predict_length] + Vxx_i[:, :predict_length]*1j, fs=samplerate, nperseg=nperseg)[1]
Audio(clean_output, rate=samplerate)

In [None]:
Audio(cleaned_output, rate=samplerate)

It seems that autoencoder outputs are always blurry; therefore, the audio output cannot be 