In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display as dsp

In [None]:
# Colab only: mount Google Drive at /content/gdrive so the dataset folders and
# the saved numpy arrays / models referenced below are reachable.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
import os

# Root folder of the training set. Every entry is used below as a per-track
# folder that contains mix.wav and bass.wav.
files="/content/gdrive/MyDrive/data/musdb18_train/"

# os.listdir() returns entries in arbitrary order; sort them so the frame order
# (and therefore the arrays saved to disk) is the same on every run.
dirs = sorted(os.listdir(files))
# To experiment on a subset, slice the list, e.g. dirs = dirs[:3]
dirs

['A Classic Education - NightOwl.stem.mp4',
 'ANiMAL - Clinic A.stem.mp4',
 'ANiMAL - Easy Tiger.stem.mp4',
 'ANiMAL - Rockshow.stem.mp4',
 "Actions - Devil's Words.stem.mp4",
 'Actions - One Minute Smile.stem.mp4',
 'Actions - South Of The Water.stem.mp4',
 'Aimee Norwich - Child.stem.mp4',
 'Alexander Ross - Goodbye Bolero.stem.mp4',
 'Alexander Ross - Velvet Curtain.stem.mp4',
 'Angela Thomas Wade - Milk Cow Blues.stem.mp4',
 'Atlantis Bound - It Was My Fault For Waiting.stem.mp4',
 'Auctioneer - Our Future Faces.stem.mp4',
 'AvaLuna - Waterduct.stem.mp4',
 'BigTroubles - Phantom.stem.mp4',
 'Bill Chudziak - Children Of No-one.stem.mp4',
 'Black Bloc - If You Want Success.stem.mp4',
 'Celestial Shore - Die For Us.stem.mp4',
 'Chris Durban - Celebrate.stem.mp4',
 'Clara Berry And Wooldog - Air Traffic.stem.mp4',
 'Clara Berry And Wooldog - Stella.stem.mp4',
 'Clara Berry And Wooldog - Waltz For My Victims.stem.mp4',
 'Cnoc An Tursa - Bannockburn.stem.mp4',
 'Creepoid - OldTree.stem.m

In [None]:
# Paths (relative to `files`) of the mixture and bass recording of every track.
# Both lists follow the order of `dirs`, so entry k of each refers to the same track.
mix_dirs = [track + "/mix.wav" for track in dirs]
bass_dirs = [track + "/bass.wav" for track in dirs]

In [None]:
mix_dirs

['A Classic Education - NightOwl.stem.mp4/mix.wav',
 'ANiMAL - Clinic A.stem.mp4/mix.wav',
 'ANiMAL - Easy Tiger.stem.mp4/mix.wav',
 'ANiMAL - Rockshow.stem.mp4/mix.wav',
 "Actions - Devil's Words.stem.mp4/mix.wav",
 'Actions - One Minute Smile.stem.mp4/mix.wav',
 'Actions - South Of The Water.stem.mp4/mix.wav',
 'Aimee Norwich - Child.stem.mp4/mix.wav',
 'Alexander Ross - Goodbye Bolero.stem.mp4/mix.wav',
 'Alexander Ross - Velvet Curtain.stem.mp4/mix.wav',
 'Angela Thomas Wade - Milk Cow Blues.stem.mp4/mix.wav',
 'Atlantis Bound - It Was My Fault For Waiting.stem.mp4/mix.wav',
 'Auctioneer - Our Future Faces.stem.mp4/mix.wav',
 'AvaLuna - Waterduct.stem.mp4/mix.wav',
 'BigTroubles - Phantom.stem.mp4/mix.wav',
 'Bill Chudziak - Children Of No-one.stem.mp4/mix.wav',
 'Black Bloc - If You Want Success.stem.mp4/mix.wav',
 'Celestial Shore - Die For Us.stem.mp4/mix.wav',
 'Chris Durban - Celebrate.stem.mp4/mix.wav',
 'Clara Berry And Wooldog - Air Traffic.stem.mp4/mix.wav',
 'Clara Berry 

In [1]:
# --- Audio framing ------------------------------------------------------------
sample_rate = 8000          # Hz, target rate handed to librosa.load
frame_length = 8064         # samples per frame
hop_length_frame = 8064     # same as frame_length, i.e. frames do not overlap
min_duration = 1            # seconds; shorter files are skipped

# --- STFT ---------------------------------------------------------------------
n_fft = 255
hop_length_fft = 63
dim_square_spec = n_fft // 2 + 1   # side of the square spectrogram (128)

# --- Miscellaneous ------------------------------------------------------------
nb_samples = 50
batch_size = 20
epochs = 10
categories = ["mix.wav","bass.wav"]

In [2]:
type(categories)

list

In [None]:
def audio_to_audio_frame_stack(sound_data, frame_length, hop_length_frame):
    """Split a 1-D audio signal into fixed-length frames.

    Args:
        sound_data: 1-D numpy array of audio samples.
        frame_length: number of samples in each frame.
        hop_length_frame: number of samples between the starts of two
            consecutive frames.

    Returns:
        numpy array of shape (nb_frame, frame_length). Trailing samples that do
        not fill a complete frame are dropped. If the signal is shorter than
        ``frame_length`` an empty array of shape (0, frame_length) is returned
        (instead of raising), so the result can still be stacked with the
        frames of other signals.
    """

    sequence_sample_length = sound_data.shape[0]

    # Sliding windows: keep only the start positions that leave a full frame.
    frame_starts = range(0, sequence_sample_length - frame_length + 1, hop_length_frame)
    sound_data_list = [sound_data[start:start + frame_length] for start in frame_starts]

    if not sound_data_list:
        # np.vstack([]) would raise ValueError on a too-short signal.
        return np.empty((0, frame_length), dtype=sound_data.dtype)

    return np.vstack(sound_data_list)


def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame, min_duration):
    """Load audio files and stack their frames into a single matrix.

    Every file is read with librosa at ``sample_rate``, cut into frames of
    ``frame_length`` samples every ``hop_length_frame`` samples, and the frames
    of all files are concatenated in the order of ``list_audio_files``.

    Args:
        audio_dir: directory the file names are relative to.
        list_audio_files: iterable of file names / relative paths.
        sample_rate: sampling rate passed to ``librosa.load``.
        frame_length: number of samples per frame.
        hop_length_frame: number of samples between consecutive frame starts.
        min_duration: minimum duration in seconds; shorter files are skipped.

    Returns:
        numpy array of shape (nb_frame, frame_length).

    Raises:
        ValueError: if no file produced at least one complete frame.
    """

    list_sound_array = []

    for file in list_audio_files:
        audio_path = os.path.join(audio_dir, file)
        # open the audio file
        y, sr = librosa.load(audio_path, sr=sample_rate)
        total_duration = librosa.get_duration(y=y, sr=sr)

        if total_duration < min_duration:
            print(
                f"The following file {audio_path} is below the min duration")
            continue

        if y.shape[0] < frame_length:
            # A file can satisfy min_duration and still be too short to fill a
            # single frame; skip it instead of failing while stacking.
            print(
                f"The following file {audio_path} is shorter than one frame of {frame_length} samples")
            continue

        list_sound_array.append(audio_to_audio_frame_stack(
            y, frame_length, hop_length_frame))

    if not list_sound_array:
        raise ValueError(
            f"No audio file in {audio_dir} produced a complete frame of {frame_length} samples")

    return np.vstack(list_sound_array)

def audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, audio):
    """Compute the STFT of ``audio`` and return (magnitude in dB, phase).

    The magnitude is converted with ``librosa.amplitude_to_db`` using
    ``ref=np.max``, i.e. relative to the largest magnitude of this signal.
    The phase is the second value returned by ``librosa.magphase``.
    """

    spectrum = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
    magnitude, phase = librosa.magphase(spectrum)

    return librosa.amplitude_to_db(magnitude, ref=np.max), phase

def numpy_audio_to_matrix_spectrogram(numpy_audio, dim_square_spec, n_fft, hop_length_fft):
    """Compute one square spectrogram per audio frame.

    Args:
        numpy_audio: numpy array of shape (nb_frame, frame_length).
        dim_square_spec: side of the square spectrogram; the STFT of each frame
            has to come out as (dim_square_spec, dim_square_spec).
        n_fft: FFT size forwarded to the STFT.
        hop_length_fft: STFT hop length.

    Returns:
        Tuple (m_mag_db, m_phase), both of shape
        (nb_frame, dim_square_spec, dim_square_spec): the magnitude in dB
        (float) and the phase (complex).
    """

    frame_count = numpy_audio.shape[0]
    spec_shape = (frame_count, dim_square_spec, dim_square_spec)

    m_mag_db = np.zeros(spec_shape)
    m_phase = np.zeros(spec_shape, dtype=complex)

    for idx in range(frame_count):
        magnitude_db, phase = audio_to_magnitude_db_and_phase(
            n_fft, hop_length_fft, numpy_audio[idx])
        m_mag_db[idx, :, :] = magnitude_db
        m_phase[idx, :, :] = phase

    return m_mag_db, m_phase

def magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, stftaudio_magnitude_db, stftaudio_phase):
    """Rebuild a time-domain signal from a dB magnitude spectrogram and its phase.

    The dB values are converted back to amplitudes with ``ref=1.0``, multiplied
    by the phase to form a complex STFT, and inverted with an inverse STFT whose
    output is forced to ``frame_length`` samples.
    """

    # dB -> linear amplitude, then re-attach the phase
    complex_stft = librosa.db_to_amplitude(stftaudio_magnitude_db, ref=1.0) * stftaudio_phase

    return librosa.core.istft(complex_stft, hop_length=hop_length_fft, length=frame_length)

def matrix_spectrogram_to_numpy_audio(m_mag_db, m_phase, frame_length, hop_length_fft):
    """Convert a stack of spectrograms back to a stack of audio frames.

    ``m_mag_db[k]`` and ``m_phase[k]`` are inverted together for every k along
    the first axis, and the reconstructed signals are stacked row-wise.
    """

    reconstructed = [
        magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, m_mag_db[k], m_phase[k])
        for k in range(m_mag_db.shape[0])
    ]

    return np.vstack(reconstructed)



In [None]:
# Load every mix.wav listed in mix_dirs and cut it into frames -> (nb_frame, frame_length)
mix = audio_files_to_numpy(files,mix_dirs,sample_rate,frame_length,hop_length_frame,min_duration)

In [None]:
mix.shape

(22641, 8064)

In [None]:
# Same framing for the bass stems; built from the same track list as `mix`
bass = audio_files_to_numpy(files,bass_dirs,sample_rate,frame_length,hop_length_frame,min_duration)

In [None]:
bass.shape

In [None]:
# Side of the square spectrogram (same formula as in the configuration cell)
dim_square_spec = n_fft // 2 + 1

In [None]:
# Create amplitude (dB) and phase spectrograms of the bass frames.
# NOTE: the "voice" suffix is a leftover name; these arrays hold the bass stem.
m_amp_db_voice,  m_pha_voice = numpy_audio_to_matrix_spectrogram(
        bass, dim_square_spec, n_fft, hop_length_fft)
# m_amp_db_noise,  m_pha_noise = numpy_audio_to_matrix_spectrogram(
#         prod_noise, dim_square_spec, n_fft, hop_length_fft)
# m_amp_db_noisy_voice,  m_pha_noisy_voice = numpy_audio_to_matrix_spectrogram(
        # mix, dim_square_spec, n_fft, hop_length_fft)

# path = "/content/gdrive/MyDrive/numpy/"

# # Save to disk for Training / QC
# np.save(path + 'vocals50', vocals)
# # np.save(path_save_time_serie + 'noise_timeserie', prod_noise)
# # np.save(path + 'mix50', mix)


# np.save(path + 'vocals50_amp_db', m_amp_db_voice)
# # np.save(path_save_spectrogram + 'noise_amp_db', m_amp_db_noise)
# # np.save(path + 'mix50_amp_db', m_amp_db_noisy_voice)

# np.save(path + 'vocals50_pha_db', m_pha_voice)
# # np.save(path_save_spectrogram + 'noise_pha_db', m_pha_noise)
# # np.save(path + 'mix50_pha_db', m_pha_noisy_voice)

m_amp_db_voice.shape

(607, 128, 128)

In [None]:
# Amplitude (dB) and phase spectrograms of the mixture frames.
# NOTE: "noisy_voice" is a leftover name; these arrays hold the full mix.
m_amp_db_noisy_voice,  m_pha_noisy_voice = numpy_audio_to_matrix_spectrogram(
        mix, dim_square_spec, n_fft, hop_length_fft)

# path = "/content/gdrive/MyDrive/numpy/"
# np.save(path + 'mix50', mix)
# np.save(path + 'mix50_amp_db', m_amp_db_noisy_voice)
# np.save(path + 'mix50_pha_db', m_pha_noisy_voice)

m_amp_db_noisy_voice.shape

(907, 256, 256)

In [None]:
path = "/content/gdrive/MyDrive/numpy/"

# Save to disk for Training / QC: framed time series first, then the dB
# magnitudes, then the phases (np.save appends the ".npy" extension).
for array_name, array in (
    ('bass100', bass),
    ('mix100', mix),
    ('bass100_amp_db', m_amp_db_voice),
    ('mix100_amp_db', m_amp_db_noisy_voice),
    ('bass100_pha_db', m_pha_voice),
    ('mix100_pha_db', m_pha_noisy_voice),
):
    np.save(path + array_name, array)

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, LeakyReLU, MaxPooling2D, Dropout, concatenate, UpSampling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend
import tensorflow as tf
print(tf.__version__)

#Unet network
def unet(pretrained_weights = None,input_size = (128,128,1)):
    """Build and compile a U-Net mapping an image to an image of the same size.

    Architecture: four encoder stages (32, 64, 128, 256 filters), each made of
    two 3x3 convolutions followed by 2x2 max pooling; a 512-filter bottleneck;
    four decoder stages that upsample by 2, apply a 2x2 convolution, concatenate
    the matching encoder output and apply two 3x3 convolutions. Every
    convolution is linear and followed by a LeakyReLU. Dropout (0.5) is applied
    after the fourth encoder stage and after the bottleneck. The head is a 3x3
    convolution with 2 filters followed by a 1x1 convolution with ``tanh``, so
    outputs lie in [-1, 1].

    Args:
        pretrained_weights: optional path given to ``model.load_weights`` after
            the model has been compiled.
        input_size: (height, width, channels) of the input. Height and width
            should be multiples of 16 so that the four pooling / upsampling
            steps line up with the skip connections.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, Huber loss and the
        mean absolute error metric.
    """
    #size filter input
    size_filter_in = 32
    #normal initialization of weights
    kernel_init = 'he_normal'

    def conv_lrelu(tensor, filters, kernel_size=3):
        """Linear 'same'-padded convolution followed by a LeakyReLU."""
        # activation=None so that the leaky relu is applied after the conv layer
        tensor = Conv2D(filters, kernel_size, activation=None, padding='same',
                        kernel_initializer=kernel_init)(tensor)
        return LeakyReLU()(tensor)

    def double_conv(tensor, filters):
        """Two consecutive 3x3 conv + LeakyReLU layers with the same width."""
        return conv_lrelu(conv_lrelu(tensor, filters), filters)

    def up_merge(tensor, skip, filters):
        """Upsample by 2, apply a 2x2 conv + LeakyReLU and concatenate the skip tensor."""
        upsampled = conv_lrelu(UpSampling2D(size=(2, 2))(tensor), filters, kernel_size=2)
        return concatenate([skip, upsampled], axis=3)

    inputs = Input(input_size)

    # Encoder
    conv1 = double_conv(inputs, size_filter_in)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
    conv2 = double_conv(pool1, size_filter_in*2)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
    conv3 = double_conv(pool2, size_filter_in*4)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
    conv4 = double_conv(pool3, size_filter_in*8)
    drop4 = Dropout(0.5)(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(drop4)

    # Bottleneck
    conv5 = double_conv(pool4, size_filter_in*16)
    drop5 = Dropout(0.5)(conv5)

    # Decoder with skip connections to the encoder outputs
    merge6 = up_merge(drop5, drop4, size_filter_in*8)
    conv6 = double_conv(merge6, size_filter_in*8)
    merge7 = up_merge(conv6, conv3, size_filter_in*4)
    conv7 = double_conv(merge7, size_filter_in*4)
    merge8 = up_merge(conv7, conv2, size_filter_in*2)
    conv8 = double_conv(merge8, size_filter_in*2)
    merge9 = up_merge(conv8, conv1, size_filter_in)
    conv9 = double_conv(merge9, size_filter_in)

    # Output head
    conv9 = conv_lrelu(conv9, 2)
    conv10 = Conv2D(1, 1, activation = 'tanh')(conv9)

    model = Model(inputs,conv10)

    model.compile(optimizer = 'adam', loss = tf.keras.losses.Huber(), metrics = ['mae'])

    #model.summary()

    if pretrained_weights:
        model.load_weights(pretrained_weights)

    return model

In [None]:
# Reload the dB magnitude spectrograms written by the np.save cell
# (np.save appended the ".npy" extension to the names used there).
mix_mag = np.load("/content/gdrive/MyDrive/numpy/mix100_amp_db.npy")
bass_mag = np.load("/content/gdrive/MyDrive/numpy/bass100_amp_db.npy")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Summary statistics of the raw dB spectrograms, flattened to a single column.
# Note: scipy.stats.describe reports the *variance*, not the standard deviation.
print(stats.describe(mix_mag.reshape(-1,1)))
print(stats.describe(bass_mag.reshape(-1,1)))

In [None]:
# Standardise each spectrogram set to zero mean / unit standard deviation.
# The statistics are kept in named variables (and printed) rather than thrown
# away: the very same values are required to scale new inputs and to un-scale
# the network output at prediction time.
# Caution: re-running this cell without reloading the arrays recomputes the
# statistics on already-scaled data.
mix_mean, mix_std = mix_mag.mean(), mix_mag.std()
bass_mean, bass_std = bass_mag.mean(), bass_mag.std()
print(f"mix : mean={mix_mean:.8f} std={mix_std:.8f}")
print(f"bass: mean={bass_mean:.8f} std={bass_std:.8f}")

# NOTE(review): standardised targets are not confined to [-1, 1], while unet()
# ends with a tanh layer; confirm this is the scaling the model should learn.
mix_mag = (mix_mag - mix_mean)/mix_std
bass_mag = (bass_mag - bass_mean)/bass_std

In [None]:
# Re-check the statistics after scaling (mean should be ~0 and variance ~1)
print(stats.describe(mix_mag.reshape(-1,1)))
print(stats.describe(bass_mag.reshape(-1,1)))

# Shapes of the two spectrogram stacks
print(mix_mag.shape)
print(bass_mag.shape)

In [None]:
# Network input (mixture spectrograms) and training target (bass spectrograms)
X_in = mix_mag
X_ou = bass_mag

# Disabled alternative: use the difference (mix - bass) as the target instead
# X_ou = X_in - X_ou
print(X_in.shape)
print(X_ou.shape)

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Every input spectrogram must be paired with a target of the same shape;
# fail early with a readable message if the two stacks got out of sync.
assert X_in.shape == X_ou.shape, f"input/target shape mismatch: {X_in.shape} vs {X_ou.shape}"

#Reshape for training: append the channel axis -> (n, height, width, 1)
X_in = X_in.reshape(X_in.shape[0],X_in.shape[1],X_in.shape[2],1)
X_ou = X_ou.reshape(X_ou.shape[0],X_ou.shape[1],X_ou.shape[2],1)

# Fixed seed so that every run trains and validates on the same frames;
# otherwise validation losses and checkpoints are not comparable between runs.
# (Weight initialisation and batch shuffling are still unseeded.)
X_train, X_test, y_train, y_test = train_test_split(X_in, X_ou, test_size=0.20, random_state=42)

#If training from pre-trained weights
# generator_nn=unet(pretrained_weights = root_path+'mod_unet_last_weights.h5')

#If training from scratch
generator_nn=unet()
#Save best models to disk
# NOTE(review): this path uses "My Drive" while the other cells use "MyDrive",
# and the prediction cell reads models from .../app/ rather than .../app/test/.
# Confirm the intended location.
checkpoint = ModelCheckpoint('/content/gdrive/My Drive/app/test/model-{epoch:03d}-{loss:03f}-{val_loss:03f}.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')

generator_nn.summary()
#Training
history = generator_nn.fit(X_train, y_train, epochs=25, batch_size=32, shuffle=True, callbacks=[checkpoint], verbose=1, validation_data=(X_test, y_test))

In [None]:
#Plot training and validation loss
from matplotlib import pyplot as plt
loss = history.history['loss']
val_loss = history.history['val_loss']
# Dedicated name: `epochs` is the integer hyper-parameter of the configuration
# cell and should not be replaced by a range object.
epoch_numbers = range(1, len(loss) + 1)

fig, ax = plt.subplots()
ax.plot(epoch_numbers, loss, label='Training loss')
ax.plot(epoch_numbers, val_loss, label='Validation loss')
ax.set_yscale('log')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and validation loss')
ax.legend()
plt.show()

In [None]:
# Prediction-time settings; the numeric values repeat the first configuration cell.

# --- Audio framing ------------------------------------------------------------
sample_rate = 8000
frame_length = 8064
hop_length_frame = 8064
min_duration = 1

# --- STFT ---------------------------------------------------------------------
n_fft = 255
hop_length_fft = 63
dim_square_spec = n_fft // 2 + 1

# --- Miscellaneous ------------------------------------------------------------
nb_samples = 50
batch_size = 20
epochs = 10
# NOTE(review): the first configuration cell lists "bass.wav" here and the
# prediction output is named "bass.wav"; "vocals.wav" looks like a leftover.
categories = ["mix.wav","vocals.wav"]

In [None]:
import os

# Trained model to use
weights_path = "/content/gdrive/MyDrive/app/"
name_model = "model-018-0.000564-0.000618.h5"

# Where the test tracks are read from and where predictions are written
audio_dir_prediction = "/content/gdrive/MyDrive/data/musdb18_test"
dir_save_prediction = "/content/gdrive/MyDrive/app/predictions_audio/"

# os.listdir() returns entries in arbitrary order and the track is selected by
# position below, so sort the listing to always pick the same track.
ls = sorted(os.listdir(audio_dir_prediction))
l = [track + "/mix.wav" for track in ls]

# Only the third track is processed; widen the slice to separate more tracks.
audio_input_prediction = l[2:3]
audio_output_prediction = "bass.wav"


def scaled_in(matrix_spec):
    """Global scaling applied to the input (mixture) spectrograms.

    Computes ``(matrix_spec - offset) / scale`` with fixed constants.

    NOTE(review): the constants look like a mean and a *variance* (the two
    values printed by scipy.stats.describe), whereas the training cell divides
    by the standard deviation. Confirm which scaling the loaded model expects.
    """
    # earlier variant: (matrix_spec + 46)/50
    offset = -39.59250678
    scale = 246.04414739
    return (matrix_spec - offset) / scale

# def scaled_ou(matrix_spec):
#     "global scaling apply to noise models spectrograms (scale between -1 and 1)"
#     # matrix_spec = (matrix_spec -6 )/82
#     matrix_spec = (matrix_spec - (-39.2319582))/235.33391848
#     return matrix_spec

def inv_scaled_ou(matrix_spec):
    """Inverse global scaling applied to the network output spectrograms.

    Computes ``matrix_spec * gain + offset`` with fixed constants.

    NOTE(review): these constants must be the exact inverse of the scaling
    applied to the training targets. Confirm them against the training cell.
    """
    # earlier variant: matrix_spec * 82 + 6
    gain = 511.57269939
    offset = -35.18166773
    return matrix_spec * gain + offset

In [None]:
import librosa
import tensorflow as tf
from tensorflow.keras.models import model_from_json
import soundfile as sf


def prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction, audio_input_prediction,
audio_output_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft):
    """Run a trained model on audio files and save the predicted stem as one wav file.

    The input files are framed, converted to dB spectrograms, scaled with
    ``scaled_in``, passed through the model, un-scaled with ``inv_scaled_ou``
    and converted back to audio using the phase of the input.

    Args:
        weights_path: directory containing the saved model.
        name_model: file name of the saved model inside ``weights_path``.
        audio_dir_prediction: directory the input file names are relative to.
        dir_save_prediction: directory the result is written to (created if
            it does not exist).
        audio_input_prediction: list of input file names / relative paths.
        audio_output_prediction: file name of the wav file to write.
        sample_rate: sampling rate used for loading and for the output file.
        min_duration: minimum duration in seconds of an input file.
        frame_length: number of samples per frame.
        hop_length_frame: number of samples between consecutive frame starts.
        n_fft: FFT size of the STFT.
        hop_length_fft: STFT hop length.

    Returns:
        None. The audio is written to
        ``dir_save_prediction/audio_output_prediction`` as 24-bit PCM.
    """
    import os
    from tensorflow import keras

    # Load the model selected by the caller (previously a hard-coded path was
    # used and weights_path / name_model were silently ignored).
    loaded_model = keras.models.load_model(os.path.join(weights_path, name_model))
    print("Loaded model from disk")

    # Load the input files and cut them into frames
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction, sample_rate,
                                 frame_length, hop_length_frame, min_duration)

    #Dimensions of squared spectrogram
    dim_square_spec = int(n_fft / 2) + 1

    # Create Amplitude and phase of the sounds
    m_amp_db_audio,  m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    #global scaling of the network input
    X_in = scaled_in(m_amp_db_audio)
    #Reshape for prediction: append the channel axis
    X_in = X_in.reshape(X_in.shape[0],X_in.shape[1],X_in.shape[2],1)
    #Prediction using loaded network
    X_pred = loaded_model.predict(X_in)
    #Rescale the prediction back to dB and drop the channel axis
    X_denoise = inv_scaled_ou(X_pred)[:,:,:,0]

    #Reconstruct audio from the predicted spectrogram and the phase of the input
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(X_denoise, m_pha_audio, frame_length, hop_length_fft)

    #Concatenate all frames into one signal
    # NOTE(review): the fixed x10 gain is kept from the original code; confirm
    # that it is the intended output level.
    denoise_long = audio_denoise_recons.reshape(-1)*10

    # Write to the location requested by the caller (previously the result was
    # always written to "test1.wav" in the working directory).
    if dir_save_prediction:
        os.makedirs(dir_save_prediction, exist_ok=True)
    output_path = os.path.join(dir_save_prediction, audio_output_prediction)
    sf.write(output_path, denoise_long, sample_rate, 'PCM_24')

In [None]:
# Keyword arguments: with twelve parameters a positional call is easy to get wrong
prediction(
    weights_path=weights_path,
    name_model=name_model,
    audio_dir_prediction=audio_dir_prediction,
    dir_save_prediction=dir_save_prediction,
    audio_input_prediction=audio_input_prediction,
    audio_output_prediction=audio_output_prediction,
    sample_rate=sample_rate,
    min_duration=min_duration,
    frame_length=frame_length,
    hop_length_frame=hop_length_frame,
    n_fft=n_fft,
    hop_length_fft=hop_length_fft,
)