# Speech Denoising using 1D CNN

Import required libraries

In [1]:
import librosa
import numpy as np
import tensorflow as tf

Load the dirty and clean audio files to be used for training purposes.

In [2]:
s, sr=librosa.load("data/train_clean_male.wav", sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load("data/train_dirty_male.wav", sr=None)
X=librosa.stft(sn, n_fft=1024, hop_length=512)
S_abs=np.abs(np.transpose(S))
X_abs=np.abs(np.transpose(X))
X_abs.shape

(2459, 513)

Model architecture:

> 1st Conv1D layer has kernel size of 3, strides is 1 and 25 filters with relu activation and no padding.

> 2nd Conv1D layer has kernel size of 5, strides is 2 and 50 filters with relu activation and no padding.
    
> 3rd Conv1D layer has kernel size of 5, strides is 2 and 75 filters with relu activation and no padding.
    
> The output of the 3rd Conv1D is flattened and fed into a dense layer with 1026 neurons.

In [3]:
x = tf.placeholder(tf.float32, [None, 513, 1])
print("x",x.shape)
h1 = tf.layers.conv1d(x, filters = 25, kernel_size=3, strides=1, activation="relu", kernel_initializer="he_normal", padding = "valid")
print("h1",h1.shape)
h2 = tf.layers.conv1d(h1, filters = 50,kernel_size=5, strides=2, activation="relu", kernel_initializer="he_normal",padding = "valid")
print("h2",h2.shape)
h3 = tf.layers.conv1d(h2, filters = 75,kernel_size=5, strides=2, activation="relu", kernel_initializer="he_normal",padding = "valid")
print("h3",h3.shape)
h3_drop = tf.layers.dropout(h3, rate = 0.3)
h4 = tf.layers.flatten(h3_drop)
print("h4",h4.shape)
h5 = tf.layers.dense(h4,1026, activation="relu", kernel_initializer="he_normal")
y = tf.layers.dense(h5,513,activation="relu", kernel_initializer="he_normal")
print("y",y.shape)

x (?, 513, 1)
Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
h1 (?, 511, 25)
h2 (?, 254, 50)
h3 (?, 125, 75)
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Use keras.layers.flatten instead.
h4 (?, 9375)
Instructions for updating:
Use keras.layers.dense instead.
y (?, 513)


Mean squared error is used to calculate the loss on training. Adam Optimizer is used with a learning rate of 0.0001 and run for 200 epochs .       

In [4]:
y_ = tf.placeholder(tf.float32, [None,513])
error = tf.losses.mean_squared_error(y_,y)
optimizer = tf.train.AdamOptimizer(0.0001).minimize(error)
loss_log = []
init = tf.global_variables_initializer()
epochs = 200

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Data is split into 250 batches each of ~(10,513)

In [5]:
batch_S=np.array_split(S_abs,250)
batch_X=np.array_split(X_abs,250)
batches=len(batch_X)
print(S_abs.shape,len(batch_X))
batch_S[0].shape

(2459, 513) 250


(10, 513)

Obtained a loss of approx. 0.0007 in 200 epochs.

In [6]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)

sess1 = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
#sess1 = tf.Session()
sess1.run(init)

for epoch in range(epochs):
    print("Epoch: ", epoch)
    temp_loss=0
    for _ in range(batches):
        batch_xs, batch_ys = batch_X[_], batch_S[_]
        batch_xs=np.expand_dims(batch_xs, axis=2).astype('float32')
#         batch_ys=np.expand_dims(batch_ys, axis=2)
        #print(batch_xs.shape,batch_ys.shape)

        _, loss,k = sess1.run([optimizer, error,y], feed_dict={x:batch_xs,y_:batch_ys})
        
        temp_loss+=loss
    loss_log.append(temp_loss/batches)
    print("loss",temp_loss/batches)


Epoch:  0
loss 0.0446876526447013
Epoch:  1
loss 0.020439540092425885
Epoch:  2
loss 0.01625603042115108
Epoch:  3
loss 0.014094875444396165
Epoch:  4
loss 0.012320511776633793
Epoch:  5
loss 0.01120607516838936
Epoch:  6
loss 0.010150949994160328
Epoch:  7
loss 0.009225268380730995
Epoch:  8
loss 0.008618526328442386
Epoch:  9
loss 0.00814693107252242
Epoch:  10
loss 0.007793657429647283
Epoch:  11
loss 0.00750395778096572
Epoch:  12
loss 0.007210249029667466
Epoch:  13
loss 0.006612117763841525
Epoch:  14
loss 0.006155847539979732
Epoch:  15
loss 0.0059222700202371924
Epoch:  16
loss 0.00582705008207995
Epoch:  17
loss 0.005708669135026867
Epoch:  18
loss 0.005814338088835939
Epoch:  19
loss 0.005770865869548288
Epoch:  20
loss 0.005779512162989704
Epoch:  21
loss 0.005456983837546432
Epoch:  22
loss 0.005126315555120527
Epoch:  23
loss 0.004869292139730533
Epoch:  24
loss 0.004609501395643747
Epoch:  25
loss 0.004613217738078674
Epoch:  26
loss 0.004702711955171253
Epoch:  27
loss 0

Test the network on test1 audio

In [7]:
test1, sr=librosa.load("data/test_x_01.wav", sr=None)
test1_stft=librosa.stft(test1, n_fft=1024, hop_length=512)
test1_abs=np.abs(np.transpose(test1_stft))
test1_abs=np.expand_dims(test1_abs,axis=2)
test1_output= sess1.run(y, feed_dict={x:test1_abs})
test1_output.shape

(142, 513)

In [8]:
clean_test1=(test1_stft/np.abs(test1_stft))*(np.transpose(test1_output))
clean_test1.shape
clean_test1_istft_1d=librosa.istft(clean_test1, hop_length=512, length=test1.shape[0])
clean_test1_istft_1d.shape
librosa.output.write_wav("test_s_01_recons_1dcnn.wav", clean_test1_istft_1d, sr)

Test the network on test 2 audio

In [9]:
test2, sr=librosa.load("data/test_x_02.wav", sr=None)
test2_stft=librosa.stft(test2, n_fft=1024, hop_length=512)
test2_abs=np.abs(np.transpose(test2_stft))
test2_abs=np.expand_dims(test2_abs,axis=2)
test2_output= sess1.run(y, feed_dict={x:test2_abs})
test2_output.shape

(380, 513)

In [10]:
clean_test2=(test2_stft/np.abs(test2_stft))*(np.transpose(test2_output))
clean_test2.shape
clean_test2_istft_1d=librosa.istft(clean_test2, hop_length=512,length=test2.shape[0])
clean_test2_istft_1d.shape
librosa.output.write_wav("test_s_02_recons_1dcnn.wav", clean_test2_istft_1d, sr)

Calculate SNR on cleaned dirty train audio file.

In [11]:
train_dirty_output= sess1.run(y, feed_dict={x:np.expand_dims(X_abs, axis=2)})
print(train_dirty_output.shape)
clean_train1=(X/np.abs(X))*(np.transpose(train_dirty_output))
clean_train1.shape
clean_train1_istft_1d=librosa.istft(clean_train1, hop_length=512, length=sn.shape[0])
clean_train1_istft_1d.shape
librosa.output.write_wav("train_dirty_recons_1dcnn.wav", clean_train1_istft_1d, sr)

(2459, 513)


In [12]:
num = (np.sum(sn*sn))
den = np.sum(np.square(sn-clean_train1_istft_1d))
SNR =10*np.log10(num/den)
print("SNR on train dirty audio:",SNR)

SNR on train dirty audio: 7.484267950057983


In [13]:
num = (np.sum(test1*test1))
den = np.sum(np.square(test1-clean_test1_istft_1d))
SNR =10*np.log10(num/den)
print("SNR on test 1 output:",SNR)

SNR on test 1 output: 8.106775283813477


In [14]:
num = (np.sum(test2*test2))
den = np.sum(np.square(test2-clean_test2_istft_1d))
SNR =10*np.log10(num/den)
print("SNR on test 2 output:",SNR)

SNR on test 2 output: 7.973528504371643


In [19]:
sess1.close()