# Speech Denoising using 2D CNN.

Import the required libraries

In [1]:
import librosa
import numpy as np
import tensorflow as tf

Load the dirty and clean audio files for training purposes.

In [2]:
s, sr=librosa.load("data/train_clean_male.wav", sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load("data/train_dirty_male.wav", sr=None)
X=librosa.stft(sn, n_fft=1024, hop_length=512)
S_abs=np.abs(np.transpose(S))
X_abs=np.abs(np.transpose(X))
#X_abs

Model architecture:

>1st Conv2D layer is used with a filter of (18,2) and strides are (1,1) with relu activation and no padding.

>2nd Conv2D layer is used with a filter of (3,12) and strides are (2,2) with relu activation and no padding.

>Dropout of 0.1 and 0.2 is applied after each convolution layer respectively.

>The flattened output after convolutions is fed into a dense layer with 1024 neurons.

>>>I tried using many different combinations of kernel size but the outputs on tests weren't clear enough. It seemed logical since we're using 19 previous frames in each row and the network seems to learn the same frames over and over again with a small kernel size. Switching to a large kernel size seemed to do the trick and the output on the test files is clear with SNR >6

In [3]:
x = tf.placeholder(tf.float32, [None, 20, 513, 1])
print("x",x.shape)

h1 = tf.layers.conv2d(x, filters = 50, kernel_size=(18,2), strides=(1,1), activation="relu", padding = "valid")
print("h1",h1.shape)

h1_drop = tf.layers.dropout(h1, rate = 0.1)

h2 = tf.layers.conv2d(h1_drop, filters = 100, kernel_size=(3,12), strides=(2,2), activation="relu", padding = "valid")
print("h2",h2.shape)

h2_drop = tf.layers.dropout(h2, rate = 0.2)

flatten_layer = tf.layers.flatten(h2_drop)
print("flatten", flatten_layer)

dense_1 = tf.layers.dense(flatten_layer, 1024, activation = "relu")
print("dense1" ,dense_1)

y = tf.layers.dense(dense_1,513,activation="relu")
print("y",y.shape)

x (?, 20, 513, 1)
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
h1 (?, 3, 512, 50)
Instructions for updating:
Use keras.layers.dropout instead.
h2 (?, 1, 251, 100)
Instructions for updating:
Use keras.layers.flatten instead.
flatten Tensor("flatten/Reshape:0", shape=(?, 25100), dtype=float32)
Instructions for updating:
Use keras.layers.dense instead.
dense1 Tensor("dense/Relu:0", shape=(?, 1024), dtype=float32)
y (?, 513)


Creating dataset for 2D CNN:

Here, we append 20 silent frames uniformly in the range of 0 to the minimum value in that data.
Then we create a 3D data where each frame consists of 20 previous frames.

In [4]:
X_silent_frames = np.random.uniform(low=0.0,high=np.amin(X_abs),size=(19,513))
print(type(X_silent_frames),X_silent_frames.shape)

X_concat = np.concatenate((X_silent_frames, X_abs), axis = 0)
print(X_concat.shape)

X_aug = np.empty([X_abs.shape[0],20,513])

for i in range(19,X_abs.shape[0]):
    X_aug[i-19] = X_concat[i-19:i+1,:]
    
print(X_aug.shape)

<class 'numpy.ndarray'> (19, 513)
(2478, 513)
(2459, 20, 513)


Split the data into 250 batches each of shape (10,20,513)

In [5]:
batch_S=np.array_split(S_abs,250)
batch_X=np.array_split(X_aug,250)
batches=len(batch_X)
print(batch_X[0].shape,batch_S[0].shape)


(10, 20, 513) (10, 513)


Mean squared error is used to calculate the loss on training. Adam Optimizer is used with a learning rate of 0.0001 and run for 100 epochs .

In [6]:
y_ = tf.placeholder(tf.float32, [None,513])
error = tf.losses.mean_squared_error(y_,y)
optimizer = tf.train.AdamOptimizer(0.0001).minimize(error)
loss_log = []
init = tf.global_variables_initializer()
epochs = 100

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Obtained a loss of approx. 0.001 in 100 epochs.

In [7]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)


sess2dcnn = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
sess2dcnn.run(init)

for epoch in range(epochs):
    print("Epoch:", epoch)
    temp_loss=0
    for _ in range(batches):
        batch_xs, batch_ys = batch_X[_], batch_S[_]
        batch_xs=np.expand_dims(batch_xs, axis=3).astype('float32')
        _, loss= sess2dcnn.run([optimizer, error], feed_dict={x:batch_xs,y_:batch_ys})
        
        temp_loss+=loss
    loss_log.append(temp_loss/batches)
    print("loss:",temp_loss/batches)


Epoch: 0
loss: 0.07505544050686876
Epoch: 1
loss: 0.04785396481314092
Epoch: 2
loss: 0.024933920323397616
Epoch: 3
loss: 0.01585378907667473
Epoch: 4
loss: 0.012036051739414688
Epoch: 5
loss: 0.009984532903297804
Epoch: 6
loss: 0.008981215081439587
Epoch: 7
loss: 0.008600238814615295
Epoch: 8
loss: 0.007878675508953166
Epoch: 9
loss: 0.0077252776430395895
Epoch: 10
loss: 0.00901262122442131
Epoch: 11
loss: 0.008280378688839846
Epoch: 12
loss: 0.006341080901242094
Epoch: 13
loss: 0.005297363792458782
Epoch: 14
loss: 0.0048154142039566064
Epoch: 15
loss: 0.0045634268961439375
Epoch: 16
loss: 0.004378801583057793
Epoch: 17
loss: 0.004382945528166602
Epoch: 18
loss: 0.004487465191894443
Epoch: 19
loss: 0.0043948167345370165
Epoch: 20
loss: 0.004346266303356969
Epoch: 21
loss: 0.0045718168844323375
Epoch: 22
loss: 0.004404086481677951
Epoch: 23
loss: 0.004310132772588986
Epoch: 24
loss: 0.004010167736843868
Epoch: 25
loss: 0.003683302089335484
Epoch: 26
loss: 0.004464467364428856
Epoch: 27


Test the network on test1 audio. Test files that are fed into the network are created in a similar manner as the dirty train input i.e. appending 20 previous frames to each row.

In [8]:
test1, sr=librosa.load("data/test_x_01.wav", sr=None)
test1_stft=librosa.stft(test1, n_fft=1024, hop_length=512)
test1_abs=np.abs(np.transpose(test1_stft))
test1_silent_frames = np.random.uniform(low=0.0, high=np.amin(test1_abs),size=(19,513))
test1_concat = np.concatenate((test1_silent_frames, test1_abs), axis = 0)

test1_aug = np.empty([test1_abs.shape[0],20,513])

for i in range(19,test1_abs.shape[0]):
    test1_aug[i-19] = test1_concat[i-19:i+1,:]

test1_aug=np.expand_dims(test1_aug,axis=3)
test1_output= sess2dcnn.run(y, feed_dict={x:test1_aug})
test1_output.shape

(142, 513)

In [9]:
clean_test1_2d=(test1_stft/np.abs(test1_stft))*(np.transpose(test1_output))
print(clean_test1_2d.shape)
clean_test1_istft_2d=librosa.istft(clean_test1_2d, hop_length=512, length=test1.shape[0])
print(clean_test1_istft_2d.shape)
librosa.output.write_wav("test_s_01_recons_2dcnn.wav", clean_test1_istft_2d, sr)

(513, 142)
(72619,)


Test the network on test 2 audio.

In [10]:
test2, sr=librosa.load("data/test_x_02.wav", sr=None)
test2_stft=librosa.stft(test2, n_fft=1024, hop_length=512)
test2_abs=np.abs(np.transpose(test2_stft))
test2_silent_frames = np.random.uniform(low=0.0, high=np.amin(test2_abs),size=(19,513))
test2_concat = np.concatenate((test2_silent_frames, test2_abs), axis = 0)

test2_aug = np.empty([test2_abs.shape[0],20,513])

for i in range(19,test2_abs.shape[0]):
    test2_aug[i-19] = test2_concat[i-19:i+1,:]

test2_aug=np.expand_dims(test2_aug,axis=3)
test2_output= sess2dcnn.run(y, feed_dict={x:test2_aug})
test2_output.shape

(380, 513)

In [11]:
clean_test2_2d=(test2_stft/np.abs(test2_stft))*(np.transpose(test2_output))
print(clean_test2_2d.shape)
clean_test2_istft_2d=librosa.istft(clean_test2_2d, hop_length=512, length=test2.shape[0])
print(clean_test2_istft_2d.shape)
librosa.output.write_wav("test_s_02_recons_2dcnn.wav", clean_test2_istft_2d, sr)

(513, 380)
(194353,)


Calculate SNR:

In [12]:
train_dirty_output= sess2dcnn.run(y, feed_dict={x:np.expand_dims(X_aug, axis=3)})
print(train_dirty_output.shape)
clean_train1=(X/np.abs(X))*(np.transpose(train_dirty_output))
clean_train1.shape
clean_train1_istft_2d=librosa.istft(clean_train1, hop_length=512, length=sn.shape[0])
clean_train1_istft_2d.shape
librosa.output.write_wav("train_dirty_recons_2dcnn.wav", clean_train1_istft_2d, sr)

(2459, 513)


In [13]:
num = (np.sum(sn*sn))
den = np.sum(np.square(sn-clean_train1_istft_2d))
SNR =10*np.log10(num/den)
print("SNR on train dirty output:",SNR)

SNR on train dirty output: 7.308210730552673


In [14]:
num = (np.sum(test1*test1))
den = np.sum(np.square(test1-clean_test1_istft_2d))
SNR =10*np.log10(num/den)
print("SNR on test 1 output:",SNR)

SNR on test 1 output: 7.778724431991577


In [15]:
num = (np.sum(test2*test2))
den = np.sum(np.square(test2-clean_test2_istft_2d))
SNR =10*np.log10(num/den)
print("SNR on test 2 output:",SNR)

SNR on test 2 output: 6.713163256645203


In [15]:
sess2dcnn.close()