# Speech Denoising Using 1D CNN

### Imports


In [0]:
import librosa
import numpy as np
import tensorflow as tf

### Upload Files
Please upload the following files -

1.   train_clean_male.wav
2.   train_dirty_male.wav
3.   test_x_01.wav
4.   test_x_02.wav


In [3]:
from google.colab import files

uploaded = files.upload()

Saving test_x_01.wav to test_x_01.wav
Saving test_x_02.wav to test_x_02.wav
Saving train_clean_male.wav to train_clean_male.wav
Saving train_dirty_male.wav to train_dirty_male.wav


### Convert to Time_Frequency Domain
Convert the signal from time domain to frequency domain and take the absolute.  
Also taking the transpose of the absolute


In [0]:
s, sr = librosa.load('train_clean_male.wav', sr = None)
S_clean = librosa.stft(s, n_fft=1024, hop_length=512)
S_mag = np.abs(S_clean).T

sn, sr = librosa.load('train_dirty_male.wav', sr = None)
X_dirty = librosa.stft(s, n_fft=1024, hop_length=512)
X_mag = np.abs(X_dirty).T

## Training

### Create Placeholders

In [0]:
def create_placeholders(n_x = 513, n_y = 513):
  X = tf.placeholder(tf.float32, [None, n_x])
  Y = tf.placeholder(tf.float32, [None, n_y])
  return X, Y

### Convolution 1D
As mentioned by Prof. Kim to not use any zero padding, I've used VALID padding.  
Stride by default is 1

In [0]:
def conv1d(x, W, b, strides = 1):
  x = tf.nn.conv1d(x, W, stride = strides, padding = 'VALID')
  x = tf.nn.bias_add(x, b)
  return tf.nn.relu(x)

### MaxPooling 2D
ksize and strides by default is 2

In [0]:
def max_pool1d(x, k = 2):
  return tf.nn.max_pool1d(x, ksize = [1, k, 1], strides = [1, k, 1], padding = 'VALID') 

### Initialize Parameters

Used He initialization for weights and kernels.  
Used Zeros Initializer for biases.

1.   W1 is a 3x3 kernel with 16 channels
2.   b1 is the bias added to the first Convolution layer
3. W2 is a 5x5 kernel with 32 channels 
4. b2 is the bias added to the second Convolution layer
5. W3 is a 7x7 kernel with 64 channels 
4. b3 is the bias added to the third Convolution layer
5. W4 and b4 are the weights and biases respectively for the 4rd Layer which is a Fully Connected Layer
6. W5 and b5 are the weights and biases respectively for the 5th Layer which is a Fully Connected Layer


In [0]:
def initialize_parameters():
  W1 = tf.get_variable("W1", [3, 1, 16], initializer=tf.contrib.layers.variance_scaling_initializer())
  b1 = tf.get_variable("b1", [16], initializer=tf.zeros_initializer)
 
  W2 = tf.get_variable("W2", [5, 16, 32], initializer=tf.contrib.layers.variance_scaling_initializer())
  b2 = tf.get_variable("b2", [32], initializer=tf.zeros_initializer)
  
  W3 = tf.get_variable("W3", [7, 32, 64], initializer=tf.contrib.layers.variance_scaling_initializer())
  b3 = tf.get_variable("b3", [64], initializer=tf.zeros_initializer)
  
  W4 = tf.get_variable("W4", [1024, 3776], initializer=tf.contrib.layers.variance_scaling_initializer())
  b4 = tf.get_variable("b4", [1024, 1], initializer=tf.zeros_initializer) 

  W5 = tf.get_variable("W5", [513, 1024], initializer=tf.contrib.layers.variance_scaling_initializer())
  b5 = tf.get_variable("b5", [513, 1], initializer=tf.zeros_initializer)
  

  parameters = {
      "W1" : W1,
      "b1" : b1,
      "W2" : W2,
      "b2" : b2,
      "W3" : W3,
      "b3" : b3,
      "W4" : W4,
      "b4" : b4,
      "W5" : W5,
      "b5" : b5
  }
  
  return parameters

### Forward Propagation
The input is reshaped so that it has 3 dimensions - batchsize, width and number of channels.  
The layers are as follows
1. Convolution 
2. Pooling
3. Convolution 
4. Pooling
4. Convolution 
6. Pooling
7. Fully Connected
8. Fully Connected

The Output dimensions are batch_size x width(513)


In [0]:
def forward_propagation(X, parameters):

  X = tf.reshape(X, [-1, 513, 1])

  W1 = parameters['W1']
  b1 = parameters['b1']
  A1 = conv1d(X, W1, b1, strides = 1)
  A2 = max_pool1d(A1, k=2)
  
  W2 = parameters['W2']
  b2 = parameters['b2']
  A3 = conv1d(A2, W2, b2, strides = 1)
  A4 = max_pool1d(A3, k=2)
  
  W3 = parameters['W3']
  b3 = parameters['b3']
  A5 = conv1d(A4, W3, b3, strides = 1)
  A6 = max_pool1d(A5, k=2) 
  
  A6 = tf.transpose(tf.reshape(A6, [-1, A6.shape[1] * A6.shape[2]]))
  
  
  W4 = parameters['W4']
  b4 = parameters['b4']
  Z7 = tf.matmul(W4, A6) + b4
  A7 = tf.nn.relu(Z7)
  
  W5 = parameters['W5']
  b5 = parameters['b5']
  Z8 = tf.matmul(W5, A7) + b5

  return tf.transpose(Z8)

### Compute Cost
Loss function - MSE  
Cost is the mean of all losses

In [0]:
def compute_cost(Z8, Y):
  cost = tf.reduce_mean(tf.losses.mean_squared_error(tf.nn.relu(Z8), Y))
  
  return cost

### Initialize Model for Training

1.   Epochs = 100
2.   Learning Rate = 2e4
3. Minibatch Size = 128
4. Optimizer - Adam


In [0]:
def model(X_mag, S_mag, learning_rate = 0.0002, epochs = 100, print_cost = True, minibatch_size = 128):
  
  tf.reset_default_graph()
  m = X_mag.shape[0]
  X, Y = create_placeholders()

  parameters = initialize_parameters()
  
  Z8 = forward_propagation(X, parameters)
  cost = compute_cost(Z8, Y)
  
  optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
  init = tf.global_variables_initializer()
  
  with tf.Session() as sess:
    sess.run(init)
    
    costs = []
    
    for epoch in range(epochs):
      epoch_cost = 0.
      
      num_minibatches = m//minibatch_size
      
      for i in range(num_minibatches+1):
        start = i*minibatch_size
        if i<num_minibatches:
          end = start + minibatch_size
          mini_X = X_mag[start:end, :]
          mini_Y = S_mag[start:end, :]
        else:
          mini_X = X_mag[start:, :]
          mini_Y = S_mag[start:, :]
        
        _, minibatch_cost = sess.run([optimizer, cost], feed_dict = {X: mini_X, Y:mini_Y})
        
        epoch_cost += minibatch_cost / num_minibatches
                               
      if print_cost == True and (epoch % 10 == 0 or (epochs-1)==epoch):
        print("Cost after epoch %i: %f" %(epoch, epoch_cost))
        
      if print_cost == True and epoch % 5 == 0:
        costs.append(epoch_cost)
        
    parameters = sess.run(parameters)
    print ("Parameters have been trained!")

    sess.close()
    return parameters

### Run the model

In [80]:
parameters = model(X_mag, S_mag)

yshape =  (?, 513)
Cost after epoch 0: 0.450827
Cost after epoch 10: 0.050473
Cost after epoch 20: 0.043122
Cost after epoch 30: 0.036590
Cost after epoch 40: 0.033830
Cost after epoch 50: 0.029496
Cost after epoch 60: 0.027994
Cost after epoch 70: 0.020802
Cost after epoch 80: 0.018496
Cost after epoch 90: 0.016363
Cost after epoch 99: 0.013781
Parameters have been trained!


### Clean  

1.   Forward propagation on trained Network
2.   Input - Noisy Signal
3. Output - Clean Signal

In [0]:
def clean(X_mag, parameters):
  tf.reset_default_graph()
  
  X = tf.placeholder(tf.float32, [None, 513])
  Z8 = forward_propagation(X, parameters)
  S_test = tf.nn.relu(Z8) 
  
  init = tf.global_variables_initializer()
  with tf.Session() as sess:
    sess.run(init) 
    S_test = sess.run(S_test, feed_dict = {X: X_mag})
    
    sess.close()
  return S_test

#### Clean the training data 

Feed the noisy signal to the trained network to get the clean signal


In [82]:
c1 = clean(X_mag, parameters)
c1.shape

(2459, 513)

### Recover the Complex Valued Spectrogram
Recovering the complex valued Spectrogram and applying inverse STFT to convert from time-frequency domain to time domain

In [83]:
S = np.multiply(np.divide(X_dirty.T, X_mag), c1)
print(S.shape)
sh_test = librosa.istft(S.T, win_length=1024 ,hop_length=512)

(2459, 513)


### Saving the cleaned signal and downloading it

In [0]:
librosa.output.write_wav('test_clean_male.wav', sh_test, sr)
files.download('test_clean_male.wav')

### SNR
Calculate the SNR of the train file

In [0]:
def snr(clean, recon):
  if len(recon)> len(clean):
    new = np.zeros(len(recon)-len(clean))
    clean = np.concatenate([clean, new])
  
  elif len(recon)< len(clean):
      new = np.zeros(len(clean)-len(recon))
      recon = np.concatenate([recon, new])
  
  return 10*np.log10(np.sum(clean**2)/np.sum(clean**2 - recon**2))

In [86]:
snr(s, sh_test)

8.810792035336561

## For test_x_01.wav
Load the file -> transform to feature domain using stft and take magnitude

In [0]:
test1, sr = librosa.load('test_x_01.wav', sr = None)
t1_dirty = librosa.stft(test1, n_fft=1024, hop_length=512)
t1_mag = np.abs(t1_dirty).T    

Clean the signal using the trained network

In [0]:
clean_t1 = clean(t1_mag, parameters)

Recover Complex Valued Speech Spectrogram  
And apply inverse STFT to convert from requency domain to time domain

In [0]:
T1 = np.multiply(np.divide(t1_dirty.T, t1_mag), clean_t1)
test_01 = librosa.istft(T1.T, win_length=1024, hop_length=512)

Save the cleaned signal and download it

In [0]:
librosa.output.write_wav('test_s_01_recons.wav', test_01, sr)
files.download('test_s_01_recons.wav')

## For test_x_02.wav
Load the file -> transform to feature domain using stft and take magnitude

In [0]:
test2, sr = librosa.load('test_x_02.wav', sr = None)
t2_dirty = librosa.stft(test2, n_fft=1024, hop_length=512)
t2_mag = np.abs(t2_dirty).T

Clean the signal using the trained network

In [0]:
clean_t2 = clean(t2_mag, parameters)

Recover Complex Valued Speech Spectrogram  
And apply inverse STFT to convert from requency domain to time domain

In [0]:
T2 = np.multiply(np.divide(t2_dirty.T, t2_mag), clean_t2)
test_02 = librosa.istft(T2.T, win_length=1024, hop_length=512)

Save the cleaned signal and download it

In [0]:
librosa.output.write_wav('test_s_02_recons.wav', test_02, sr)
files.download('test_s_02_recons.wav')