# Speech Denoising using 2D CNN

### Imports

In [0]:
import librosa
import numpy as np
import tensorflow as tf

### Upload Files
Please upload the following files -

1.   train_clean_male.wav
2.   train_dirty_male.wav
3.   test_x_01.wav
4.   test_x_02.wav



In [3]:
from google.colab import files

uploaded = files.upload()

Saving test_x_01.wav to test_x_01.wav
Saving test_x_02.wav to test_x_02.wav
Saving train_clean_male.wav to train_clean_male.wav
Saving train_dirty_male.wav to train_dirty_male.wav


### Convert to Time_Frequency Domain
Convert the signal from time domain to frequency domain and take the absolute.  
Also taking the transpose of the absolute

In [0]:
s, sr = librosa.load('train_clean_male.wav', sr = None)
S_clean = librosa.stft(s, n_fft=1024, hop_length=512)
S_mag = np.abs(S_clean).T

sn, sr = librosa.load('train_dirty_male.wav', sr = None)
X_dirty = librosa.stft(s, n_fft=1024, hop_length=512)
X_mag = np.abs(X_dirty).T

## Training

### Create Placeholders

In [0]:
def create_placeholders(n_x = 513, n_y = 513):
  X = tf.placeholder(tf.float32, [None, 20, n_x])
  Y = tf.placeholder(tf.float32, [None, n_y])
  return X, Y

### Convolution 2D
As mentioned by Prof. Kim to not use any zero padding, I've used VALID padding.  
Stride by default is 1

In [0]:
def conv2d(x, W, b, stride = 1):
  x = tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = 'VALID')
  x = tf.nn.bias_add(x, b)
  return tf.nn.relu(x)

### MaxPooling 2D
ksize and strides by default is 2

In [0]:
def max_pool2d(x, k = 2):
  return tf.nn.max_pool2d(x, ksize = [1, k, k, 1], strides = [1, k, k, 1], padding = 'VALID') 

### Initialize Parameters

Used He initialization for weights and kernels.  
Used Zeros Initializer for biases.

1.   W1 is a 5x5 kernel with 16 channels
2.   b1 is the bias added to the first Convolution layer
3. W2 is a 5x5 kernel with 32 channels 
4. b2 is the bias added to the second Convolution layer
5. W3 and b3 are the weights and biases respectively for the 3rd Layer which is a Fully Connected Layer
6. W4 and b4 are the weights and biases respectively for the 3rd Layer which is a Fully Connected Layer



In [0]:
def initialize_parameters():
  W1 = tf.get_variable("W1", [5, 5, 1, 16], initializer=tf.contrib.layers.variance_scaling_initializer())
  b1 = tf.get_variable("b1", [16], initializer=tf.zeros_initializer)
 
  W2 = tf.get_variable("W2", [5, 5, 16, 32], initializer=tf.contrib.layers.variance_scaling_initializer())
  b2 = tf.get_variable("b2", [32], initializer=tf.zeros_initializer)
  
  W3 = tf.get_variable("W3", [1024, 8000], initializer=tf.contrib.layers.variance_scaling_initializer())
  b3 = tf.get_variable("b3", [1024, 1], initializer=tf.zeros_initializer) 

  W4 = tf.get_variable("W4", [513, 1024], initializer=tf.contrib.layers.variance_scaling_initializer())
  b4 = tf.get_variable("b4", [513, 1], initializer=tf.zeros_initializer)
  

  parameters = {
      "W1" : W1,
      "b1" : b1,
      "W2" : W2,
      "b2" : b2,
      "W3" : W3,
      "b3" : b3,
      "W4" : W4,
      "b4" : b4
  }
  
  return parameters

### Forward Propagation
The input is reshaped so that it has 4 dimensions - batchsize, height, width and number of channels.  
The layers are as follows
1. Convolution 
2. Pooling
3. Convolution 
4. Pooling
5. Fully Connected
6. Fully Connected

The Output dimensions are batch_size x width(513)


In [0]:
def forward_propagation(X, parameters):

  X = tf.reshape(X, [-1, 20, 513, 1])
  W1 = parameters['W1']
  b1 = parameters['b1']
  A1 = conv2d(X, W1, b1, stride = 1)
  A1 = max_pool2d(A1, k=2)
  
  W2 = parameters['W2']
  b2 = parameters['b2']
  A2 = conv2d(A1, W2, b2, stride = 1)
  A2 = max_pool2d(A2, k=2)
  A2 = tf.transpose(tf.reshape(A2, [-1, A2.shape[1] * A2.shape[2] * A2.shape[3] ] ))

  W3 = parameters['W3']
  b3 = parameters['b3']
  Z3 = tf.matmul(W3, A2) + b3
  A3 = tf.nn.relu(Z3)
  
  W4 = parameters['W4']
  b4 = parameters['b4']
  Z4 = tf.matmul(W4, A3) + b4

  return tf.transpose(Z4)

### Create Images
We create images of dimensions 20x513 for 2D convolution.

In [0]:
def create_images(input):

  batches = []
  start = 0
  while start<input.shape[0]-19:
    end = start+20
    batches.append(input[start:end, :])
    start += 1
   
    
  return np.array(batches)

In [11]:
X_images = create_images(X_mag)
X_images.shape

(2440, 20, 513)

### Compute Cost
Loss function - MSE  
Cost is the mean of all losses

In [0]:
def compute_cost(Z4, Y):

  cost = tf.reduce_mean(tf.losses.mean_squared_error(tf.nn.relu(Z4), Y))
  
  return cost

### Initialize Model for Training


1.   Epochs = 100
2.   Learning Rate = 2e4
3. Minibatch Size = 16
4. Optimizer - Adam


In [0]:
def model(X_mag, S_mag, learning_rate = 0.0002, epochs = 200, print_cost = True, minibatch_size = 16):
  
  tf.reset_default_graph()
  m = X_mag.shape[0]
  X, Y = create_placeholders()
  parameters = initialize_parameters()
  
  Z4 = forward_propagation(X, parameters)

  cost = compute_cost(Z4, Y)
  
  optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
  init = tf.global_variables_initializer()
  
  with tf.Session() as sess:
    sess.run(init)
    
    costs = []
    
    for epoch in range(epochs):
      epoch_cost = 0.
      
      num_minibatches = m//minibatch_size
      
      for i in range(num_minibatches+1):
        start = i*minibatch_size
        if i<num_minibatches:
          end = start + minibatch_size
          mini_X = X_mag[start:end, :, :]
          mini_Y = S_mag[start:end, :]
        else:
          mini_X = X_mag[start:, :, :]
          mini_Y = S_mag[start:, :]
        
        _, minibatch_cost = sess.run([optimizer, cost], feed_dict = {X: mini_X, Y:mini_Y})
        
        epoch_cost += minibatch_cost / num_minibatches
                               
      if print_cost == True and (epoch % 10 == 0 or epoch == (epochs-1)):
        print("Cost after epoch %i: %f" %(epoch, epoch_cost))
        
      if print_cost == True and epoch % 5 == 0:
        costs.append(epoch_cost)
        
    parameters = sess.run(parameters)
    print ("Parameters have been trained!")

    sess.close()
    return parameters

### Run the model

In [50]:
parameters = model(X_images, S_mag[19:])

Cost after epoch 0: 0.257478
Cost after epoch 10: 0.075119
Cost after epoch 20: 0.038676
Cost after epoch 30: 0.028056
Cost after epoch 40: 0.023496
Cost after epoch 50: 0.018783
Cost after epoch 60: 0.017085
Cost after epoch 70: 0.015653
Cost after epoch 80: 0.011903
Cost after epoch 90: 0.011365
Cost after epoch 100: 0.009074
Cost after epoch 110: 0.007725
Cost after epoch 120: 0.007954
Cost after epoch 130: 0.007159
Cost after epoch 140: 0.006056
Cost after epoch 150: 0.005806
Cost after epoch 160: 0.005040
Cost after epoch 170: 0.004692
Cost after epoch 180: 0.004267
Cost after epoch 190: 0.004139
Cost after epoch 199: 0.004194
Parameters have been trained!


### Append Silent Frames
Append with 19 Silent frames to match the output dimensions.

In [0]:
def append_silent_frames(input):
  to_append = np.array([[np.random.rand() for j in range(513)]for i in range(19)])
  return np.concatenate((to_append, input), axis =0)

### Clean  

1.   Forward propagation on trained Network
2.   Input - Noisy Signal appended with 19 silent frames and converted to images of dimensions 20x513
3. Output - Clean Signal



In [0]:
def clean(X_mag, parameters):
  tf.reset_default_graph()
  
  X = tf.placeholder(tf.float32, [None, 20, 513])
  Z4 = forward_propagation(X, parameters)
  S_test = tf.nn.relu(Z4) 
  
  init = tf.global_variables_initializer()
  with tf.Session() as sess:
    sess.run(init) 
    S_test = sess.run(S_test, feed_dict = {X: X_mag})
    
    sess.close()
  return S_test

#### Clean the training data 


1.   Append silent frames
2.   Create images of dimensions 20x513
3. Feed the images to the trained network to get the clean signal



In [53]:
X_mag_new = append_silent_frames(X_mag)
X_images_new = create_images(X_mag_new)
c1 = clean(X_images_new, parameters)
c1.shape

(2459, 513)

### Recover the Complex Valued Spectrogram
Recovering the complex valued Spectrogram and applying inverse STFT to convert from time-frequency domain to time domain

In [0]:
S = np.multiply(np.divide(X_dirty.T, X_mag), c1)
sh_test = librosa.istft(S.T, win_length=1024 ,hop_length=512)

### Saving the cleaned signal and downloading it

In [0]:
librosa.output.write_wav('test_clean_male.wav', sh_test, sr)
files.download('test_clean_male.wav')

### SNR
Calculate the SNR of the train file

In [0]:
def snr(clean, recon):
  if len(recon)> len(clean):
    new = np.zeros(len(recon)-len(clean))
    clean = np.concatenate([clean, new])
  
  elif len(recon)< len(clean):
      new = np.zeros(len(clean)-len(recon))
      recon = np.concatenate([recon, new])
  
  return 10*np.log10(np.sum(clean**2)/np.sum(clean**2 - recon**2))

In [58]:
snr(s, sh_test)

7.178626987161065

## For test_x_01.wav
Load the file -> transform to feature domain using stft and take magnitude

In [0]:
test1, sr = librosa.load('test_x_01.wav', sr = None)
t1_dirty = librosa.stft(test1, n_fft=1024, hop_length=512)
t1_mag = np.abs(t1_dirty).T    

1. Append silent frames
2. Create images of dimension 20x513
3. Clean the signal using the trained network

In [0]:
t1_new = append_silent_frames(t1_mag)
t1_images = create_images(t1_new)
clean_t1 = clean(t1_images, parameters)

Recover Complex Valued Speech Spectrogram  
And apply inverse STFT to convert from requency domain to time domain

In [0]:
T1 = np.multiply(np.divide(t1_dirty, t1_mag.T), clean_t1.T)
test_01 = librosa.istft(T1, win_length=1024, hop_length=512)

Save the cleaned signal and download it

In [0]:
librosa.output.write_wav('test_s_01_recons.wav', test_01, sr)
files.download('test_s_01_recons.wav')

## For test_x_02.wav
Load the file -> transform to feature domain using stft and take magnitude

In [0]:
test2, sr = librosa.load('test_x_02.wav', sr = None)
t2_dirty = librosa.stft(test2, n_fft=1024, hop_length=512)
t2_mag = np.abs(t2_dirty).T

1. Append silent frames
2. Create images of dimension 20x513
3. Clean the signal using the trained network

In [64]:
t2_new = append_silent_frames(t2_mag)
t2_images = create_images(t2_new)
clean_t2 = clean(t2_images, parameters)
clean_t2.shape

(380, 513)

Recover Complex Valued Speech Spectrogram
And apply inverse STFT to convert from requency domain to time domain

In [0]:
T2 = np.multiply(np.divide(t2_dirty, t2_mag.T), clean_t2.T)
test_02 = librosa.istft(T2, win_length=1024, hop_length=512)

Save the cleaned signal and download it

In [0]:
librosa.output.write_wav('test_s_02_recons.wav', test_02, sr)
files.download('test_s_02_recons.wav')

## Hyperparameters Used


Permutations of the following hyperparameters have been used

1. Learning rates - 0.01, 0.003, 0.001, 0.0003, 0.0001
2. Epochs 50, 100, 150, 200, 250, 400, 500, 1000
3. Minibatch Sizes - 16, 32, 64, 128, 256, 512, 1024, 2048

Learning rates 0.01 and 0.003 were not good after a certain epochs the loss wouldn't change.

Best Learning rates were 0.0003 and 0.0001. So I decided to choose 0.0002

Minibatch sizes greater than 64 didn't help much either. So I played around with sizes 16, 32 and 64

### Tests which had a good output
I've saved their output in the folders with the same name. e.g test1
https://drive.google.com/drive/folders/1bf0W4hnLWqzZttlmQEus1Fi5DAuthFcU?usp=sharing

According to me Test 6 is the best. 

#### Test 1
1. Number of Epochs - 100
2. Minibatch size - 16
3. Learning Rate - 0.002

#### Test 2
1. Number of Epochs - 500
2. Minibatch size - 32
3. Learning Rate - 0.002

#### Test 3
1. Number of Epochs - 1000
2. Minibatch size - 64
3. Learning Rate - 0.002

#### Test 4
1. Number of Epochs - 1000
2. Minibatch size - 16
3. Learning Rate - 0.002

#### Test 5
1. Number of Epochs - 500
2. Minibatch size - 16
3. Learning Rate - 0.002

#### Test 6
1. Number of Epochs - 200
2. Minibatch size - 16
3. Learning Rate - 0.002

### Observations
Appending Silent frames before training did not help as we were using 19 random frames to predict something so I decided to add them after training.

### Assumptions

If Zero Padding was allowed it would've helped as the corners and edges would get equal importances as the middle elements