<a href="https://colab.research.google.com/github/saurabhIU/Deep-Learning/blob/master/Speech_Denoising_Using_2D_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Preparation

In [1]:

# Standard library
import time

# Third-party
import librosa
import librosa.display as disp
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

1.13.1


## Load Data (Training Clean and Training Noisy data)

In [2]:
# Load the clean training recording at its native sample rate (sr=None disables resampling).
s_clean, sr_clean = librosa.load('train_clean_male.wav', sr=None)
print(f'Shape of clean signal is {s_clean.shape}')
# Magnitude spectrogram, transposed so that each row is one STFT frame.
S_clean = librosa.stft(s_clean, n_fft=1024, hop_length=512)
S_clean_abs = np.abs(S_clean).T

# Load the noisy counterpart and process it with the same STFT settings.
s_dirty, sr_noisy = librosa.load('train_dirty_male.wav', sr=None)
# This line reports the noisy recording, so it is labelled as such.
print(f'Shape of noisy signal is {s_dirty.shape}')
S_dirty = librosa.stft(s_dirty, n_fft=1024, hop_length=512)
S_dirty_abs = np.abs(S_dirty).T

Shape of clean signal is (1258899,)
Shape of clean signal is (1258899,)


In [3]:
# Report the shapes of the two transposed magnitude spectrograms.
shape_summary = f'Shape of clean signal is {S_clean_abs.shape} and shape of noisy signal is {S_dirty_abs.shape}'
print(shape_summary)

Shape of clean signal is (2459, 513) and shape of noisy signal is (2459, 513)


## Helper Functions

In [0]:
def get_input_batch(batch_size, data, labels):
    '''
    Draw a random mini-batch of paired samples and labels.

    `batch_size` row indices are drawn from `range(data.shape[0])` with
    `np.random.choice`, whose default is sampling WITH replacement, so the
    same sample may appear more than once in a batch.

    Args:
        batch_size: number of samples to draw.
        data: array whose first axis indexes samples.
        labels: array-like indexed in parallel with `data`
            (`labels[i]` belongs to `data[i]`).

    Returns:
        Tuple `(x, y)` of NumPy arrays holding the selected rows of `data`
        and `labels`, in the same (random) order.
    '''
    index = np.random.choice(data.shape[0], batch_size)
    # Integer-array indexing gathers all selected rows in one vectorised step
    # (and keeps the trailing dimensions even when the batch is empty).
    x = np.asarray(data)[index]
    y = np.asarray(labels)[index]
    return x, y
  
def flatten_layer(layer):
    """Collapse the non-batch dimensions of `layer` into one feature axis.

    Returns:
        Tuple `(flattened, feature_count)`: the tensor reshaped to
        `[-1, feature_count]` and the number of features per example.
    """
    # Product of the static sizes of dimensions 1..3 (all but the leading one).
    feature_count = layer.get_shape()[1:4].num_elements()

    # -1 leaves the leading (batch) dimension to be inferred.
    flattened = tf.reshape(layer, [-1, feature_count])

    return flattened, feature_count

  
# Function to create 2-D image of spectrogram
  
def create_2D_Image(data,image_height):
  """Slice a 2-D array into overlapping windows of consecutive rows.

  Window `i` contains rows `i .. i + image_height - 1` of `data`, so
  consecutive windows are shifted by exactly one row.

  Args:
    data: 2-D array of shape (rows, columns).
    image_height: number of consecutive rows per window.

  Returns:
    float64 array of shape (rows - image_height + 1, image_height, columns).

  Raises:
    ValueError: if `data` has fewer than `image_height - 1` rows, i.e. the
      number of windows would be negative.
  """
  image_width = data.shape[1]
  number_of_images = data.shape[0] - image_height + 1
  if number_of_images < 0:
    raise ValueError(
        f'data has {data.shape[0]} rows, too few for image_height={image_height}')
  output = np.zeros((number_of_images, image_height, image_width))
  for i in range(number_of_images):
    # The window length follows image_height rather than a fixed 20 rows.
    output[i, :, :] = data[i:i + image_height, :]
  return output

# Function to augment predicted output

def augment_prediction(input, num_frames=19):
  """Prepend low-amplitude random rows to a 2-D prediction matrix.

  Args:
    input: 2-D array of shape (frames, bins). (The name shadows the builtin
      but is kept because it is the public parameter name.)
    num_frames: number of rows to prepend; defaults to 19.

  Returns:
    Array of shape (num_frames + frames, bins). The first `num_frames` rows
    are uniform random values in [0, 0.001); the remaining rows are `input`.
  """
  input = np.asarray(input)
  # The padding width follows the input instead of assuming 513 columns.
  augmented_mat = np.random.rand(num_frames, input.shape[1]) / 1000
  return np.vstack((augmented_mat, input))

## Prepare 2-D Image data out of spectrogram

In [5]:
image_height = 20  # number of consecutive STFT frames stacked into one input image

# Create training image data: one image per window of `image_height` noisy frames.
X_train = create_2D_Image(S_dirty_abs, image_height)

# Create training label data: the clean frame aligned with the LAST frame of each
# window. The offset is derived from image_height so the two always stay in sync.
number_of_train_label = S_clean_abs.shape[0] - image_height + 1
Y_train = S_clean_abs[image_height - 1:, :]
assert X_train.shape[0] == Y_train.shape[0] == number_of_train_label, \
    'training images and labels are misaligned'

print(f'Shape of train data is {X_train.shape} and shape of train label is {Y_train.shape}')


Shape of train data is (2440, 20, 513) and shape of train label is (2440, 513)


In [0]:
# Add a trailing single-channel axis so the data matches the 4-D input placeholder.
X_train = np.reshape(X_train, (-1, 20, 513, 1))

## CNN Set-up

In [0]:
# --- Training schedule ---
EPOCHS = 1500      # number of mini-batch optimisation steps run by the training loop
Batch_Size = 200   # examples drawn per mini-batch

# --- Convolutional layers ---
kernel_size1, kernel_num1 = 5, 16   # layer 1: 5x5 kernels, 16 output channels
kernel_size2, kernel_num2 = 7, 32   # layer 2: 7x7 kernels, 32 output channels
conv_strides = [1, 2, 2, 1]
max_pool_strides = [1, 2, 2, 1]     # used as both window size and stride of the max-pool

# --- Fully connected layers ---
fc1_size = 4500
fc2_size = 513

# --- Graph inputs: a batch of 20x513 single-channel images and 513-value targets ---
X = tf.placeholder(tf.float32, shape=[None, 20, 513, 1])
Y = tf.placeholder(tf.float32, shape=[None, 513])

### Weights and Biases

In [8]:
# (dict key, TF variable name, shape) for every weight tensor, in creation order.
# 2112 is the flattened size of the second conv stage's output (2 * 33 * 32).
filter_specs = [
    ('wl1', 'W1', (kernel_size1, kernel_size1, 1, kernel_num1)),
    ('wl2', 'W2', (kernel_size2, kernel_size2, kernel_num1, kernel_num2)),
    ('wfc1', 'W4', (2112, fc1_size)),
    ('wfc2', 'W5', (fc1_size, fc2_size)),
]

# Same layout for the bias vectors; they use the He-normal initializer as well.
bias_specs = [
    ('bl1', 'B1', (kernel_num1,)),
    ('bl2', 'B2', (kernel_num2,)),
    ('bl3', 'B4', (fc1_size,)),
    ('bl4', 'B5', (fc2_size,)),
]

filters = {
    key: tf.get_variable(var_name, shape=shape, initializer=tf.initializers.he_normal())
    for key, var_name, shape in filter_specs
}

biases = {
    key: tf.get_variable(var_name, shape=shape, initializer=tf.initializers.he_normal())
    for key, var_name, shape in bias_specs
}

Instructions for updating:
Colocations handled automatically by placer.


### Function to build Convolution Layer 

In [0]:
#Function to build Convolution Layer 


def build_convolutional_layer(input,filter_num,bias_num,kernel_num):
    """Apply one convolution -> bias -> max-pool -> ReLU stage.

    Args:
        input: tensor passed to `tf.nn.conv2d`.
        filter_num: key into the module-level `filters` dict selecting the kernel.
        bias_num: key into the module-level `biases` dict selecting the bias.
        kernel_num: accepted for the caller's convenience; not used by the body.

    Returns:
        The activated output tensor of the stage.
    """
    convolved = tf.nn.conv2d(input, filters[filter_num],
                             strides=conv_strides, padding='SAME')

    biased = convolved + biases[bias_num]

    # Window size and stride are both `max_pool_strides`.
    pooled = tf.nn.max_pool(value=biased,
                            ksize=max_pool_strides,
                            strides=max_pool_strides,
                            padding='SAME')

    return tf.nn.relu(pooled)

### Build two convolutional layers with maxpool and two fully connected layers

In [0]:
def conv_nn(input):
  """Build the network: two conv stages followed by two fully connected layers.

  Prints the two conv-stage output tensors and the flattened feature count
  while the graph is being built.

  Args:
    input: tensor fed to the first convolutional stage.

  Returns:
    Output tensor of the second fully connected layer (no activation applied).
  """
  conv1_out = build_convolutional_layer(input, 'wl1', 'bl1', kernel_num1)
  print(conv1_out)

  conv2_out = build_convolutional_layer(conv1_out, 'wl2', 'bl2', kernel_num2)
  print(conv2_out)

  # Flatten the conv features for the fully connected part.
  flat, flat_size = flatten_layer(conv2_out)
  print(flat_size)

  # Hidden fully connected layer with ReLU.
  hidden = tf.nn.relu(tf.matmul(flat, filters['wfc1']) + biases['bl3'])

  # Linear output layer.
  return tf.matmul(hidden, filters['wfc2']) + biases['bl4']

### Define Cost and Optimizations

In [11]:

# Build the network on the input placeholder (conv_nn prints tensor shapes as it goes).
logits = conv_nn(X)
print(logits)

# Mean squared error between the target Y and the network output.
cost = tf.losses.mean_squared_error(labels=Y, predictions=logits)

# Adam with its default hyper-parameters.
optimizer = tf.train.AdamOptimizer().minimize(loss=cost)

Tensor("Relu:0", shape=(?, 5, 129, 16), dtype=float32)
Tensor("Relu_1:0", shape=(?, 2, 33, 32), dtype=float32)
2112
Tensor("add_3:0", shape=(?, 513), dtype=float32)
Instructions for updating:
Use tf.cast instead.


## Train Convolutional Neural Network

In [12]:
# The session is left open on purpose: later cells reuse `sess` for inference.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

tic = time.time()
for step in range(EPOCHS):
    # Each step trains on one freshly drawn random mini-batch.
    x_batch, y_batch = get_input_batch(Batch_Size, X_train, Y_train)
    batch_loss, _ = sess.run([cost, optimizer], feed_dict={X: x_batch, Y: y_batch})
    # Report the current mini-batch loss every 100 steps.
    if step % 100 == 0:
      print(f'Epoch: {step},training loss:{batch_loss}')
toc = time.time()
print(f'Time taken for training is {toc-tic}')

Epoch: 0,training loss:1.1554371118545532
Epoch: 100,training loss:0.04832402989268303
Epoch: 200,training loss:0.025135790929198265
Epoch: 300,training loss:0.01323155127465725
Epoch: 400,training loss:0.007687220815569162
Epoch: 500,training loss:0.005572779104113579
Epoch: 600,training loss:0.004203344229608774
Epoch: 700,training loss:0.003197616897523403
Epoch: 800,training loss:0.0026615827810019255
Epoch: 900,training loss:0.0022299382835626602
Epoch: 1000,training loss:0.0024288829881697893
Epoch: 1100,training loss:0.002103083534166217
Epoch: 1200,training loss:0.001630069687962532
Epoch: 1300,training loss:0.0014179139398038387
Epoch: 1400,training loss:0.0012071490054950118
Time taken for training is 61.290491819381714


## Denoise the noisy training signal by feeding it through the trained network

In [13]:
# Run the trained network over every training image in a single pass.
train_feed = {X: X_train, Y: Y_train}
prediction = sess.run(logits, feed_dict=train_feed)
print(prediction.shape)

(2440, 513)


## Pad the prediction with 19 near-silent leading frames so it has as many frames as the input spectrogram

In [0]:
# Prepend the padding frames so the prediction has one row per input STFT frame.
augmented_prediction = augment_prediction(input=prediction)

## Recover speech spectrogram

In [0]:
# Reuse the phase of the noisy STFT and replace its magnitude with the network output.
# The phase comes from np.angle so it stays finite where a bin's magnitude is exactly 0;
# dividing S_dirty by its magnitude would give 0/0 = NaN in such bins.
noisy_phase = np.exp(1j * np.angle(S_dirty))
prediction_complex = np.multiply(noisy_phase, augmented_prediction.T)


## Recover Time domain signal

In [0]:
# Invert the STFT; hop_length / win_length mirror the n_fft=1024, hop_length=512
# used for the forward transform.
prediction_timedomain = librosa.istft(
    prediction_complex,
    hop_length=512,
    win_length=1024,
)

## Trim down ground truth to match the size of denoised signal to calculate SNR

In [17]:

# The reconstructed signal is shorter than the original recording, so truncate
# the reference to the same number of samples before comparing them.
n_samples = prediction_timedomain.size
s_clean = s_clean[:n_samples]
s_clean.shape

(1258496,)

## Calculate SNR

In [18]:
# SNR in dB: energy of the clean signal over the energy of the residual error.
residual = s_clean - prediction_timedomain
signal_energy = np.dot(s_clean, s_clean)
noise_energy = np.dot(residual, residual)
SNR = 10*np.log10(signal_energy/noise_energy)
SNR

15.956320762634277

## Load the noisy test signals test_x_01.wav and test_x_02.wav and feed their magnitude spectra to the trained network

In [19]:
# First noisy test recording: native sample rate, same STFT settings as training.
test1, sr_test1 = librosa.load('test_x_01.wav', sr=None)
Test1 = librosa.stft(test1, n_fft=1024, hop_length=512, window="hann")
Test1_absolute = np.abs(Test1).T

# Second noisy test recording.
test2, sr_test = librosa.load('test_x_02.wav', sr=None)
Test2 = librosa.stft(test2, n_fft=1024, hop_length=512, window="hann")
Test2_absolute = np.abs(Test2).T

# `sr_test` is the single rate used later when both cleaned files are written,
# so the two recordings must share it; fail loudly instead of silently writing
# the first file at the second file's rate.
assert sr_test1 == sr_test, (
    f'test files have different sample rates: {sr_test1} vs {sr_test}')

print(f'Shape of Test1 is {Test1_absolute.shape} and Test2 is {Test2_absolute.shape}')


Shape of Test1 is (142, 513) and Test2 is (380, 513)


In [0]:
# Slice each test spectrogram into windows of `image_height` frames, as for training.
X_test1 = create_2D_Image(data=Test1_absolute, image_height=image_height)
X_test2 = create_2D_Image(data=Test2_absolute, image_height=image_height)

In [0]:
# Add the single-channel axis required by the input placeholder.
X_Test1 = np.reshape(X_test1, (-1, 20, 513, 1))
X_Test2 = np.reshape(X_test2, (-1, 20, 513, 1))

# Inference: only the input placeholder is fed.
test_prediction_1 = sess.run(logits, feed_dict={X: X_Test1})
test_prediction_2 = sess.run(logits, feed_dict={X: X_Test2})


## Augment test predictions

In [0]:
# Prepend the padding frames to the Test1 prediction.
augmented_prediction_1 = augment_prediction(input=test_prediction_1)

# Prepend the padding frames to the Test2 prediction.
augmented_prediction_2 = augment_prediction(input=test_prediction_2)

## Recover complex valued speech spectrogram of cleaned test signals

In [0]:
# Combine the predicted magnitudes with the phase of each noisy test STFT.
# np.angle yields a finite phase even where a bin's magnitude is exactly 0;
# dividing the STFT by its magnitude would give 0/0 = NaN in such bins.
test1_complex = np.multiply(np.exp(1j * np.angle(Test1)), augmented_prediction_1.T)

test2_complex = np.multiply(np.exp(1j * np.angle(Test2)), augmented_prediction_2.T)

## Recover time domain speech signal by applying inverse-STFT

In [0]:
# Inverse STFT with the same hop, window length and window type as the forward transform.
istft_settings = dict(hop_length=512, win_length=1024, window="hann")

test1_timedomain = librosa.istft(test1_complex, **istft_settings)

test2_timedomain = librosa.istft(test2_complex, **istft_settings)

## Write out the cleaned speech signals

In [0]:
# NOTE(review): `librosa.output` was removed in librosa 0.8 - this cell needs an
# older librosa; confirm the pinned version before re-running.
cleaned_outputs = (
    ('cleaned_test1_conv2.wav', test1_timedomain),
    ('cleaned_test2_conv2.wav', test2_timedomain),
)
# Both files are written with `sr_test`, the rate returned by the last test load.
for wav_path, cleaned_signal in cleaned_outputs:
    librosa.output.write_wav(wav_path, cleaned_signal, sr_test)