<a href="https://colab.research.google.com/github/saurabhIU/Deep-Learning/blob/master/Problem_2_Speech_Denoising_Using_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

In [0]:
# Dependencies
import time

import IPython.display as ipd
import librosa
import librosa.display as disp
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [219]:
from google.colab import files

# Open Colab's upload dialog; the returned mapping is keyed by file name and
# each value's length is reported below as the file size in bytes.
# NOTE(review): the recorded output shows re-uploads being saved under suffixed
# names such as "test_x_01 (3).wav", while later cells load the unsuffixed
# names -- confirm the intended copy of each file is the one being read.
uploaded = files.upload()

# Echo every uploaded file with its size as a quick sanity check.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload)))

Saving test_x_01.wav to test_x_01 (3).wav
Saving test_x_02.wav to test_x_02 (3).wav
Saving train_clean_male.wav to train_clean_male (3).wav
Saving train_dirty_male.wav to train_dirty_male (3).wav
User uploaded file "test_x_01.wav" with length 145284 bytes
User uploaded file "test_x_02.wav" with length 388752 bytes
User uploaded file "train_clean_male.wav" with length 2522886 bytes
User uploaded file "train_dirty_male.wav" with length 2522898 bytes


# Load Data (Training Clean and Training Noisy data)

In [0]:
# Load the clean (target) and noisy (input) training recordings at their
# native sampling rates (sr=None disables resampling).
s_clean, sr_clean = librosa.load('train_clean_male.wav', sr=None)
s_noisy, sr_noisy = librosa.load('train_dirty_male.wav', sr=None)

# Complex STFTs. With n_fft=1024 each frame has 1024 // 2 + 1 = 513 frequency
# bins, which is the input/output width of the network defined below.
# window="hann" is librosa's default; it is spelled out to match the test cells.
S_train = librosa.stft(s_clean, n_fft=1024, hop_length=512, window="hann")
X_train = librosa.stft(s_noisy, n_fft=1024, hop_length=512, window="hann")

# Magnitude spectrograms used as network target (S_absolute) and input
# (X_absolute). The training and prediction cells read these names, but no
# cell defined them, so the notebook failed on a fresh kernel.
S_absolute = np.abs(S_train)
X_absolute = np.abs(X_train)

In [270]:
# Sanity check: the clean and noisy recordings should contain the same number
# of samples so that their STFT frames line up one-to-one.
print(f'Shape of clean signal is {s_clean.shape} and shape of noisy signal is {s_noisy.shape}')

Shape of slean signal is (1258899,) and shape of noisy signal is (1258899,)


# Neural Network 

**Tanh activation functions in the first two hidden layers and ReLU in the output layer gave me the best SNR and results.**

In [0]:
EPOCHS = 1500
SEED = 0             # graph-level seed so weight initialisation is repeatable
N_FREQ_BINS = 513    # n_fft // 2 + 1 for the n_fft=1024 STFT used in this notebook
HIDDEN_UNITS = 1024  # width of each hidden layer

# Must be set before any variables are created in the graph.
tf.set_random_seed(SEED)

# One row per STFT frame, one column per frequency bin.
X = tf.placeholder("float", [None, N_FREQ_BINS])  # noisy magnitude spectra
Y = tf.placeholder("float", [None, N_FREQ_BINS])  # clean magnitude spectra (targets)


def nn_model(x, n_hidden=HIDDEN_UNITS, n_out=N_FREQ_BINS):
    """Build a fully connected denoising network on top of `x`.

    Args:
        x: Tensor of shape [frames, bins] holding noisy magnitude spectra.
        n_hidden: Number of units in each of the two tanh hidden layers.
        n_out: Number of output units (frequency bins of the estimate).

    Returns:
        Tensor of shape [frames, n_out]. The ReLU on the last layer keeps the
        estimated magnitudes non-negative.
    """
    layer1 = tf.layers.dense(x, n_hidden, activation=tf.nn.tanh)
    layer2 = tf.layers.dense(layer1, n_hidden, activation=tf.nn.tanh)
    output = tf.layers.dense(layer2, n_out, activation=tf.nn.relu)
    return output


# NOTE: despite its name, `logits` holds the post-ReLU magnitude estimate.
logits = nn_model(X)
cost = tf.losses.mean_squared_error(Y, logits)
# `optimizer` is the training op returned by minimize(); running it performs
# one Adam update step.
optimizer = tf.train.AdamOptimizer().minimize(cost)

# Training Neural Network

In [272]:
# The session is kept open on purpose: the prediction cells further down reuse it.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Full-batch training: every step sees all frames. The spectrograms are stored
# as (bins, frames), so they are transposed to one row per frame.
train_feed = {X: X_absolute.T, Y: S_absolute.T}

start = time.time()
for epoch in range(EPOCHS):
    loss, _ = sess.run([cost, optimizer], feed_dict=train_feed)
    # Report the loss every 100 steps to keep the output short.
    if epoch % 100 == 0:
        print(f'Epoch: {epoch},training loss:{loss}')
elapsed = time.time() - start
print(f'Time taken for training is {elapsed}')

Epoch: 0,training loss:0.10957279801368713
Epoch: 100,training loss:0.0034950440749526024
Epoch: 200,training loss:0.0019965246319770813
Epoch: 300,training loss:0.0015231455909088254
Epoch: 400,training loss:0.0013008574023842812
Epoch: 500,training loss:0.0012068605283275247
Epoch: 600,training loss:0.0010690761264413595
Epoch: 700,training loss:0.0010167710715904832
Epoch: 800,training loss:0.0011247664224356413
Epoch: 900,training loss:0.0007912996807135642
Epoch: 1000,training loss:0.0008151379879564047
Epoch: 1100,training loss:0.0007655711960978806
Epoch: 1200,training loss:0.0007338332361541688
Epoch: 1300,training loss:0.0006723499391227961
Epoch: 1400,training loss:0.0007152247708290815
Time taken for training is 29.180617809295654


# Denoise train noisy signal by feeding it through trained network


In [0]:
# Inference only evaluates the network output, which depends on X alone; the
# target placeholder Y is not needed (this matches the test-signal cells).
prediction = sess.run(logits, feed_dict={X: X_absolute.T})

# Recover speech spectrogram

In [0]:
# Reuse the phase of the noisy STFT: X / |X| is a unit-magnitude complex number
# per bin. Bins with zero magnitude would give 0/0 = NaN, so the division is
# only performed where the magnitude is positive and the phase is set to 0
# elsewhere. The magnitude is computed locally so this cell depends only on X_train.
noisy_magnitude = np.abs(X_train)
noisy_phase = np.divide(
    X_train,
    noisy_magnitude,
    out=np.zeros_like(X_train),
    where=noisy_magnitude > 0,
)

# Combine the network's magnitude estimate (frames, bins) -> (bins, frames)
# with the noisy phase to get a complex spectrogram.
prediction_complex = np.multiply(noisy_phase, prediction.T)

# Recover Time domain signal

In [0]:
# Invert the denoised spectrogram using the same hop and window length as the
# forward STFT.
prediction_timedomain = librosa.istft(
    prediction_complex,
    hop_length=512,
    win_length=1024,
)

In [276]:
# Compare sample counts of the reconstruction and the reference signal.
n_predicted, n_truth = prediction_timedomain.size, s_clean.size
print(f'Size of predicted signal is {n_predicted} and ground truth signal is {n_truth}')

Size of predicted signal is 1258496 and ground truth signal is 1258899


# Trim down ground truth to match the size of denoised signal to calculate SNR

In [277]:
# Keep only as many reference samples as the reconstruction has, so the two
# arrays can be subtracted element-wise in the SNR computation.
n_keep = prediction_timedomain.size
s_clean = s_clean[:n_keep]
s_clean.shape

(1258496,)

# Calculate SNR

In [278]:
# SNR in dB: energy of the clean signal over the energy of the residual
# (clean minus reconstruction).
residual = s_clean - prediction_timedomain
signal_energy = np.dot(s_clean, s_clean)
noise_energy = np.dot(residual, residual)
SNR = 10 * np.log10(signal_energy / noise_energy)
SNR

18.982619047164917

# Load the noisy test signal test_x_01.wav and feed its magnitude spectrum to the trained network

In [0]:
# Read the first noisy test recording at its native sampling rate.
test1, sr_test = librosa.load('test_x_01.wav', sr=None)

# Same STFT settings as the training data, so each frame has 513 bins.
Test1 = librosa.stft(
    test1,
    n_fft=1024,
    hop_length=512,
    window="hann",
)

# Magnitude spectrogram that is fed to the network.
Test1_absolute = np.abs(Test1)

In [281]:
# (frequency bins, frames) of the test magnitude spectrogram.
np.shape(Test1_absolute)

(513, 142)

In [282]:
# Run the trained network on the test magnitudes, one row per frame.
test_feed = {X: Test1_absolute.T}
test_prediction = sess.run(logits, feed_dict=test_feed)
test_prediction.shape

(142, 513)

# Recover complex valued speech spectrogram of cleaned test signal

In [0]:
# Unit-magnitude phase of the noisy test STFT. Zero-magnitude bins would give
# 0/0 = NaN, so the division is restricted to bins with positive magnitude and
# the phase is set to 0 elsewhere.
test1_phase = np.divide(
    Test1,
    Test1_absolute,
    out=np.zeros_like(Test1),
    where=Test1_absolute > 0,
)

# Attach the noisy phase to the predicted magnitudes (transposed back to
# bins x frames).
test1_complex = np.multiply(test1_phase, test_prediction.T)


In [284]:
# Should match the shape of the input STFT: (frequency bins, frames).
np.shape(test1_complex)

(513, 142)

# Recover time domain speech signal by applying inverse-STFT

In [0]:
# Back to the time domain with the same hop, window length and window type as
# the forward transform.
test1_timedomain = librosa.istft(
    test1_complex,
    hop_length=512,
    win_length=1024,
    window="hann",
)

# Write out the cleaned speech signal

In [0]:
# Save the denoised signal at the sampling rate the test file was loaded with.
# NOTE(review): the `librosa.output` module is not available in newer librosa
# releases -- confirm the installed version before re-running this cell.
librosa.output.write_wav(
    'cleaned_test1.wav',
    test1_timedomain,
    sr_test,
)

In [287]:
# Listen to the denoised version of the first test signal.
# (`ipd` is imported with the other dependencies at the top of the notebook.)
ipd.Audio('cleaned_test1.wav')

In [288]:
# Play the original noisy recording for comparison with the cleaned output.
ipd.Audio('test_x_01.wav')

# Load the noisy test signal test_x_02.wav and feed its magnitude spectrum to the trained network

In [0]:
# Read the second noisy test recording at its native sampling rate.
test2, sr_test = librosa.load('test_x_02.wav', sr=None)

# Same STFT settings as the training data, so each frame has 513 bins.
Test2 = librosa.stft(
    test2,
    n_fft=1024,
    hop_length=512,
    window="hann",
)

# Magnitude spectrogram that is fed to the network.
Test2_absolute = np.abs(Test2)

In [0]:
# Run the trained network on the second test signal, one row per frame.
test2_feed = {X: Test2_absolute.T}
test2_prediction = sess.run(logits, feed_dict=test2_feed)

# Recover complex valued speech spectrogram of cleaned test signal

In [0]:
# Unit-magnitude phase of the noisy test STFT. Zero-magnitude bins would give
# 0/0 = NaN, so the division is restricted to bins with positive magnitude and
# the phase is set to 0 elsewhere.
test2_phase = np.divide(
    Test2,
    Test2_absolute,
    out=np.zeros_like(Test2),
    where=Test2_absolute > 0,
)

# Attach the noisy phase to the predicted magnitudes (transposed back to
# bins x frames).
test2_complex = np.multiply(test2_phase, test2_prediction.T)


# Recover time domain speech signal by applying inverse-STFT

In [0]:
# Back to the time domain with the same hop, window length and window type as
# the forward transform.
test2_timedomain = librosa.istft(
    test2_complex,
    hop_length=512,
    win_length=1024,
    window="hann",
)

# Write out the cleaned speech signal

In [0]:
# Save the denoised signal at the sampling rate the test file was loaded with.
# NOTE(review): the `librosa.output` module is not available in newer librosa
# releases -- confirm the installed version before re-running this cell.
librosa.output.write_wav(
    'cleaned_test2.wav',
    test2_timedomain,
    sr_test,
)

In [294]:
# Listen to the denoised version of the second test signal.
# (`ipd` is imported with the other dependencies at the top of the notebook.)
ipd.Audio('cleaned_test2.wav')

In [247]:
# Play the original noisy recording for comparison with the cleaned output.
ipd.Audio('test_x_02.wav')