In [0]:
import numpy as np
import tensorflow  as tf
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.contrib.layers import fully_connected
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from matplotlib import cm
from sklearn.manifold import TSNE
import pandas as pd
import librosa
from google.colab import files


In [78]:
files.upload()

{}

In [0]:
X = tf.placeholder(tf.float32, shape=(None, 513))
y = tf.placeholder(tf.float32, shape=(None,513))

In [0]:
#Number of hidden layers
n_hl = 3

#Number of neurons in each layer
n_nodes_hl = [512]*n_hl

#Size of input
n_inputs = 513

#Number of outputs (regression problem)
n_class = 513

In [0]:
#Neural Network with specified hidden layers 
hidden_layer = [0]*n_hl

#First hidden layer
hidden_layer[0] = fully_connected(X, n_nodes_hl[0])

#Subsequent hidden layer
for i in range(1,n_hl):
  hidden_layer[i] = fully_connected(hidden_layer[i-1],n_nodes_hl[i])

#Last hidden layer
last_hl = hidden_layer[-1]

#Output layer
logits = fully_connected(hidden_layer[-1], n_class, activation_fn=None)
  

In [0]:
#Loading train files
s, sr=librosa.load('train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load('train_dirty_male.wav', sr=None) 
Xs=librosa.stft(sn, n_fft=1024, hop_length=512)

In [0]:
#Loading test files
test1, sr1 = librosa.load('test_x_01.wav', sr=None)
t1 = librosa.stft(test1, n_fft=1024, hop_length=512)
test2, sr2 = librosa.load('test_x_02.wav', sr=None)
t2 = librosa.stft(test2, n_fft=1024, hop_length=512)

In [0]:
#Taking absolute of input and output of training file
Xs_abs =  np.abs(Xs).T
S_abs = np.abs(S).T

In [0]:
#Taking absolute of input and output of test file
t1_abs = np.abs(t1).T
t2_abs = np.abs(t2).T

In [0]:
#Taking the output of neural network and passing it through relu function to get a positive output
reg_op = tf.nn.relu(logits)

In [0]:
#Mean Square Error(MSE) - Defining the error calculated using output of the NN and the given output
mse = tf.losses.mean_squared_error(labels=y, predictions=reg_op)
loss = tf.reduce_mean(mse, name="loss")

#Calculation of Rsquared, which is a measure of error for a regression problem
total_error = tf.reduce_sum(tf.square(tf.subtract(y, tf.reduce_mean(y))))
unexplained_error = tf.reduce_sum(tf.square(tf.subtract(y, reg_op)))
R_squared = tf.subtract(1.0, tf.div(unexplained_error, total_error))

In [0]:
lr = 0.0003

#Using Adam Optimizer for minimizing the loss
optimizer2 = tf.train.AdamOptimizer(lr).minimize(loss)

In [0]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [36]:
#Training the NN
n_epochs = 100
batch_size = 20

with tf.Session() as sess:
  init.run()
  for epoch in range(n_epochs):
    for iteration in range(Xs_abs.shape[0] // batch_size):
      X_batch = Xs_abs[batch_size*iteration:batch_size*(iteration+1)]
      y_batch = S_abs[batch_size*iteration:batch_size*(iteration+1)]
      sess.run(optimizer2, feed_dict = {X: X_batch, y: y_batch})
    acc_train = R_squared.eval(feed_dict={X: X_batch, y: y_batch})
    #acc_test = R_squared.eval(feed_dict={X2: mnist.test.images, y2:mnist.test.labels})
    if epoch % 10 == 0:
      print(epoch, "Train accuracy:", acc_train)
    #, "Test accuracy:", acc_test
    
  save_path = saver.save(sess, "./audio_model.ckpt")

10 Train accuracy: 0.7752273
20 Train accuracy: 0.84042567
30 Train accuracy: 0.8374022
40 Train accuracy: 0.8328972
50 Train accuracy: 0.87562525
60 Train accuracy: 0.85982764
70 Train accuracy: 0.8822912
80 Train accuracy: 0.880097
90 Train accuracy: 0.90460974
0 Train accuracy: 0.5329154
10 Train accuracy: 0.8033442
20 Train accuracy: 0.8335345
30 Train accuracy: 0.7890721
40 Train accuracy: 0.86957955
50 Train accuracy: 0.8487057
60 Train accuracy: 0.865092
70 Train accuracy: 0.90314734
80 Train accuracy: 0.8958189
90 Train accuracy: 0.86618704


In [0]:
#Function to recover clean audio from input in  time domain after running through the trained NN
def speech_recovery(data):
  X1 = librosa.stft(data, n_fft=1024, hop_length=512)
  X_abs = np.abs(X1).T
  with tf.Session() as sess:
    saver.restore(sess, "./audio_model.ckpt")
    clean = reg_op.eval(feed_dict = {X: X_abs})
        
    #Hadamard product to construct clean test signal
    sabs = tf.multiply(tf.div(X1,np.abs(X1)),clean.T)
    
    sbar = sabs.eval(feed_dict={X: X_abs})
    
    #inverse stft to bring the signal to time domain
    s = librosa.core.istft(sbar, hop_length=512, length=len(data))
    
    return s  
  

In [38]:
#Recovering the vector for clean audio of test1 after running through the NN
test1_clean = speech_recovery(test1)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./audio_model.ckpt
INFO:tensorflow:Restoring parameters from ./audio_model.ckpt


In [0]:
librosa.output.write_wav('test_s_01_recons.wav', test1_clean, sr1)

In [0]:
files.download('test_s_01_recons.wav')

In [0]:
#Recovering the vector for clean audio of test2
test2_clean = speech_recovery(test2)

INFO:tensorflow:Restoring parameters from ./audio_model.ckpt


In [0]:
librosa.output.write_wav('test_s_02_recons.wav', test2_clean, sr2)
files.download('test_s_02_recons.wav')

In [43]:
train_clean = speech_recovery(sn)

INFO:tensorflow:Restoring parameters from ./audio_model.ckpt


In [45]:
#SNR - Signal to Noise Ratio for input file
num = np.sum(np.square(train_clean))
den = np.sum(np.square(np.subtract(s, train_clean)))
snr = 10*np.log10(num/den)
print('Signal to Noise Ratio for input file:',snr)

Signal to Noise Ratio for input file: 13.15358281135559
