# <center>Speech Denoising Using 2-D CNN</center>

In [1]:
import tensorflow as tf, librosa, numpy as np, pandas as pd, matplotlib.pyplot as plt
tf.compat.v1.disable_v2_behavior() # Disabling tensorflow v2.0 behavior
np.random.seed(7) # Set random seed for numpy
tf.random.set_random_seed(7) # Set tensorflow random seed
%matplotlib inline

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Convert the audio files to spectrograms, matrices of size 513x2459
s, sr = librosa.load('train_clean_male.wav', sr = None)
S = librosa.stft(s, n_fft = 1024, hop_length = 512)
sn, sr = librosa.load('train_dirty_male.wav', sr = None)
X = librosa.stft(sn, n_fft = 1024, hop_length = 512)

In [3]:
# Printing the spectrograms
print(X)

[[ 1.29152825e-02+0.0000000e+00j  8.45048483e-03+0.0000000e+00j
  -2.07481943e-02+0.0000000e+00j ... -1.52135687e-02+0.0000000e+00j
  -4.98118671e-03+0.0000000e+00j -7.75823556e-03+0.0000000e+00j]
 [-7.06597278e-03-4.3706900e-19j -6.94738980e-03+5.1884386e-03j
   2.39305142e-02+3.3414264e-03j ...  1.20950164e-02-9.4868029e-03j
  -3.12095246e-04-1.4751647e-03j  1.96925160e-02+6.0344348e-03j]
 [ 3.11613153e-03+2.0498197e-19j -1.59678720e-02-1.7763563e-02j
   8.85831192e-03+8.3497362e-03j ... -1.23670641e-02+5.9306147e-03j
   7.67358765e-03+2.1299278e-02j -2.82987002e-02+3.5692996e-04j]
 ...
 [ 5.43509632e-05+1.1858461e-20j -4.15163022e-03-1.1022503e-03j
   1.19022408e-03-2.0613750e-03j ...  5.36997162e-04+7.6431502e-04j
  -1.19794244e-02+7.1640010e-03j -9.00672283e-03-8.2842866e-03j]
 [-4.78351954e-04+4.2351647e-19j  2.63221073e-03-1.5408223e-03j
   7.62302428e-04+5.9353854e-03j ... -3.81498365e-03-6.0208812e-03j
   5.79493213e-03-3.9956453e-03j  1.48106953e-02+1.0228334e-02j]
 [ 6.20555

In [4]:
# Getting the real valued equivalents of the spectrograms
X_mod = abs(X)
S_mod = abs(S)

In [5]:
# Dimension of the spectrograms
print(X_mod.shape, S_mod.shape)

(513, 2459) (513, 2459)


In [6]:
# Getting the transpose matrices for feeding to the 1-D CNN
X_T = X_mod.T
S_T = S_mod.T

### We need to modify our data so that 20 rows of X transpose are the input "image" of size 20 x 513 which corresponds to the 20th clean row of S transpose as the "labels" for our input image.

In [7]:
CNN_2d = tf.Graph() # Graph for the 2-D CNN

with CNN_2d.as_default(): 

    X_dirty = tf.placeholder(tf.float32, shape = [None,513])
    S_clean = tf.placeholder(tf.float32, shape = [None,513])
    batch_size = tf.placeholder(tf.int64)

    # Due to our mapping of the dataet as given above, we need a custom generator function
    def gen_data(X_dirty, S_clean):

        offset = 0 # To start from
        stride = 1 # How much to increment offset by
        data_length = X_dirty.shape[0] - 1
        while offset <= (data_length-19):
            # Create the input image and associated label data
            yield(X_dirty[offset:offset+20],S_clean[offset+19])
            offset = offset + stride

    # Create a tf dataset
    dataset = tf.data.Dataset.from_generator(gen_data,(tf.float32,tf.float32),(tf.TensorShape([20,513]),tf.TensorShape([513])),args=(X_dirty,S_clean))
    dataset = dataset.repeat()  # Loop through the dataset forever
    dataset = dataset.batch(batch_size,drop_remainder=False)  # Batch the newly created data
    dataset = dataset.prefetch(1) # Try prefetching
    iterator = dataset.make_initializable_iterator()    # Create an iterator for the dataset
    next_element = iterator.get_next()

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.


In [8]:
with CNN_2d.as_default():

    (X_nn,y_nn) = iterator.get_next() # Create the 2D CNN network

    drop_rate = tf.placeholder(tf.float32)  # Specify the drop rate for the dropout

    # Reshape X_nn to have shape = (batch_size,height,width,channels)
    X_adj = tf.reshape(X_nn,shape = [batch_size, 20, 513, 1]) # Change batch_size to y_nn.shape[20], last batch has 19 records instead of 20

    # Number of filters in each layer and their size
    layer1_kernel_num = 15
    layer2_kernel_num = 15
    layer3_kernel_num = 10

    layer1_kernel_size = [5,5]
    layer2_kernel_size = [5,5]
    layer3_kernel_size = [2,2]

    # Complex convolution layer 1
    conv1 = tf.layers.conv2d(inputs = X_adj, filters = layer1_kernel_num, kernel_size = layer1_kernel_size, activation = tf.nn.relu, padding='same')
    drop1 = tf.layers.dropout(inputs = conv1, rate = drop_rate)
    pool1 = tf.layers.max_pooling2d(inputs = drop1, pool_size = [3,3], strides = 2)

    # Complex convolution layer 2
    conv2 = tf.layers.conv2d(inputs=pool1,filters=layer2_kernel_num,kernel_size=layer2_kernel_size,activation = tf.nn.relu,padding='same')
    drop2 = tf.layers.dropout(inputs=conv2,rate=drop_rate)
    pool2 = tf.layers.max_pooling2d(inputs=drop2,pool_size=[3,3],strides=2)

    # Complex convolution layer 3
    conv3 = tf.layers.conv2d(inputs=pool2, filters=layer3_kernel_num, kernel_size=layer3_kernel_size, activation = tf.nn.relu)
    drop3 = tf.layers.dropout(inputs=conv3, rate=drop_rate)
    pool3 = tf.layers.max_pooling2d(inputs=drop3, pool_size=[2,2], strides=1)

    fl = tf.layers.flatten(inputs=pool2)  # Flatten to feed to the fully connected layer

    fc1 = tf.layers.dense(inputs=fl, units=800, activation = tf.nn.relu) # Fully connected layer 1

    fc = tf.layers.dense(inputs=fc1, units=513, activation = tf.nn.relu)  # Last fully connected layer

    loss = tf.reduce_sum(tf.squared_difference(fc, y_nn)) # Define the loss

    optimize = tf.train.AdamOptimizer(1e-04).minimize(loss) # Using Adam optimizer

Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Use keras.layers.MaxPooling2D instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.Dense instead.


## Training the 2-D CNN Model

In [11]:
with CNN_2d.as_default():
   
    train_losses = []
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(iterator.initializer, feed_dict = {X_dirty: X_T, S_clean: S_T, batch_size: 20})

    max_epochs = 100
    progbar = tf.keras.utils.Progbar(max_epochs)
    
    # Training loop
    for ep in range(max_epochs):
        batch_losses = 0

        for batch in range(122):
            loss_, __ = sess.run([loss,optimize], feed_dict = {drop_rate: 0.4, batch_size: 20})
            batch_losses = batch_losses + loss_

        train_losses.extend([(batch_losses/122)])
        progbar.update(ep+1)



## Obtain the predictions for the training audio signal with 2-D CNN

In [12]:
with CNN_2d.as_default():
    sess.run(iterator.initializer, feed_dict = {X_dirty: X_T, S_clean: S_T, batch_size: X_T.shape[0]-19})
    X_hat = sess.run(fc, feed_dict = {drop_rate: 0.0, batch_size: X_T.shape[0]-19}) # Get the predicted signal
    print(X_hat.shape)  # Get the shape of predicted signal

(2440, 513)


In [13]:
S_T1_predicted = X_hat.T # Take the transpose
S_T1_hat = np.multiply(np.divide(X[:,19:],X_mod[:,19:]),S_T1_predicted) # The predicted S_hat
S_T1 = librosa.istft(S_T1_hat,hop_length=512) # Inverse STFT
librosa.output.write_wav('train_2D_recon.wav', S_T1, sr)

## SNR of the training audio signal
## SNR = 10 log<sub>10</sub> $\frac{\sum_{t}^{} ( s ( t ) )^2} {\sum_{t}^{} ( s ( t ) - \hat{s} ( t ) )^2}$ 

In [14]:
# Formula to calculate SNR
snr = 10 * np.log10(np.sum(np.square(S[:,19:])) / np.sum(np.square(np.subtract(S[:,19:], S_T1_hat))))
print("SNR of reconstructed train signal = ", abs(snr), "db")

SNR of reconstructed train signal =  20.779985854430667 db


## Testing the 2-D CNN model on test signal 1 (test_x_01.wav)

In [15]:
s3, sr = librosa.load('test_x_01.wav', sr = None)
T1 = librosa.stft(s3, n_fft=1024, hop_length = 512) # STFT on test_x_01

T1_mod = np.abs(T1) # Magnitude of T1
T1_T = T1_mod.T     # Transpose
with CNN_2d.as_default():
    sess.run(iterator.initializer, feed_dict = {X_dirty: T1_T, S_clean: T1_T, batch_size: T1_T.shape[0]-19})
    S_T1_predicted_T = sess.run(fc, feed_dict = {drop_rate: 0.0, batch_size: T1_T.shape[0]-19})  # Get the predicted signal
S_T1_predicted = S_T1_predicted_T.T  # Take the transpose
S_T1_hat = np.multiply(np.divide(T1[:,19:], T1_mod[:,19:]), S_T1_predicted) # The predicted S_hat
S_T1 = librosa.istft(S_T1_hat, hop_length = 512)   # Inverse STFT
librosa.output.write_wav('test_s_01_2D_CNN_recon.wav', S_T1, sr)

## Testing the 2-D CNN model on test signal 2 (test_x_02.wav)

In [16]:
s3, sr = librosa.load('test_x_02.wav', sr = None)
T1 = librosa.stft(s3, n_fft = 1024, hop_length = 512) # STFT on test_x_02

T1_mod = np.abs(T1) # Magnitude of T1
T1_T = T1_mod.T  # Transpose

with CNN_2d.as_default():
    sess.run(iterator.initializer, feed_dict = {X_dirty: T1_T, S_clean: T1_T, batch_size: T1_T.shape[0]-19})
    S_T1_predicted_T = sess.run(fc, feed_dict = {drop_rate: 0.0, batch_size: T1_T.shape[0]-19})  # Get the predicted signal

    S_T1_predicted = S_T1_predicted_T.T  # Transpose
S_T1_hat = np.multiply(np.divide(T1[:,19:], T1_mod[:,19:]), S_T1_predicted) # Predicted S_hat
S_T1 = librosa.istft(S_T1_hat, hop_length = 512)  # Inverse STFT
librosa.output.write_wav('test_s_02_2D_CNN_recon.wav', S_T1, sr)

In [17]:
sess.close() # End the session