# Speech Denoising Using GRU

### Imports

In [1]:
import numpy as np
import librosa
import os
import tensorflow as tf
import tensorflow.keras.layers as layers

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Paths

In [2]:
train_path = "./homework3/tr/"
test_path = "./homework3/te/"
val_path = "./homework3/v/"

## Training

### Training Files List

In [3]:
trainList = os.listdir(train_path)
trainList.sort()

### Create Lists

Create List containing all Noise Signals

In [4]:
noise_train = []
# maxi = 0  
for i in range(1200):
    s, sr = librosa.load(train_path+trainList[i], sr=None)
    S = librosa.stft(s, n_fft=1024, hop_length=512)
    S = np.abs(S)
    pad = np.zeros((513, 178))
    pad[:, :S.shape[1]] = S 
    noise_train.append(pad.T)

Create List containing all Clean Signals

In [5]:
clean_train = []
for i in range(1200, 2400):
    s, sr = librosa.load(train_path+trainList[i], sr=None)
    S = librosa.stft(s, n_fft=1024, hop_length=512)
    S = np.abs(S)
    pad = np.zeros((513, 178))
    pad[:, :S.shape[1]] = S 
    clean_train.append(pad.T)

Create List containing all Noisy Signals

In [6]:
noisy_train = []
for i in range(2400, 3600):
    s, sr = librosa.load(train_path+trainList[i], sr=None)
    S = librosa.stft(s, n_fft=1024, hop_length=512)
    S = np.abs(S)
    pad = np.zeros((513, 178))
    pad[:, :S.shape[1]] = S 
    noisy_train.append(pad.T)

### Calculate IBM Mask

In [7]:
def ibm(clean, noise):
    
    ibm_mask = np.zeros((clean.shape))
    ibm_mask[clean > noise] = 1
    
    return ibm_mask

Create List containing all IBM Masks

In [8]:
ibm_train = []
for i in range(1200):
    ibm_mask = ibm(clean_train[i], noise_train[i])
    ibm_train.append(ibm_mask)

### Create model

In [10]:
def gru_model():
    model = tf.keras.Sequential();
    model.add(layers.GRU(513, return_sequences=True))
    model.add(layers.Dropout(rate = 0.2))
    
    model.add(layers.GRU(513, return_sequences=True))
    model.add(layers.Dropout(rate = 0.2))
    
    model.add(layers.Dense(513, activation='sigmoid'))
    return model

### Create a keras model object

In [11]:
model = gru_model()

Stacking stft frames to get dimensions as number_of_files x segments x 513

In [12]:
new_ibm_train = np.stack(ibm_train)
new_noisy_train = np.stack(noisy_train)

In [13]:
new_ibm_train.shape

(1200, 178, 513)

### Compile the Model and Start Training

In [14]:
model.compile(optimizer = tf.keras.optimizers.Adam(), loss = tf.keras.losses.BinaryCrossentropy())
model.fit(new_noisy_train, new_ibm_train, epochs=20, batch_size=10)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f60b6df9780>

## Validation

Training Files List

In [15]:
valList = os.listdir(val_path)
valList.sort()

### Create Lists for Validation

Create List containing all Noise Signals

In [16]:
noise_val_abs = []
noise_val = []

for i in range(1200):
    s, sr = librosa.load(val_path+valList[i], sr=None)
    S = librosa.stft(s, n_fft=1024, hop_length=512)

    pad = np.zeros((513, 178))
    pad[:, :S.shape[1]] = S
    
    pad = pad.T
    
    abs_value = np.abs(pad)
    
    noise_val.append(pad) 
    noise_val_abs.append(abs_value)

  if __name__ == '__main__':


Create List containing all Clean Signals

In [17]:
clean_val = []
clean_val_abs = []

# maxi = 0  
for i in range(1200, 2400):
    s, sr = librosa.load(val_path+valList[i], sr=None)
    S = librosa.stft(s, n_fft=1024, hop_length=512)

    pad = np.zeros((513, 178))
    pad[:, :S.shape[1]] = S
    
    pad = pad.T
    abs_value = np.abs(pad)
    
    clean_val.append(S) 
    clean_val_abs.append(abs_value)

  # Remove the CWD from sys.path while we load stuff.


Create List containing all Noisy Signals

In [18]:
noisy_val = []
noisy_val_abs = []

# maxi = 0  
for i in range(2400, 3600):
    s, sr = librosa.load(val_path+valList[i], sr=None)
    S = librosa.stft(s, n_fft=1024, hop_length=512)

    pad = np.zeros((513, 178))
    pad[:, :S.shape[1]] = S
    
    pad = pad.T
    abs_value = np.abs(pad)
    
    noisy_val.append(pad) 
    noisy_val_abs.append(abs_value)

  # Remove the CWD from sys.path while we load stuff.


Create List containing all IBM Masks

In [19]:
ibm_val = []
for i in range(1200):
    ibm_mask = ibm(clean_val_abs[i], noise_val_abs[i])
    ibm_val.append(ibm_mask)

Stacking stft frames to get dimensions as number_of_files x segments x 513

In [20]:
new_ibm_val = np.stack(ibm_val)
new_noisy_val = np.stack(noisy_val_abs)

In [21]:
new_ibm_val.shape

(1200, 178, 513)

In [22]:
new_noisy_val.shape

(1200, 178, 513)

### Check Validation Loss

In [23]:
model.evaluate(new_noisy_val, new_ibm_val, batch_size=10)



0.2678025668486953

### Get the predicted IBM masks for Validation Set

In [24]:
Getmasks = model.predict(new_noisy_val, batch_size=10)

Stacking stft frames to get dimensions as number_of_files x segments x 513

In [25]:
new_noisy_val_complex = np.stack(noisy_val)

In [26]:
new_noisy_val_complex.shape

(1200, 178, 513)

### Multiply the IBM Mask with Noisy Speech to get Clean Speech

In [27]:
S_recon = np.multiply(masks, new_noisy_val_complex)

### Calculate SNR

In [28]:
def snr(clean, recon):
    print(clean.shape)
    recon = recon[:clean.shape[0]]
    return 10*np.log10(np.sum(clean**2)/np.sum(clean - recon)**2)

In [29]:
clean_val[0].shape

(513, 65)

### Calculating Average SNR of the Validation Set

In [30]:
avg_snr = 0
for i in range(1200):
    print(i)
    c = librosa.istft(clean_val[i], win_length=1024, hop_length=512)
    r = librosa.istft(S_recon[i].T, win_length=1024, hop_length=512)
    new_snr = snr(c, r)
    print(new_snr)
    avg_snr += new_snr
    
avg_snr /= 1200

0
(32768,)
17.086732134715493
1
(32768,)
19.97048753021257
2
(32768,)
17.31621421682287
3
(32768,)
25.257293242610324
4
(32768,)
17.624635251910068
5
(32768,)
17.80715238059592
6
(32768,)
17.916363544140435
7
(32768,)
37.65166943441137
8
(32768,)
18.61337045407922
9
(32768,)
15.096537765239553
10
(66048,)
19.646598221712537
11
(66048,)
27.537725528410974
12
(66048,)
19.192541517192243
13
(66048,)
24.48793721860271
14
(66048,)
19.269260728956073
15
(66048,)
27.24476161492956
16
(66048,)
19.78179607358945
17
(66048,)
49.205529700536644
18
(66048,)
13.70239142585839
19
(66048,)
18.203415798324723
20
(37888,)
23.345084525726723
21
(37888,)
25.935868640971393
22
(37888,)
17.44814039439405
23
(37888,)
20.1628028138471
24
(37888,)
21.55320410640347
25
(37888,)
18.100370919114205
26
(37888,)
19.169197475493746
27
(37888,)
50.62604505767339
28
(37888,)
26.880237539350468
29
(37888,)
21.675413323741477
30
(34816,)
65.70234510256921
31
(34816,)
26.75749837269295
32
(34816,)
29.220884103127418
33


(52224,)
15.845296061701838
266
(52224,)
12.671415941512258
267
(52224,)
23.759569033284578
268
(52224,)
10.722961409407546
269
(52224,)
14.176663490005172
270
(73216,)
11.64899083572299
271
(73216,)
12.49668676644992
272
(73216,)
8.238692456894672
273
(73216,)
16.365391511131957
274
(73216,)
12.413262920678697
275
(73216,)
13.763552176021516
276
(73216,)
13.398906350059814
277
(73216,)
23.670062699237793
278
(73216,)
18.70687335959172
279
(73216,)
11.48743472442093
280
(56320,)
14.416855061520046
281
(56320,)
16.68572211089283
282
(56320,)
16.51697794551702
283
(56320,)
12.09758715712081
284
(56320,)
13.262170057321264
285
(56320,)
16.487161766890633
286
(56320,)
14.934068953496606
287
(56320,)
24.44554485662694
288
(56320,)
14.441938627945287
289
(56320,)
13.274439800489484
290
(73216,)
13.405945397810896
291
(73216,)
14.321472847601115
292
(73216,)
12.332714354961265
293
(73216,)
12.982013552037504
294
(73216,)
13.156495455555483
295
(73216,)
15.955045292529661
296
(73216,)
23.07142

(51200,)
16.734435807113965
554
(51200,)
13.149520242944453
555
(51200,)
19.483156073073616
556
(51200,)
14.136981850353273
557
(51200,)
26.483549332048646
558
(51200,)
17.66231001705622
559
(51200,)
19.10286396206512
560
(37376,)
11.330469832446113
561
(37376,)
13.902872014696655
562
(37376,)
43.05032409572101
563
(37376,)
26.99466601327083
564
(37376,)
11.515672851781408
565
(37376,)
16.754619120487142
566
(37376,)
14.820761967596878
567
(37376,)
32.028322211568764
568
(37376,)
8.262412417413005
569
(37376,)
11.465466080442505
570
(64000,)
9.565503249956883
571
(64000,)
11.298788661081696
572
(64000,)
8.67169498545608
573
(64000,)
12.536917197160896
574
(64000,)
8.769874469733113
575
(64000,)
19.977385780282773
576
(64000,)
20.70803398252953
577
(64000,)
43.52313176941135
578
(64000,)
8.577080290843863
579
(64000,)
8.506407665785897
580
(38400,)
10.135656602140557
581
(38400,)
15.032106741836786
582
(38400,)
9.87024134781346
583
(38400,)
20.8076617215022
584
(38400,)
16.5986881788453

(46592,)
20.671352207459073
824
(46592,)
18.27080933437367
825
(46592,)
23.9123668518693
826
(46592,)
15.02839252398594
827
(46592,)
40.099573984548115
828
(46592,)
30.99893179477544
829
(46592,)
15.233099714368976
830
(51200,)
15.061478940895547
831
(51200,)
16.821268436651145
832
(51200,)
11.932218313124363
833
(51200,)
18.649888163658076
834
(51200,)
19.317371323673274
835
(51200,)
16.63414901652384
836
(51200,)
16.40366255327446
837
(51200,)
34.97455604797268
838
(51200,)
14.302569424378085
839
(51200,)
14.650819971807376
840
(38400,)
17.19948797060319
841
(38400,)
15.717908270612133
842
(38400,)
19.695606383230817
843
(38400,)
19.96275557383368
844
(38400,)
17.06016373123022
845
(38400,)
17.03145919720357
846
(38400,)
26.030989451175216
847
(38400,)
24.22960326051453
848
(38400,)
13.509646794972651
849
(38400,)
17.856118450060784
850
(44032,)
15.570714473847394
851
(44032,)
17.558760411463258
852
(44032,)
13.98052157637686
853
(44032,)
20.93526804530129
854
(44032,)
17.74372793339

(50688,)
27.94757802455632
1086
(50688,)
21.946022906450967
1087
(50688,)
32.85837982732075
1088
(50688,)
40.809906541567926
1089
(50688,)
38.535089428573784
1090
(40448,)
29.668794730762457
1091
(40448,)
37.37324173763231
1092
(40448,)
29.542274865213997
1093
(40448,)
36.40324183511287
1094
(40448,)
27.51878183560713
1095
(40448,)
28.26156085299015
1096
(40448,)
36.24829947486337
1097
(40448,)
35.392382033078334
1098
(40448,)
36.73845537547602
1099
(40448,)
34.267620148986936
1100
(39936,)
32.67512126945927
1101
(39936,)
33.54970973044617
1102
(39936,)
25.017338098777802
1103
(39936,)
29.222973015606666
1104
(39936,)
43.747302033735664
1105
(39936,)
25.627562671158095
1106
(39936,)
26.53387378964102
1107
(39936,)
33.007751699290516
1108
(39936,)
18.01897910608131
1109
(39936,)
36.4454961292966
1110
(38912,)
25.656469123844005
1111
(38912,)
26.128038032057262
1112
(38912,)
19.014238691241175
1113
(38912,)
26.563290886601514
1114
(38912,)
23.393161840404595
1115
(38912,)
27.282833687794

In [31]:
print(avg_snr)

24.008218650585924


## Testing

### Testing Files List

In [32]:
testList = os.listdir(test_path)
testList.sort()

In [33]:
len(testList)

400

### Create List containing all Noisy Signals

In [34]:
noisy_test_abs = []
noisy_test = []

for i in range(400):
    s, sr = librosa.load(test_path+testList[i], sr=None)
    S = librosa.stft(s, n_fft=1024, hop_length=512)

    pad = np.zeros((513, 195))
    pad[:, :S.shape[1]] = S
    
    pad = pad.T
    
    abs_value = np.abs(pad)
    
    noisy_test.append(pad) 
    noisy_test_abs.append(abs_value)

  if __name__ == '__main__':


Stacking stft frames to get dimensions as number_of_files x segments x 513

In [35]:
new_noisy_test_abs = np.stack(noisy_test_abs)

### Get the predicted IBM masks for the Test Set

In [36]:
masks = model.predict(new_noisy_test_abs, batch_size=10)

Stacking stft frames to get dimensions as number_of_files x segments x 513

In [37]:
new_noisy_test_complex = np.stack(noisy_test)

### Multiply the IBM Mask with Noisy Speech to get Clean Speech

In [38]:
S_recon = np.multiply(masks, new_noisy_test_complex)

### Save all the reconstructed files

In [39]:
save_path = './homework3/test_wav/'

Take inverse STFT and truncate the signals to the length of the original noisy signal and then save the corresponding signal

In [40]:
for i in range(400):
    s, sr = librosa.load(test_path+testList[i], sr=None)
    recon = librosa.istft(S_recon[i].T, win_length=1024, hop_length=512)
    recon = recon[:len(s)]
    librosa.output.write_wav(save_path+testList[i]+"_cleaned.wav", recon, sr)


## Things Noticed
I am getting a High SNR ~24. I thought the model was overfitting the data but then I generated the Clean Test Signals and I am getting a clear output.  
Also, if the model was overfitting I would get a High SNR just on the training set and not on the Validation Set