In [1]:
import numpy as np

import random
import h5py
from keras.models import *
from keras.layers import *
from keras.layers.core import *
from keras.layers.normalization import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
from keras.regularizers import *
import theano.tensor as T
import theano

import os
import random
import time
import matplotlib
import matplotlib.pyplot as plt

import scipy.signal as sig
import operator
import math
import re

# Seed both random number generators imported above (the stdlib `random`
# module and NumPy's global generator) with the same fixed value so that
# anything drawn from them is repeatable between runs.
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(1337)

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 1: GeForce GTX 690 (CNMeM is disabled, cuDNN 5105)


In [2]:
# external custom code I wrote
from load_data import *
from windowing import *
from consts import *
from utility import *
from pesq import *
from nn_blocks import *

In [3]:
# Load the four pre-saved arrays (training / validation inputs and targets)
# from the local ./pesq_dataset directory, in the order they are unpacked.
train_X, train_y, val_X, val_y = map(np.load, (
    './pesq_dataset/train_X.npy',
    './pesq_dataset/train_y.npy',
    './pesq_dataset/val_X.npy',
    './pesq_dataset/val_y.npy',
))

In [4]:
# View each stacked input array as (2, n_windows, WINDOW_SIZE, 1), then split
# it along the leading axis into a two-element list. The model built further
# down takes a list of two inputs, and these lists are what is passed to
# model.fit.
train_X = list(np.reshape(train_X, (2, -1, WINDOW_SIZE, 1)))
val_X = list(np.reshape(val_X, (2, -1, WINDOW_SIZE, 1)))

In [5]:
# Hyper-parameters consumed by siamese_half().

# Width of the branch's final Dense layer, i.e. the size of the vector each
# siamese branch outputs.
DENSE_SIZE = 32

# Second argument handed to every *_block helper in siamese_half()
# (presumably the convolution filter length -- confirm in the helper code).
FSIZE = 5

# First argument handed to every *_block helper in siamese_half()
# (presumably the number of convolution filters -- confirm in the helper code).
NFILTS = 32

def siamese_half():
    """Build one branch of the siamese network as a standalone Keras Model.

    The branch takes a (WINDOW_SIZE, 1) input and applies, in order:
    channel_increase_block, two residual_block stages (dilation 1),
    channel_decrease_block, Flatten, and a linear Dense layer.

    Returns:
        A Model mapping a (WINDOW_SIZE, 1) tensor to a vector of length
        DENSE_SIZE.
    """
    window_in = Input(shape = (WINDOW_SIZE, 1))

    # The target shape equals the declared input shape, so this Reshape does
    # not change the tensor's shape; it is kept so the layer graph is the same.
    features = Reshape((WINDOW_SIZE, 1))(window_in)

    features = channel_increase_block(NFILTS, FSIZE)(features)

    # Two identical residual stages, applied back to back.
    for _ in range(2):
        features = residual_block(NFILTS, FSIZE, dilation = 1)(features)

    features = channel_decrease_block(NFILTS, FSIZE)(features)

    flat = Flatten()(features)
    embedding = Dense(DENSE_SIZE, activation = 'linear', kernel_initializer = W_INIT)(flat)

    return Model(inputs = window_in, outputs = embedding)


# The model's two inputs, each a (WINDOW_SIZE, 1) window. Going by the names,
# one is the original window and the other its "dirty" counterpart.
input_orig = Input(shape=(WINDOW_SIZE, 1))
input_dirty = Input(shape=(WINDOW_SIZE, 1))

# A single siamese_half() model is applied to both inputs, so both branches
# run through the very same layers.
base_network = siamese_half()
processed_a, processed_b = map(base_network, (input_orig, input_dirty))

# Combine the two branch outputs with EuclideanDistance (defined outside this
# notebook), then feed the result to a single sigmoid unit, which bounds the
# model output to (0, 1).
out = EuclideanDistance()([processed_a, processed_b])
out = Dense(1, activation='sigmoid', kernel_initializer=W_INIT)(out)

model = Model(inputs=[input_orig, input_dirty], outputs=out)

# Optimise mean absolute error with Adam's default settings; mean squared
# error is reported as an additional metric.
model.compile(optimizer=Adam(), loss=['mae'], metrics=['mse'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 512, 1)        0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 512, 1)        0                                            
____________________________________________________________________________________________________
model_1 (Model)                  (None, 32)            47681                                        
____________________________________________________________________________________________________
lambda_1 (Lambda)                (None, 1)             0                                            
___________________________________________________________________________________________

In [6]:
# Train for up to 10 epochs on shuffled mini-batches of 128 samples, passing
# (val_X, val_y) as the validation data.
model.fit(
    train_X,
    train_y,
    validation_data = (val_X, val_y),
    epochs = 10,
    batch_size = 128,
    shuffle = True,
    verbose = 1,
)

Train on 129597 samples, validate on 135616 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

KeyboardInterrupt: 

In [7]:
# Save the model to 'pesq_model.h5', a path relative to the current working
# directory.
model.save('pesq_model.h5')

In [8]:
# How many speech files to request for the train, val, and test splits.
TRAIN_SIZE = 100
VAL_SIZE = 100
TEST_SIZE = 100

# load_data (imported from outside this notebook) hands back five groups,
# each unpacked here as a (train, val, test) triple.
(
    (train_paths, val_paths, test_paths),
    (train_waveforms, val_waveforms, test_waveforms),
    (train_procwave, val_procwave, test_procwave),
    (train_wparams, val_wparams, test_wparams),
    (train_windows, val_windows, test_windows),
) = load_data(TRAIN_SIZE, VAL_SIZE, TEST_SIZE)

0: /home/sri/Desktop/timit/TIMIT/TRAIN/DR1/FSJK1/SI1025.WAV1: /home/sri/Desktop/timit/TIMIT/TRAIN/DR2/MPPC0/SX152.WAV2: /home/sri/Desktop/timit/TIMIT/TRAIN/DR3/FLAC0/SI1339.WAV3: /home/sri/Desktop/timit/TIMIT/TRAIN/DR4/MLBC0/SX339.WAV4: /home/sri/Desktop/timit/TIMIT/TRAIN/DR5/FLOD0/SX117.WAV5: /home/sri/Desktop/timit/TIMIT/TRAIN/DR6/FSDJ0/SA1.WAV6: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/FCJS0/SA1.WAV7: /home/sri/Desktop/timit/TIMIT/TRAIN/DR8/MCXM0/SI721.WAV8: /home/sri/Desktop/timit/TIMIT/TRAIN/DR1/FDML0/SA1.WAV9: /home/sri/Desktop/timit/TIMIT/TRAIN/DR2/FLMC0/SI1372.WAV10: /home/sri/Desktop/timit/TIMIT/TRAIN/DR3/MAKR0/SX272.WAV11: /home/sri/Desktop/timit/TIMIT/TRAIN/DR4/FJWB1/SI2055.WAV12: /home/sri/Desktop/timit/TIMIT/TRAIN/DR5/FLOD0/SA1.WAV13: /home/sri/Desktop/timit/TIMIT/TRAIN/DR6/FSDJ0/SX305.WAV14: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/FLEH0/SI2311.WAV15: /home/sri/Desktop/timit/TIMIT/TRAIN/DR8/MKRG0/SX411.WAV16: /home/sri/Desktop/timit/TIMIT/TRAIN/DR1/MMRP0/SX5

In [9]:
# Work on copies so that train_windows itself is never modified.
orig_windows = np.copy(train_windows[0])
corrupted_windows = np.copy(orig_windows)

# NOTE(review): crange is not referenced again in the cells visible here.
crange = WINDOW_SIZE / 2

# Multiplicative Gaussian noise: every sample is scaled in place by its own
# factor drawn from a normal distribution with mean 1.0 and std 0.1.
noise_gain = np.random.normal(loc = 1.0, scale = 0.1, size = corrupted_windows.shape)
corrupted_windows *= noise_gain

In [10]:
# Print three comparisons of the clean vs. corrupted windows, one per line.
# run_pesq_windows, mse and avgErr are defined outside this notebook; the same
# train_wparams[0] is passed for both window sets.
#
# Each print is written in the single-argument parenthesised form, which
# prints the same text under Python 2 and is also valid on Python 3.
print(run_pesq_windows(orig_windows, corrupted_windows, train_wparams[0], train_wparams[0]))
print(mse(orig_windows, corrupted_windows))
print(avgErr(orig_windows, corrupted_windows))

2.157
0.000107495
0.00479659


In [11]:
# Give both window sets the (n_windows, WINDOW_SIZE, 1) layout that the
# model's two inputs are declared with.
orig_windows = np.reshape(orig_windows, (-1, WINDOW_SIZE, 1))
corrupted_windows = np.reshape(corrupted_windows, (-1, WINDOW_SIZE, 1))

# Average the model's per-window outputs into a single score.
pred = np.mean(model.predict([orig_windows, corrupted_windows]))

# The model ends in a sigmoid, so its output lies in (0, 1); this affine map
# stretches that to (1.0, 4.5). Presumably it undoes a normalisation applied
# to the PESQ targets in train_y -- confirm against the dataset-building code.
pred = pred * 3.5 + 1.0

# Single-argument parenthesised print: same output on Python 2, valid on
# Python 3 as well.
print(pred)

2.85794138908
