In [1]:
import numpy as np

import random
import h5py
from keras.models import *
from keras.layers import *
from keras.layers.core import *
from keras.layers.normalization import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
from keras.regularizers import *
from keras.initializers import *
from keras.models import load_model
from keras.losses import *
from sklearn import metrics
from sklearn.preprocessing import *
from scipy.fftpack import dct, idct
from keras.activations import softmax
from sklearn.cluster import KMeans, MiniBatchKMeans
import scipy.io.wavfile as sciwav

import os
import random
import time
import matplotlib
import matplotlib.pyplot as plt
import glob

import operator
import math
import re

# increase recursion limit for adaptive VQ
import sys
sys.setrecursionlimit(40000)

# for reproducibility: give the stdlib generator and NumPy's global generator
# the same fixed seed
random.seed(1337)
np.random.seed(1337)

Using TensorFlow backend.


In [2]:
# control amount of GPU memory used: build a session config with the
# allow_growth GPU option switched on and install a session that uses it as
# the Keras backend session
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto(gpu_options = tf.GPUOptions(allow_growth = True))
set_session(tf.Session(config = config))

In [3]:
# external custom code I wrote
from load_data import *
from windowing import *
from nn_util import *
from pesq import *
from consts import *
from nn_blocks import *
from perceptual_loss import *
from evaluation import *

In [4]:
# number of speech files for train, val, and test
TRAIN_SIZE = 1000
VAL_SIZE = 100
TEST_SIZE = 500

# load_data is unpacked into five groups, each holding a
# (train, val, test) triple
(
    [train_paths, val_paths, test_paths],
    [train_waveforms, val_waveforms, test_waveforms],
    [train_procwave, val_procwave, test_procwave],
    [train_wparams, val_wparams, test_wparams],
    [train_windows, val_windows, test_windows],
) = load_data(TRAIN_SIZE, VAL_SIZE, TEST_SIZE)

In [6]:
# custom layer classes that load_model needs in order to rebuild the saved
# network, keyed by the names used below
KERAS_LOAD_MAP = dict(PhaseShiftUp1D = PhaseShiftUp1D,
                      SoftmaxQuantization = SoftmaxQuantization,
                      SoftmaxDequantization = SoftmaxDequantization)

autoencoder = load_model('best_auto.h5', custom_objects = KERAS_LOAD_MAP)

# set the QUANTIZATION_ON backend variable to True
K.set_value(QUANTIZATION_ON, True)



In [9]:
# test model speed
# Runs the autoencoder over val_windows[0] (presumably the windows of the
# first validation recording -- confirm against load_data), one window per
# batch, and reports the mean wall-clock time per window in milliseconds.
w = val_windows[0].reshape(-1, WINDOW_SIZE, 1)

# Untimed warm-up: the first predict() on a freshly loaded model also pays
# one-time setup costs (building the predict function, backend start-up),
# which would otherwise be averaged into the per-window figure below.
autoencoder.predict(w[:1], batch_size = 1, verbose = 0)

start = time.time()
autoencoder.predict(w, batch_size = 1, verbose = 0)
end = time.time()

# seconds for the whole run -> milliseconds per window
averageMs = (end - start) / w.shape[0] * 1000.0

# one pre-formatted string, so the call prints the same text whether `print`
# is the Python 2 statement or the Python 3 function
print("Averaged %s ms per window" % (averageMs,))

Averaged 20.7105657329 ms per window


In [6]:
# Run the three demo recordings through the autoencoder, each one twice:
# first with the default settings, then with argmax = True. Both calls of a
# pair receive the same second argument.
demo_recordings = [("./SA1.wav", "SA1_final"),
                   ("./SX383.wav", "SX383_final"),
                   ("./fiveYears.wav", "fy_final")]

for wav_file, label in demo_recordings:
    test_model_on_wav(wav_file, label, autoencoder)
    last_result = test_model_on_wav(wav_file, label, autoencoder, argmax = True)

# bare expression so the notebook still displays the final call's return value
last_result

MSE:         2775.49
Avg err:     31.9091
PESQ:        3.92686700821
MSE:         2815.66
Avg err:     32.1905
PESQ:        3.91300988197
MSE:         2967.67
Avg err:     24.5403
PESQ:        4.00055789948
MSE:         2987.09
Avg err:     24.7282
PESQ:        3.97957181931
MSE:         1.07846e+06
Avg err:     736.161
PESQ:        3.86556792259
MSE:         1.08235e+06
Avg err:     737.834
PESQ:        3.86135601997


[1082353.0, 737.83405, 3.861356019973755]

In [7]:
# Evaluate the autoencoder (with argmax = True) on every file of the
# training, validation and test splits, then print the mean, maximum and
# minimum of the per-file scores for each split.
captions = ["training", "validation", "test"]
datasets = [train_paths, val_paths, test_paths]

# walk the captions and their path lists together instead of indexing both
# by position
for caption, split_paths in zip(captions, datasets):
    print("Model evaluation (" + caption + ")")

    base_scores = []
    for path in split_paths:
        # element [2] of the result is the PESQ value (the result is laid out
        # as [MSE, average error, PESQ]); saving and per-file printing are
        # switched off for this bulk run
        # NOTE(review): named pesq_score rather than pesq so the loop does not
        # rebind anything `from pesq import *` may have put in this namespace
        pesq_score = test_model_on_wav(path, "", autoencoder,
                                       save_recons = False,
                                       verbose = False,
                                       argmax = True)[2]
        base_scores.append(pesq_score)

    # each line is formatted into a single string first, so the output is the
    # same whether `print` is the Python 2 statement or the Python 3 function
    print("Mean: %s" % (np.mean(base_scores),))
    print("Max:  %s" % (np.max(base_scores),))
    print("Min:  %s" % (np.min(base_scores),))
    print("")

Model evaluation (training)
Mean: 4.21994055176
Max:  4.54800462723
Min:  2.29520010948

Model evaluation (validation)
Mean: 4.34140196562
Max:  4.52157258987
Min:  3.53952479362

Model evaluation (test)
Mean: 4.21966117287
Max:  4.51846265793
Min:  2.42620277405

