In [1]:
import numpy as np

import random
import h5py
from keras.models import load_model

import os
import random
import time

# Fix the seeds of both NumPy's and Python's global random generators so that
# repeated runs of this notebook draw the same random numbers.
for seed_rng in (np.random.seed, random.seed):
    seed_rng(1337)

Using TensorFlow backend.


In [2]:
# Control the amount of GPU memory used: build a session config with
# `allow_growth` switched on and install a session using it as the one
# Keras runs on.
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto(gpu_options = tf.GPUOptions(allow_growth = True))
set_session(tf.Session(config = config))

In [3]:
# external custom code I wrote
from load_data import *
from windowing import *
from nn_util import *
from pesq import *
from consts import *
from nn_blocks import *
from perceptual_loss import *
from evaluation import *

In [4]:
# load_data hands back five groups, each of which is split three ways into
# train / validation / test. Unpack all fifteen pieces in a single assignment.
# NOTE(review): the meaning of each group (paths, waveforms, processed
# waveforms, window parameters, windows) is inferred from the names only --
# confirm against load_data.
(
    (train_paths,     val_paths,     test_paths),
    (train_waveforms, val_waveforms, test_waveforms),
    (train_procwave,  val_procwave,  test_procwave),
    (train_wparams,   val_wparams,   test_wparams),
    (train_windows,   val_windows,   test_windows),
) = load_data(TRAIN_SIZE, VAL_SIZE, TEST_SIZE)

In [5]:
# Custom layer classes referenced by the saved model; Keras needs this
# name -> class mapping to rebuild them when deserializing.
KERAS_LOAD_MAP = {
    'PhaseShiftUp1D' : PhaseShiftUp1D,
    'SoftmaxQuantization' : SoftmaxQuantization,
    'SoftmaxDequantization' : SoftmaxDequantization,
}

autoencoder = load_model('best_auto.h5', custom_objects = KERAS_LOAD_MAP)

# Switch the QUANTIZATION_ON backend variable to True for everything below.
# NOTE(review): `K` and `QUANTIZATION_ON` are not imported explicitly here;
# presumably they arrive through one of the star imports -- confirm.
K.set_value(QUANTIZATION_ON, True)



In [6]:
# Pull the two halves of the coder out of the loaded model by layer position.
# NOTE(review): assumes layers[1] is the encoder sub-model and layers[2] the
# decoder sub-model (layers[0] presumably being the input) -- confirm.
encoder, decoder = autoencoder.layers[1], autoencoder.layers[2]

# Test speech coder speed

In [7]:
windows = np.random.uniform(-1.0, 1.0, (150, WINDOW_SIZE))

# test encoder
start = time.time()
encoded = encoder.predict(windows, batch_size = 1, verbose = 0)
end = time.time()

averageMs = (end - start) / encoded.shape[0] * 1000.0
print "Encoder: Averaged", averageMs, "ms per window"

# test encoder
start = time.time()
decoded = decoder.predict(encoded, batch_size = 1, verbose = 0)
end = time.time()

averageMs = (end - start) / decoded.shape[0] * 1000.0
print "Decoder: Averaged", averageMs, "ms per window"

Encoder: Averaged 4.1894197464 ms per window
Decoder: Averaged 2.86569436391 ms per window


# Test speech coder quality

In [8]:
# (input wav, output name) pairs to run through the autoencoder. Each file is
# evaluated twice under the same output name: first with the default settings,
# then with argmax = True.
wav_cases = [("./SA1.wav", "SA1_final"),
             ("./SX383.wav", "SX383_final"),
             ("./fiveYears.wav", "fy_final")]

for wav_path, out_name in wav_cases:
    test_model_on_wav(wav_path, out_name, autoencoder)
    last_metrics = test_model_on_wav(wav_path, out_name, autoencoder, argmax = True)

# Leave the metrics of the final call as the cell's displayed value.
last_metrics

MSE:         2905.23
Avg err:     33.7247
PESQ:        3.89142799377
MSE:         2957.77
Avg err:     34.0851
PESQ:        3.85853695869
MSE:         2510.51
Avg err:     25.8492
PESQ:        3.95614433289
MSE:         2525.63
Avg err:     26.0233
PESQ:        3.93742275238
MSE:         1.09881e+06
Avg err:     767.414
PESQ:        3.83553242683
MSE:         1.10356e+06
Avg err:     769.528
PESQ:        3.82843208313


[1103563.1, 769.5282, 3.828432083129883]

In [9]:
captions = ["training", "validation", "test"]
datasets = [train_paths, val_paths, test_paths]

for i in xrange(0, 3):
    print "Model evaluation (" + captions[i] + ")"

    base_scores = []
    for path in datasets[i]:
        pesq = test_model_on_wav(path, "", autoencoder,
                                 save_recons = False,
                                 verbose = False,
                                 argmax = True)[2]
        base_scores.append(pesq)

    print "Mean:", np.mean(base_scores)
    print "Max: ", np.max(base_scores)
    print "Min: ", np.min(base_scores)
    print ""

Model evaluation (training)
Mean: 4.16111077881
Max:  4.51238441467
Min:  2.31207442284

Model evaluation (validation)
Mean: 4.27203153372
Max:  4.48932218552
Min:  3.47763538361

Model evaluation (test)
Mean: 4.15086880255
Max:  4.49495267868
Min:  2.33864164352

