In [1]:
import numpy as np

import random
import h5py
from keras.models import *
from keras.layers import *
from keras.layers.core import *
from keras.layers.normalization import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
from keras.regularizers import *
from keras.initializers import *
from keras.models import load_model
import theano.tensor as T
import theano
from sklearn import metrics
from sklearn.preprocessing import *
from scipy.fftpack import dct, idct

import os
import random
import time
import matplotlib
import matplotlib.pyplot as plt
import glob

import operator
import math
import re

# for reproducibility
np.random.seed(1337) 
random.seed(1337)

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 690 (CNMeM is disabled, cuDNN 5105)


In [2]:
# external custom code I wrote
from load_data import *
from windowing import *
from utility import *
from pesq import *
from noise import *
from consts import *
from nn_blocks import *

In [3]:
# shuffle the flattened training windows before training? (read by the cell that
# builds train_processed)
RANDOM_SHUFFLE = True

# number of speech files for train, val, and test (passed to load_data)
TRAIN_SIZE = 1000
VAL_SIZE = 100
TEST_SIZE = 500

# during training, we evaluate MSE, average error and PESQ on full speech files every
# epoch, which is kind of expensive. so instead of using the full training and validation
# sets, evaluate_training randomly selects this many waveforms from each
TRAIN_EVALUATE = 50
VAL_EVALUATE = 50

In [4]:
# load_data returns five [train, val, test] triples, unpacked here in this order.
# Names reflect how they are used later in this notebook: *_windows[i] holds the
# windows of speech file i and *_wparams[i] the matching un-preprocessing parameters.
# NOTE(review): the exact contents of *_paths / *_waveforms / *_procwave are defined
# in load_data.py, which is not shown here.
[train_paths, val_paths, test_paths], \
[train_waveforms, val_waveforms, test_waveforms], \
[train_procwave, val_procwave, test_procwave], \
[train_wparams, val_wparams, test_wparams], \
[train_windows, val_windows, test_windows] = load_data(TRAIN_SIZE, VAL_SIZE, TEST_SIZE)



In [5]:
# flatten all of the train windows into vectors
train_processed = np.array([i for z in train_windows for i in z])
train_processed = np.reshape(train_processed, (train_processed.shape[0], WINDOW_SIZE, 1))

# randomly shuffle data, if we want to
if (RANDOM_SHUFFLE):
    train_processed = np.random.permutation(train_processed)
    
print train_processed.shape
print np.mean(train_processed, axis=None)
print np.std(train_processed, axis=None)
print np.min(train_processed, axis = None)
print np.max(train_processed, axis = None)

(101814, 512, 1)
6.41179e-06
0.103588
-1.0
1.0


In [6]:
# per-sample input shape shared by all models below: one window with a single channel
input_dim = (WINDOW_SIZE, 1)

In [7]:
from keras.activations import softmax

# softmax hardness variable: SoftmaxQuantization computes softmax(tau * -distance),
# so a larger tau pushes the bin probabilities towards one-hot
tau = K.variable(1500.0, name = "hardness")
# the training loop multiplies tau by anneal_rate after every batch, capped at max_tau.
# with these values tau already starts at max_tau, so it stays constant
anneal_rate = 1.01
max_tau = 1500.00

# number of consecutive samples quantized together as one vector
VEC_SIZE = 4
# initial codebook: a regular grid with 4 points per dimension on [-1, 1] in
# VEC_SIZE dimensions, i.e. 4**4 = 256 rows of length VEC_SIZE
BINS_INIT = np.mgrid[-1:1:4j, -1:1:4j, -1:1:4j, -1:1:4j].reshape(VEC_SIZE, -1).T
# backend variable holding the codebook (handed to Keras as a trainable weight
# by SoftmaxQuantization.build)
QUANT_BINS = K.variable(BINS_INIT)

def unquantize_batch(one_hot):
    """Map a batch of per-vector bin weights back to flat sample vectors.

    `one_hot` is contracted along its axis 2 with the rows of QUANT_BINS, and the
    resulting vectors of each batch entry are concatenated, giving a 2-D tensor
    with `one_hot.shape[1] * VEC_SIZE` columns.
    """
    vectors = T.tensordot(one_hot, QUANT_BINS, axes = [2, 0])
    n_rows, n_vecs = vectors.shape[0], vectors.shape[1]
    return K.reshape(vectors, (n_rows, n_vecs * VEC_SIZE))

def unquantize_vec(one_hot):
    """Map the bin weights of a single window back to a flat (WINDOW_SIZE,) vector.

    `one_hot` is contracted along its axis 1 with the rows of QUANT_BINS; the
    result is reshaped to WINDOW_SIZE samples.
    """
    vectors = T.tensordot(one_hot, QUANT_BINS, axes = [1, 0])
    return K.reshape(vectors, (WINDOW_SIZE,))

class SoftmaxQuantization(Layer):
    """Soft vector quantization layer.

    Splits each input window into vectors of VEC_SIZE samples and, for every
    vector, returns softmax probabilities over the codebook rows, computed from
    the negative L2 distance to each row scaled by the global hardness `tau`.

    Arguments:
        shared_bins: backend variable of shape [NBINS x VEC_SIZE] used as the
            codebook. Defaults to the notebook-wide QUANT_BINS, so the layer can
            be created without arguments (as the encoder builder does, and as
            Keras does when it re-creates a layer from its saved config).

    Input:  [BATCH x WINDOW_SIZE]
    Output: [BATCH x (WINDOW_SIZE / VEC_SIZE) x NBINS]
    """
    def __init__(self, shared_bins = None, **kwargs):
        super(SoftmaxQuantization, self).__init__(**kwargs)
        self.bins = QUANT_BINS if shared_bins is None else shared_bins

    def build(self, input_shape):
        # the codebook is trained together with the rest of the encoder
        self.trainable_weights = [self.bins]
        super(SoftmaxQuantization, self).build(input_shape)

    def call(self, x, mask=None):
        # x is an array: [BATCH x WINDOW_SIZE]
        # x_r becomes: [BATCH x (WINDOW_SIZE / VEC_SIZE) x NBINS x VEC_SIZE]
        x_r = K.reshape(x, (-1, x.shape[1] // VEC_SIZE, 1, VEC_SIZE))
        x_r = K.repeat_elements(x_r, self.bins.shape[0], -2)

        # self.bins is an array: [NBINS x VEC_SIZE]
        # q_r becomes: [BATCH x (WINDOW_SIZE / VEC_SIZE) x NBINS x VEC_SIZE]
        q_r = K.reshape(self.bins, (1, 1, self.bins.shape[0], VEC_SIZE))
        q_r = K.repeat_elements(q_r, x_r.shape[0], 0)
        q_r = K.repeat_elements(q_r, x_r.shape[1], 1)

        # get L2 distance from each vector to each of the bins (epsilon keeps the
        # argument of sqrt strictly positive)
        dist = K.sqrt(K.sum(K.square(x_r - q_r), axis = -1) + K.epsilon())

        # turn into softmax probabilities, which we return. hard (one-hot)
        # assignments are not produced here; test_model_on_windows applies an
        # argmax to the predicted probabilities when asked to
        return softmax(tau * -dist)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1] // VEC_SIZE, NBINS)


class SoftmaxDequantization(Layer):
    """Inverse of SoftmaxQuantization: turns bin probabilities back into samples.

    Each probability vector is contracted with the codebook rows (a weighted sum
    of codebook vectors), and the vectors of each window are concatenated.

    Arguments:
        shared_bins: backend variable of shape [NBINS x VEC_SIZE] used as the
            codebook. Defaults to the notebook-wide QUANT_BINS, so the layer can
            be created without arguments (as the decoder builder does, and as
            Keras does when it re-creates a layer from its saved config).

    Input:  [BATCH x NUM_VECTORS x NBINS]
    Output: [BATCH x (NUM_VECTORS * VEC_SIZE)]
    """
    def __init__(self, shared_bins = None, **kwargs):
        super(SoftmaxDequantization, self).__init__(**kwargs)
        self.bins = QUANT_BINS if shared_bins is None else shared_bins

    def build(self, input_shape):
        # the codebook is not registered as a weight of this layer
        self.trainable_weights = []
        super(SoftmaxDequantization, self).build(input_shape)

    def call(self, x, mask=None):
        out = T.tensordot(x, self.bins, axes = [2, 0])
        return K.reshape(out, (out.shape[0], out.shape[1] * VEC_SIZE))

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1] * VEC_SIZE)

In [8]:
# Stand-alone sanity check of the soft quantize / dequantize math on a toy example.
# It repeats the same reshape / repeat / distance / softmax steps as
# SoftmaxQuantization.call, but with a 2-D codebook and without the tau scaling.

# toy codebook: 9 bins, each a vector of bin_size = 2 values
bin_size = 2
bins = [[-1, 0], [0, 0], [1, 0], [-1, -1], [0, -1], [1, -1], [-1, 1], [0, 1], [1, 1]]
bins = np.array(bins)

# three toy "windows" of 6 samples each, i.e. 3 vectors of length 2 per window
wnds = [[0.0, 1.0, 0.5, 0.25, 0.0, 0.0], [1.0, -1.0, -1.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
wnds = np.array(wnds)

# x_r: [windows x vectors-per-window x bins x bin_size], each vector repeated per bin
x_r = K.reshape(wnds, (-1, wnds.shape[1] / bin_size, 1, bin_size))
x_r = K.repeat_elements(x_r, bins.shape[0], -2)

# q_r: the codebook repeated to the same shape as x_r
q_r = K.reshape(bins, (1, 1, bins.shape[0], bin_size))
q_r = K.repeat_elements(q_r, x_r.shape[0], 0)
q_r = K.repeat_elements(q_r, x_r.shape[1], 1)

# L2 distance of every vector to every bin, then softmax over the negated distances
dists = K.sqrt(K.sum(K.square(x_r - q_r), axis = -1) + K.epsilon())
probs = softmax(-dists)

print probs.eval()

# soft reconstruction: probability-weighted sum of the bins, flattened per window
recons = T.tensordot(probs, bins, axes = [2, 0])
recons = K.reshape(recons, (recons.shape[0], recons.shape[1] * bin_size))
print recons.eval()

[[[ 0.08273085  0.12518668  0.08273085  0.03636978  0.04605361  0.03636978
    0.12518668  0.3401851   0.12518668]
  [ 0.07229083  0.18911795  0.18911795  0.0469379   0.08606448  0.08606448
    0.0618268   0.1342898   0.1342898 ]
  [ 0.10682777  0.29029619  0.10682777  0.07059818  0.10682777  0.07059818
    0.07059818  0.10682777  0.07059818]]

 [[ 0.0423767   0.09639489  0.14586283  0.05365994  0.14586283  0.39637095
    0.02343521  0.0423767   0.05365994]
  [ 0.14586283  0.09639489  0.0423767   0.05365994  0.0423767   0.02343521
    0.39637095  0.14586283  0.05365994]
  [ 0.10682777  0.29029619  0.10682777  0.07059818  0.10682777  0.07059818
    0.07059818  0.10682777  0.07059818]]

 [[ 0.10682777  0.29029619  0.10682777  0.07059818  0.10682777  0.07059818
    0.07059818  0.10682777  0.07059818]
  [ 0.10682777  0.29029619  0.10682777  0.07059818  0.10682777  0.07059818
    0.07059818  0.10682777  0.07059818]
  [ 0.10682777  0.29029619  0.10682777  0.07059818  0.10682777  0.07059818
 

In [9]:
# ---------------------------------------------------------------------------
# autoencoder: takes an audio window, compresses it, and tries to reconstruct it
# ---------------------------------------------------------------------------
def autoencoder_structure(dim):
    """Build the encoder and decoder halves of the autoencoder.

    Arguments:
        dim: per-sample input shape of the encoder, e.g. (WINDOW_SIZE, 1).

    Returns:
        (enc, dec): two Keras Models named 'encoder' and 'decoder'. The encoder
        maps a window to soft bin probabilities of shape
        (WINDOW_SIZE / VEC_SIZE, NBINS); the decoder maps such probabilities back
        to a window of shape (WINDOW_SIZE, 1) with a tanh output.
        Both halves use the same codebook, QUANT_BINS.
    """
    # - - - - - - - - - - - - - - - - - - - - -
    # parameters
    # - - - - - - - - - - - - - - - - - - - - -
    NCHAN = 32
    FILT_SIZE = 9

    TIMES_DOWNSAMPLE = 0
    NUM_RES_BLOCKS = 5

    # NOTE(review): every residual block is created with a constant third
    # argument of 2. A per-block doubling "dilation rate" was computed here
    # before but never passed to residual_block, so it had no effect and is not
    # kept. Confirm against nn_blocks.residual_block whether a growing dilation
    # was intended.
    RES_BLOCK_ARG = 2

    # - - - - - - - - - - - - - - - - - - - - -
    # encoder
    # - - - - - - - - - - - - - - - - - - - - -
    enc_input = Input(shape = dim)
    enc = enc_input

    enc = Reshape(dim, input_shape = dim)(enc)

    # increase number of channels via convolution
    enc = channel_increase_block(NCHAN, FILT_SIZE)(enc)

    # residual blocks
    for _ in range(NUM_RES_BLOCKS):
        enc = residual_block(NCHAN, FILT_SIZE, RES_BLOCK_ARG)(enc)

    # downsampling blocks
    for _ in range(TIMES_DOWNSAMPLE):
        enc = downsample_block(NCHAN, FILT_SIZE)(enc)

    # decrease back down to 1 channel
    enc = channel_decrease_block(NCHAN, FILT_SIZE)(enc)
    enc = Reshape((WINDOW_SIZE,))(enc)

    # softmax quantization against the shared codebook (the layer's constructor
    # takes the codebook variable as its first argument)
    enc = SoftmaxQuantization(QUANT_BINS)(enc)

    enc = Model(inputs = enc_input, outputs = enc)
    enc.name = 'encoder'

    # - - - - - - - - - - - - - - - - - - - - -
    # decoder
    # - - - - - - - - - - - - - - - - - - - - -
    dec_input = Input(shape = (WINDOW_SIZE // VEC_SIZE, NBINS))
    dec = dec_input

    # bin probabilities -> samples, using the same codebook as the encoder
    dec = SoftmaxDequantization(QUANT_BINS)(dec)

    # increase number of channels via convolution
    dec = Reshape((WINDOW_SIZE, 1))(dec)
    dec = channel_increase_block(NCHAN, FILT_SIZE)(dec)

    # upsampling blocks
    for _ in range(TIMES_DOWNSAMPLE):
        dec = upsample_block(NCHAN, FILT_SIZE)(dec)

    # residual blocks
    for _ in range(NUM_RES_BLOCKS):
        dec = residual_block(NCHAN, FILT_SIZE, RES_BLOCK_ARG)(dec)

    # decrease back down to 1 channel; tanh bounds the output to (-1, 1)
    dec = channel_decrease_block(NCHAN, FILT_SIZE)(dec)
    dec = Activation('tanh')(dec)

    dec = Model(inputs = dec_input, outputs = dec)
    dec.name = 'decoder'

    # return both encoder and decoder
    return enc, dec

In [10]:
# ---------------------------------------------------------------------------
# perceptual model: scores a "dirty" window against its clean original.
# The output is a sigmoid, i.e. a value in (0, 1); the training targets used in
# this notebook are rescaled PESQ scores (0 = garbage, 1 = perfect).
# ---------------------------------------------------------------------------
def perceptual_model_structure(dim):
    """Build the Siamese perceptual-score model.

    Arguments:
        dim: per-sample shape of each of the two inputs.

    Returns:
        A Keras Model taking [original, dirty] windows and producing one sigmoid
        score per pair. Both inputs pass through the same (weight-sharing)
        convolutional branch; the difference of the two branch outputs feeds a
        single sigmoid unit.
    """
    NCHAN = 32
    FILT_SIZE = 7
    DENSE_SIZE = 32
    N_DOWNSAMPLE = 2
    N_RESIDUAL = 3

    def siamese_half():
        """Branch applied (with shared weights) to each of the two inputs."""
        branch_in = Input(shape = (WINDOW_SIZE, 1))
        feat = Reshape((WINDOW_SIZE, 1))(branch_in)

        feat = channel_increase_block(NCHAN, FILT_SIZE)(feat)

        for _ in range(N_DOWNSAMPLE):
            feat = downsample_block(NCHAN, FILT_SIZE)(feat)
        for _ in range(N_RESIDUAL):
            feat = residual_block(NCHAN, FILT_SIZE)(feat)

        feat = channel_decrease_block(NCHAN, FILT_SIZE)(feat)

        feat = Flatten()(feat)
        feat = Dense(DENSE_SIZE, activation = 'linear', kernel_initializer = W_INIT)(feat)

        return Model(inputs = branch_in, outputs = feat)

    # the two model inputs
    input_orig = Input(shape = dim)
    input_dirty = Input(shape = dim)

    # one branch instance, called twice, so both sides share weights
    shared_branch = siamese_half()
    embedded_orig = shared_branch(input_orig)
    embedded_dirty = shared_branch(input_dirty)

    def difference(pair):
        first, second = pair
        return first - second

    def first_shape(shapes):
        first, second = shapes
        return first

    # element-wise difference of the two embeddings, then a single sigmoid unit
    out = Lambda(difference, output_shape = first_shape)([embedded_orig, embedded_dirty])
    out = Dense(1, activation = 'sigmoid', kernel_initializer = W_INIT)(out)

    return Model(inputs = [input_orig, input_dirty], outputs = out)

In [11]:
# ---------------------------------------------------------------------------
# discriminator: classifies a window as original vs. reconstructed
# ---------------------------------------------------------------------------
def discriminator_structure(dim):
    """Build the discriminator network.

    Arguments:
        dim: per-sample input shape.

    Returns:
        A Keras Model mapping a window to a single sigmoid output (the training
        code in this notebook labels original windows 1 and reconstructions 0).
    """
    NCHAN = 32
    FILT_SIZE = 7
    DENSE_SIZE = 32
    N_DOWNSAMPLE = 2
    N_RESIDUAL = 2

    dsc_input = Input(shape = dim)
    net = Reshape(dim, input_shape = dim)(dsc_input)

    # convolutional feature extractor
    net = channel_increase_block(NCHAN, FILT_SIZE)(net)
    for _ in range(N_DOWNSAMPLE):
        net = downsample_block(NCHAN, FILT_SIZE)(net)
    for _ in range(N_RESIDUAL):
        net = residual_block(NCHAN, FILT_SIZE)(net)
    net = channel_decrease_block(NCHAN, FILT_SIZE)(net)

    # dense head: linear layer followed by the project's activation() layer
    net = Flatten()(net)
    net = Dense(DENSE_SIZE, activation = 'linear', kernel_initializer = W_INIT)(net)
    net = activation()(net)

    # sigmoid output (probability of window being real or reconstructed)
    net = Dense(1, activation = 'sigmoid', kernel_initializer = W_INIT)(net)

    return Model(inputs = dsc_input, outputs = net)

In [12]:
# Keras-style loss: entropy (in bits) of the average bin usage within a batch
def code_entropy(placeholder, code):
    """Entropy of the batch-wide bin usage distribution.

    Arguments:
        placeholder: ignored (present only to match Keras' loss(y_true, y_pred)
            signature).
        code: bin probabilities; its first two axes are merged and the last axis
            has NBINS entries.

    Returns:
        Scalar tensor: -sum(p * log2(p + epsilon)) where p is the normalized sum
        of all probability vectors in the batch.
    """
    n_vectors = code.shape[0] * code.shape[1]
    flat_code = K.reshape(code, (n_vectors, NBINS))

    usage = K.sum(flat_code, axis = 0)
    usage = usage / K.sum(usage)

    bits = usage * K.log(usage + K.epsilon()) / K.log(2.0)
    return -K.sum(bits)

def code_sparsity(placeholder, code):
    """Sparsity penalty on the bin probabilities.

    For each probability vector the square roots of its entries are summed; a
    one-hot vector gives (approximately) 1, so after subtracting 1 the penalty
    is near zero for hard assignments and grows as probabilities spread out.
    The per-vector sums are averaged over the last remaining axis.

    `placeholder` is ignored (Keras loss signature).
    """
    root_sum = K.sum(K.sqrt(code + K.epsilon()), axis = -1)
    return K.mean(root_sum, axis = -1) - 1.0

In [13]:
# construct autoencoder: input -> encoder -> (soft code) -> decoder -> reconstruction.
# ac_embedding and ac_reconstructed are reused below as outputs of the combined
# training model; ac_enc / ac_dec are reused later for inspection
ac_input = Input(shape = input_dim)
ac_enc, ac_dec = autoencoder_structure(input_dim)
ac_embedding = ac_enc(ac_input)
ac_reconstructed = ac_dec(ac_embedding)
autoencoder = Model(inputs = [ac_input], outputs = [ac_reconstructed])



In [14]:
# construct perceptual model
perceptual_model = perceptual_model_structure(input_dim)
perceptual_model.name = 'perceptual'
# give the nested model at layer index 2 an explicit name
# NOTE(review): presumably index 2 is the shared Siamese branch (after the two
# input layers) -- confirm via perceptual_model.summary()
perceptual_model.layers[2].name = 'perceptual_siamese_half'

In [15]:
# construct discriminator: wraps the discriminator network in a model with its
# own input layer
dsc_input = Input(shape = input_dim)
dsc_struct = discriminator_structure(input_dim)
dsc_label = dsc_struct(dsc_input)
discriminator = Model(inputs = [dsc_input], outputs = [dsc_label])
discriminator.name = 'discriminator'

In [16]:
# compile perceptual model (trained on its own against rescaled PESQ targets)
perceptual_model.compile(loss = 'mae', optimizer = Adam(lr = 0.001))

# compile discriminator
# NOTE(review): the perceptual model is toggled non-trainable around this compile
# even though the discriminator does not contain it -- confirm whether this is needed
make_trainable(perceptual_model, False)
discriminator.compile(loss = 'binary_crossentropy', optimizer = Adam(lr = 0.001))
make_trainable(perceptual_model, True)

# compile overall autoencoder model
# extra heads computed on the reconstruction: discriminator label and perceptual score
ac_dsc_label = discriminator(ac_reconstructed)
ac_percept_score = perceptual_model([ac_input, ac_reconstructed])

# one weight / loss per output of the combined model, in output order:
# reconstruction (mae), perceptual score (mae), code (code_sparsity).
# n_* say how many times each kind of output is included (0 disables it)
loss_weights = [300.0, 1.0, 1.0]
loss_functions = ['mae', 'mae', code_sparsity]
n_recons = 1
n_percept = 1
n_discrim = 0
n_code = 1
assert(n_recons + n_percept + n_discrim + n_code == len(loss_weights))
assert(len(loss_weights) == len(loss_functions))

# the critics are toggled non-trainable while the combined model is compiled, so
# that training `model` is meant to update only the autoencoder, and re-enabled
# afterwards for their own training steps
make_trainable(discriminator, False)
make_trainable(perceptual_model, False)
model = Model(inputs = [ac_input], outputs = [ac_reconstructed] * n_recons + \
                                            [ac_percept_score] * n_percept + \
                                            [ac_dsc_label] * n_discrim + \
                                            [ac_embedding] * n_code)
model.compile(loss = loss_functions,
              loss_weights = loss_weights,
              optimizer = Adam(lr = 0.00075))
make_trainable(discriminator, True)
make_trainable(perceptual_model, True)



# print the layer / parameter summaries of all four models
autoencoder.summary()
discriminator.summary()
perceptual_model.summary()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 512, 1)            0         
_________________________________________________________________
encoder (Model)              (None, 128, 256)          112609    
_________________________________________________________________
decoder (Model)              (None, 512, 1)            111585    
Total params: 224,194.0
Trainable params: 224,194.0
Non-trainable params: 0.0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 512, 1)            0         
_________________________________________________________________
model_6 (Model)              (None, 1)                 76642     
Total params: 76,642.0
Trainable params: 76,642.0
Non-trainable 

In [17]:
# build a labelled discriminator batch from a batch of original windows
def create_discrim_pairs(autoencoder, batch):
    """Interleave original windows with their reconstructions.

    Arguments:
        autoencoder: model whose predict() produces the reconstructions.
        batch: array of original windows.

    Returns:
        (X, y): X interleaves `batch` with the reconstructions, y interleaves
        labels 1 (original) and 0 (reconstructed), both via interleave().
    """
    count = batch.shape[0]
    reconstructed = autoencoder.predict(batch)

    real_labels, fake_labels = np.ones(count), np.zeros(count)
    return interleave([batch, reconstructed]), interleave([real_labels, fake_labels])

In [18]:
# test discriminator, given the autoencoder and a set of samples (speech windows, not
# necessarily in any order)
def test_discriminator(discriminator, autoencoder, orig_samples, verbose = True):
    X, y = create_discrim_pairs(autoencoder, orig_samples)
    
    # verify discriminator was trained properly
    y_hat = discriminator.predict(X)

    y_hat[y_hat >= 0.5] = 1
    y_hat[y_hat < 0.5] = 0
    
    n_total = y.shape[0]
    n_correct = np.sum(np.ravel(y_hat) == y)

    acc = n_correct * 100.0 / n_total
    
    if (verbose):
        print "Discriminator evaluation: %0.02f"%(acc)
    return acc

In [19]:
# evaluate the autoencoder on the windows of one speech waveform (the windows
# should have been extracted in order from that waveform)
def test_model_on_windows(orig_windows, wparams, autoencoder, argmax = False):
    """Reconstruct a waveform through the autoencoder and score it.

    Arguments:
        orig_windows: array of windows of one waveform, in order.
        wparams: parameters passed to unpreprocess_waveform for this waveform.
        autoencoder: model whose layers[1] / layers[2] are the encoder / decoder.
        argmax: if True, replace each soft code by the one-hot vector of its
            most probable bin before decoding.

    Returns:
        (metrics, desired, recons) where metrics is [MSE, average error, PESQ]
        between the two waveforms, `desired` is the waveform rebuilt from the
        original windows and `recons` the one rebuilt from the decoded windows.
    """
    def to_waveform(windows):
        # overlap-add the windows, undo preprocessing, clip to the 16-bit range
        wave = reconstruct_from_windows(windows, OVERLAP_SIZE, OVERLAP_FUNC)
        wave = unpreprocess_waveform(wave, wparams)
        return np.clip(wave, -32767, 32767)

    # reference: what a perfect autoencoder would give back
    desired = to_waveform(orig_windows)

    # encode
    net_input = np.reshape(orig_windows, (orig_windows.shape[0], WINDOW_SIZE, 1))
    encoder = autoencoder.layers[1]
    embed = encoder.predict(net_input, batch_size = 128, verbose = 0)

    # optional hard quantization of the soft code, window by window
    if argmax:
        for row in range(embed.shape[0]):
            winners = np.argmax(embed[row], axis = -1)
            embed[row] = np.eye(NBINS)[winners]

    # decode
    decoder = autoencoder.layers[2]
    decoded = decoder.predict(embed, batch_size = 128, verbose = 0)
    decoded = np.reshape(decoded, (decoded.shape[0], WINDOW_SIZE))
    recons = to_waveform(decoded)

    # compute PESQ between desired and reconstructed waveforms
    pesq_score = run_pesq_waveforms(desired, recons)

    metrics = [mse(recons, desired), avgErr(recons, desired), pesq_score]
    return metrics, desired, recons

In [20]:
# test model given the filename for a .wav file
def test_model_on_wav(wave_filename, prefix, autoencoder,
                      lead = "", save_recons = True, verbose = True,
                      argmax = False):
    [rate, data] = sciwav.read(wave_filename)
    data = data.astype(np.float32)
    processed_wave, wparams = preprocess_waveform(data)
    windows = extract_windows(processed_wave, STEP_SIZE, OVERLAP_SIZE)
    
    metrics, desired, recons = test_model_on_windows(windows, wparams, autoencoder, argmax)
    
    if (save_recons):
        outFilename = prefix + "_output.wav"
        sciwav.write(outFilename, SAMPLE_RATE, recons.astype(np.int16))
    
    if (verbose):
        print lead + "MSE:        ", metrics[0]
        print lead + "Avg err:    ", metrics[1]
        print lead + "PESQ:       ", metrics[2]
        
    return metrics

In [21]:
def evaluate_training(autoencoder, lead = ""):
    train_eval_idxs = random.sample(range(0, len(train_windows) - 1), TRAIN_EVALUATE)
    val_eval_idxs = random.sample(range(0, len(val_windows) - 1), VAL_EVALUATE)
    
    train_metrics = []
    for idx in train_eval_idxs:
        windows = train_windows[idx]
        wparams = train_wparams[idx]
        metrics, _, _ = test_model_on_windows(windows, wparams, autoencoder)
        
        train_metrics.append(metrics)
        
    val_metrics = []
    for idx in val_eval_idxs:
        windows = val_windows[idx]
        wparams = val_wparams[idx]
        metrics, _, _ = test_model_on_windows(windows, wparams, autoencoder)
        
        val_metrics.append(metrics)
    
    train_metrics = np.array(train_metrics)
    val_metrics = np.array(val_metrics)
    
    print lead + "Format: [MSE, avg err, PESQ]"
    print lead + "    Train: (mean)", np.mean(train_metrics, axis = 0)
    print lead + "    Train: (max) ", np.max(train_metrics, axis = 0)
    print lead + "    Train: (min) ", np.min(train_metrics, axis = 0)
    print lead + "    Val:   (mean)", np.mean(val_metrics, axis = 0)
    print lead + "    Val:   (max) ", np.max(val_metrics, axis = 0)
    print lead + "    Val:   (min) ", np.min(val_metrics, axis = 0)
    
    # returns mean PESQ on validation
    return np.mean(val_metrics, axis = 0)[2]

In [22]:
def save_model():
    """Save the combined model, autoencoder, discriminator and perceptual model.

    Writes ./best_model.h5, ./best_auto.h5, ./best_discrim.h5 and
    ./best_percept.h5 (replacing any existing files), then removes the
    'optimizer_weights' entry from ./best_model.h5 if it is present.
    """
    targets = [('./best_model.h5', model),
               ('./best_auto.h5', autoencoder),
               ('./best_discrim.h5', discriminator),
               ('./best_percept.h5', perceptual_model)]

    # remove stale files first (pure Python instead of shelling out to `rm`, so a
    # missing file is simply skipped and this also works without a POSIX shell)
    for path, _ in targets:
        if os.path.exists(path):
            os.remove(path)

    for path, net in targets:
        net.save(path)

    # strip the optimizer state from the combined model's file; the context
    # manager closes the file even if the deletion fails
    with h5py.File('./best_model.h5', 'r+') as f:
        if 'optimizer_weights' in f:
            del f['optimizer_weights']

In [23]:
def update_train_structure(X, y, wnd, cor, prm, val = None):
    """Append (original, corrupted) window pairs and their score to a training set.

    Arguments:
        X: two-element list [originals, corrupted]; each is a list that is
            extended in place.
        y: list of scores, extended in place.
        wnd: array of original windows (first axis indexes windows).
        cor: array of corrupted windows, aligned with `wnd`.
        prm: waveform parameters, passed (twice) to run_pesq_windows.
        val: if given, used as the score directly. Otherwise the score is the
            PESQ of `cor` against `wnd`, rescaled as (pesq - 1.0) / 3.5 and
            clipped to [0, 1].

    Returns:
        (X, y): the same (mutated) objects. Every window of this call receives
        the same score.
    """
    if (val is None):
        pesq = run_pesq_windows(wnd, cor, prm, prm)
        scaled = (pesq - 1.0) / 3.5
        scaled = np.clip(scaled, 0.0, 1.0)
    else:
        scaled = val

    count = wnd.shape[0]
    X[0].extend(wnd[i] for i in range(count))
    X[1].extend(cor[i] for i in range(count))
    y.extend([scaled] * count)

    return X, y

def generate_pesq_traindata():
    """Build a fresh training set for the perceptual model from one random file.

    One training waveform is picked at random and three corrupted versions of
    its windows are scored against the originals (via update_train_structure):
      1. the current autoencoder's reconstruction,
      2. a random linear mix of that reconstruction and the original,
      3. the original passed through a randomly chosen noise function.

    Returns:
        (X, y): X is an array stacking [originals, corrupted] along its first
        axis, y the array of matching scores.
    """
    X = [[], []]
    y = []

    # get random waveform from training set (index by the actual number of
    # loaded files rather than the requested TRAIN_SIZE)
    idx = random.randint(0, len(train_windows) - 1)

    prm = train_wparams[idx]
    wnd = np.reshape(train_windows[idx], (-1, WINDOW_SIZE, 1))

    # autoencoder prediction
    cor = autoencoder.predict(wnd, verbose = 0)
    X, y = update_train_structure(X, y, wnd, cor, prm)

    # linear mix of prediction and original
    amt = random.uniform(0.25, 0.75)
    cor = cor * amt + wnd * (1.0 - amt)
    X, y = update_train_structure(X, y, wnd, cor, prm)

    # random type of noise, at random amount: each entry of noise_types is
    # indexed as (function, list of candidate parameters)
    noise = random.choice(noise_types)
    noise_func = noise[0]
    noise_prm = random.choice(noise[1])

    # the noise functions are called on 2-D (windows x samples) input
    cor = noise_func(np.reshape(wnd, (-1, WINDOW_SIZE)), noise_prm)
    cor = np.reshape(cor, (-1, WINDOW_SIZE, 1))
    X, y = update_train_structure(X, y, wnd, cor, prm)

    return np.array(X), np.array(y)

# initial perceptual-model training set (the training loop regenerates it
# periodically); print the shapes as a sanity check
pesq_train_X, pesq_train_y = generate_pesq_traindata()
print pesq_train_X.shape
print pesq_train_y.shape

(2, 375, 512, 1)
(375,)


In [24]:
# get untrained baseline for model: metrics of the freshly initialized autoencoder
# on one file (also writes the reconstruction under ./train_output/)
test_model_on_wav("./SA1.wav", "./train_output/SA1_res_uninit_", autoencoder)

MSE:         1.61034e+06
Avg err:     1204.89
PESQ:        1.067


[1610336.1, 1204.8873, 1.067]

In [25]:
# print float arrays in fixed-point notation
# NOTE(review): '{:4f}' is a minimum width of 4 with the default 6 decimals;
# '{:.4f}' (4 decimals) may have been intended -- confirm
np.set_printoptions(formatter={'float_kind':'{:4f}'.format})

# training windows used by the loop below
X_train = np.copy(train_processed)
ntrain = X_train.shape[0]

BATCH_SIZE = 128
NUM_BATCHES = ntrain / BATCH_SIZE
NUM_EPOCHS = 200

# discriminator training options (only DSC_TIMES_TRAIN is read by the loop below)
DSC_CLIP_WEIGHTS = False
DSC_CLAMP_RANGE = 0.01
DSC_TIMES_TRAIN = 1

# number of perceptual-model updates per batch
PESQ_TIMES_TRAIN = 1

# indentation prefix for log lines, and initial values of the logged quantities
lead = "    "
d_loss = 0.0
a_losses = []
d_acc = 0.0

# best mean validation PESQ seen so far (a model is saved whenever it improves)
best_val_pesq = 0.0

for epoch in range(NUM_EPOCHS):
    print "Epoch " + str(epoch + 1) + ":"

    # present batches randomly each epoch
    lis = range(0, ntrain, BATCH_SIZE)
    random.shuffle(lis)
    
    # keep track of start time and current batch #
    i = 0
    startTime = time.time()
    for idx in lis:
        batch = X_train[idx:idx+BATCH_SIZE, :,  :]
        nbatch = batch.shape[0]
        
        a_losses = ["no auto"]
        d_loss = "no discrim"
        p_loss = "no pesq"
        
        # train perceptual model
        if (n_percept > 0):
            # get 128 random sample-label pairs for this batch
            nsamples = pesq_train_y.shape[0]
            p = np.random.permutation(nsamples)
            if (nsamples > BATCH_SIZE):
                p = p[:BATCH_SIZE]
            pesq_batch_X = [pesq_train_X[0, p], pesq_train_X[1, p]]
            pesq_batch_y = pesq_train_y[p]
                 
            for k in xrange(0, PESQ_TIMES_TRAIN):
                p_loss = perceptual_model.train_on_batch(pesq_batch_X, pesq_batch_y)

        # train discriminator
        if (n_discrim > 0):
            discrim_batch_X, discrim_batch_y =  create_discrim_pairs(autoencoder, batch)
           
            for k in xrange(0, DSC_TIMES_TRAIN):
                d_loss = discriminator.train_on_batch(discrim_batch_X, discrim_batch_y)  
                
        # train autoencoder
        a_y = [batch] * n_recons + \
                [np.ones(nbatch)] * n_percept + \
                [np.ones(nbatch)] * n_discrim + \
                [np.zeros((nbatch, WINDOW_SIZE, NBINS))] * n_code
        a_losses = model.train_on_batch(batch, a_y)
        
        # update tau
        K.set_value(tau, np.min([K.get_value(tau) * anneal_rate, max_tau]))
        
        # print statistics every 10 batches so we know what's going on
        if (i % 10 == 0):
            printStr = "        \r" + lead + str(i * BATCH_SIZE) + ": " + \
                                             str(d_loss) + " " + \
                                             str(p_loss) + " "
            print printStr,
            
            loss_arr = np.asarray(a_losses)
            print loss_arr,
            
            if (len(loss_weights) > 1 and len(loss_arr) > 1):
                for w in xrange(0, len(loss_weights)):
                    loss_arr[w + 1] *= loss_weights[w]
                print loss_arr,
            
            print K.get_value(tau),
                
            if (n_percept > 0):
                pesq_train_X, pesq_train_y = generate_pesq_traindata()\
            
        i += 1
    print ""
    
    # print elapsed time for epoch
    elapsed = time.time() - startTime
    print lead + "Total time for epoch: " + str(elapsed) + "s"   
    
    # ---------------------------------------------------------
    # evaluate discriminator on random samples every epoch
    # ---------------------------------------------------------
    startTime = time.time()
    print lead + "----------------"
    if (n_discrim > 0):
        NUM = 200
        rows = np.random.randint(X_train.shape[0], size = NUM)
        d_acc = test_discriminator(discriminator, autoencoder,
                                   X_train[rows, :], verbose = False)

        print lead + "Evaluated the discriminator: " + str(d_acc)
        elapsed = time.time() - startTime
        print lead + "Total time for evaluation: " + str(elapsed) + "s"
    else:
        print lead + "No discriminator"
    
    
    # ---------------------------------------------------------
    # generate code histogram from random samples
    # ---------------------------------------------------------
    '''
    NUM = 200
    rows = np.random.randint(X_train.shape[0], size = NUM)
    code = ac_enc.predict(X_train[rows, :], verbose = 0)
    
    print lead + "----------------"
    print lead + "Code histogram:"
    scalars = code.flatten()
    
    b = np.linspace(-1.0, 1.0, NBINS + 1)
    hist = np.histogram(scalars, bins = b)
    sample_hist_probs = hist[0].astype('float32')
    sample_hist_probs /= np.sum(sample_hist_probs)

    entropy = 0
    for i in sample_hist_probs:
        if (i < 1e-4): continue
        entropy += i * math.log(i, 2)
    entropy = -entropy
    
    print "       Entropy:", entropy
    '''
    
    
    # ---------------------------------------------------------
    # evaluate autoencoder on training/validation data evey epoch
    # ---------------------------------------------------------
    startTime = time.time()
    print lead + "----------------"
    
    print lead + "Evaluating autoencoder..."
    if (K.get_value(tau) >= max_tau):
        val_pesq = evaluate_training(autoencoder, lead)
        if (val_pesq > best_val_pesq):
            print lead + "NEW best model! Validation mean-PESQ", val_pesq
            print lead + "Saving model..."
            save_model()
            best_val_pesq = val_pesq
        else:
            print lead + "Best validation mean-PESQ seen:", best_val_pesq
    else:
        print lead + "    (Not saving model yet)"
    
    metrics = test_model_on_wav("./SA1.wav", "./train_output/SA1_res_reg_train_epoch" + str(epoch+1),
                              autoencoder, lead = lead, verbose = False, argmax = True)
    print lead + "SA1:        ", metrics
    metrics_tst = test_model_on_wav("./SX383.wav", "./train_output/SX383_res_reg_train_epoch" + str(epoch+1),
                                  autoencoder, lead = lead, verbose = False, argmax = True)
    print lead + "SX383:      ", metrics_tst
    
    elapsed = time.time() - startTime
    print lead + "Total time for evaluation: " + str(elapsed) + "s"

Epoch 1:
    1280: no discrim 0.317066669464  [17.900555 0.058298 0.312913 0.098167] [17.900555 17.489473 0.312913 0.098167] 1500.0

KeyboardInterrupt: 

In [26]:
# map for load_model: custom objects (layers, losses, constants) passed to
# load_model as its second argument so saved models that reference them by name
# can be re-created. SoftmaxQuantization / SoftmaxDequantization / QUANT_BINS are
# defined in this notebook.
# NOTE(review): the remaining names are not defined in this notebook and
# presumably come from the star-imported project modules -- confirm
KERAS_LOAD_MAP = {'PhaseShiftUp1D' : PhaseShiftUp1D,
                  'Quantize' : Quantize,
                  'StochasticQuantize' : StochasticQuantize,
                  'StochasticQuantizeLayer' : StochasticQuantizeLayer,
                  'NBINS' : NBINS,
                  'entropy_estimate' : entropy_estimate,
                  'rmse' : rmse,
                  'EuclideanDistance': EuclideanDistance,
                  'QUANT_BINS' : QUANT_BINS,
                  'SoftmaxQuantization' : SoftmaxQuantization,
                  'SoftmaxDequantization' : SoftmaxDequantization}




In [27]:
# reload the checkpoints written by save_model, replacing the in-memory models
# NOTE(review): the recorded run of this cell failed with
# "RuntimeError: Graph disconnected ... input_6" -- loading these files needs
# to be verified before the cells below are trusted
model = load_model('best_model.h5', KERAS_LOAD_MAP)

autoencoder = load_model('best_auto.h5', KERAS_LOAD_MAP)
discriminator = load_model('best_discrim.h5', KERAS_LOAD_MAP)
perceptual_model = load_model('best_percept.h5', KERAS_LOAD_MAP)



RuntimeError: Graph disconnected: cannot obtain value for tensor /input_6 at layer "input_6". The following previous layers were accessed without issue: []

In [None]:
# layer lists of the nested models at indices 1 and 2 of the combined model
# NOTE(review): presumably these are the encoder and decoder -- confirm with
# model.summary()
enc = model.layers[1].layers
dec = model.layers[2].layers

In [None]:
# discriminator accuracy on NUM randomly chosen training windows (drawn with
# replacement) and their reconstructions
NUM = 400
rows = np.random.randint(X_train.shape[0], size = NUM)

d_acc = test_discriminator(discriminator, autoencoder,
                           X_train[rows, :], verbose = True)

In [None]:
# final metrics on three files, each with soft codes and with hard (argmax) codes
# NOTE(review): both calls for a file use the same prefix, and test_model_on_wav
# writes to prefix + "_output.wav", so the argmax run overwrites the soft run's
# reconstruction -- use distinct prefixes if both files are wanted
test_model_on_wav("./SA1.wav", "SA1_final_", autoencoder)
test_model_on_wav("./SA1.wav", "SA1_final_", autoencoder, argmax = True)

test_model_on_wav("./SX383.wav", "SX383_final_", autoencoder)
test_model_on_wav("./SX383.wav", "SX383_final_", autoencoder, argmax = True)

test_model_on_wav("./fiveYears.wav", "fy_final_", autoencoder)
test_model_on_wav("./fiveYears.wav", "fy_final_", autoencoder, argmax = True)

In [None]:
# soft codes of the first 10000 training windows
# NOTE(review): ac_enc is the encoder built earlier in this notebook, not one
# taken from the models reloaded from disk -- confirm this is the intended encoder
all_embed = ac_enc.predict(X_train[:10000], batch_size = BATCH_SIZE, verbose = 1)

In [None]:
probs = np.reshape(all_embed, (all_embed.shape[0] * all_embed.shape[1], NBINS))
hist = np.sum(probs, axis = 0)
hist /= np.sum(hist)

sample_hist_bins = np.linspace(0, NBINS - 1, NBINS)
plt.bar(sample_hist_bins, hist, align = 'center', width = 1)
plt.show()

print "Bins:", quant_bins.eval()

entropy = 0
for i in hist:
    if (i < 1e-4): continue
    entropy += i * math.log(i, 2)
entropy = -entropy
print "Entropy of distribution:", entropy

In [None]:
# read one file, preprocess it and cut it into windows (same steps as
# test_model_on_wav), then encode the windows into soft codes
[rate, data] = sciwav.read("./SA1.wav")
data = data.astype(np.float32)
processedWave, wparams = preprocess_waveform(data)
windows = extract_windows(processedWave, STEP_SIZE, OVERLAP_SIZE)

# add the channel axis expected by the encoder
transformed = np.reshape(windows, (windows.shape[0], WINDOW_SIZE, 1))
embed = ac_enc.predict(transformed, batch_size = BATCH_SIZE, verbose = 1)

In [None]:
# decode the soft codes back into windows
recons = ac_dec.predict(embed, batch_size = BATCH_SIZE, verbose = 1)

In [None]:
# largest bin probability of every vector in window 31 (values near 1 mean the
# soft code is close to one-hot)
print np.max(embed[31], axis = -1)

In [None]:
# visual comparison for a single window
idx = 30

orig = windows[idx].flatten()
recn = recons[idx].flatten()

# original window; remember its y-range so the next plot uses the same scale
plt.plot(orig)
ylim = plt.gca().get_ylim()
plt.show()

# decoder output for the same window, on the same y-range
plt.plot(recn)
plt.ylim(ylim)
plt.show()

# the dequantized code itself (codebook vectors weighted by the bin probabilities)
plt.plot(unquantize_vec(embed[idx]).eval())
plt.show()

# absolute reconstruction error per sample
plt.plot(abs(orig - recn))
plt.show()