In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import random
import h5py
from keras.datasets import cifar10
from keras.models import *
from keras.layers import *
from keras.layers.core import *
from keras.layers.normalization import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
from keras.regularizers import *
import theano.tensor as T
import theano
from theano.tensor.shared_randomstreams import RandomStreams
from sklearn import metrics
from skimage.measure import compare_ssim
from scipy.misc import toimage
from sklearn.preprocessing import *

import os
import random
import time
from skimage import io, exposure, feature, color, transform
import matplotlib
import matplotlib.pyplot as plt
import glob

import scipy.signal as sig
import operator
import math

# for reproducibility: seed NumPy's global RNG and Python's `random` module.
# NOTE(review): no Theano-side random stream is seeded in this cell -- confirm
# whether any backend randomness used later needs its own seed.
np.random.seed(1337) 
random.seed(1337)

Using Theano backend.
Using gpu device 1: GeForce GTX 690 (CNMeM is disabled, cuDNN 5105)


In [2]:
# parameters for sliding window, and window function (Hann)
# each window holds STEP_SIZE new samples plus OVERLAP_SIZE samples of overlap,
# i.e. WINDOW_SIZE = 480 + 32 = 512 samples
STEP_SIZE = 480
OVERLAP_SIZE = 32
WINDOW_SIZE = STEP_SIZE + OVERLAP_SIZE
# Hann window of twice the overlap length (passed to reconstructFromWindows later)
OVERLAP_FUNC = sig.hann(OVERLAP_SIZE * 2)

# directory that contains TIMIT files
TIMIT_DIR = "/home/sri/Desktop/timit"

# default training hyper-parameters
# (both names are reassigned in the training cell further down)
NUM_EPOCHS = 1
BATCH_SIZE = 64

# randomly shuffle data before partitioning into training/validation?
RANDOM_SHUFFLE = True

# sample rate of input file (used in MFCC calculation)
# NOTE(review): not referenced by any code visible in this notebook
SAMPLE_RATE = 16000

In [3]:
from load_TIMIT import *
from windowingFunctions import *
from utility import *

In [4]:
# read in WAVs from the TIMIT training set
# (presumably the second argument caps the number of files at 2000 --
#  confirm against load_TIMIT.load_TIMIT_train)
rawWaveforms = load_TIMIT_train(TIMIT_DIR, 2000)

Reading in .wav files...


In [5]:
# waveform preprocessing
def preprocessWaveform(waveform):
    """Whole-waveform pre-processing hook (currently an identity).

    Returns the waveform unchanged together with an empty parameter tuple,
    which is the value later handed back to `unpreprocessWaveform`.
    """
    no_params = ()
    return waveform, no_params
   
def unpreprocessWaveform(waveform, params):
    """Inverse of `preprocessWaveform` (currently an identity).

    `params` is accepted for symmetry with `preprocessWaveform` but is not
    used; the waveform is returned unchanged.
    """
    restored = waveform
    return restored



# window preprocessing
def preprocessWindows(windows):
    """Scale every window so that its peak magnitude becomes 0.98.

    Parameters
    ----------
    windows : array-like, shape (num_windows, window_len[, ...])
        Raw windows.  The input is never modified.

    Returns
    -------
    processed : ndarray
        Floating-point copy of `windows` where each window was divided by its
        own peak magnitude (taken along axis 1) and multiplied by 0.98.
    params : tuple
        `(maxabs,)` -- the per-window peak magnitudes, to be handed to
        `unpreprocessWindows`.
    """
    windows = np.asarray(windows)

    # Work on a floating-point copy: in-place division on an integer array
    # would either fail or truncate.  Float inputs keep their dtype.
    processed = np.array(windows, dtype = np.result_type(windows.dtype, np.float32))

    # peak magnitude of every window
    maxabs = np.max(np.abs(processed), axis = 1)

    # An all-zero (silent) window has a peak of 0; dividing by it would fill
    # the window with NaNs.  Divide those windows by 1 instead so they stay
    # zero.  The reported scale stays 0, so they are restored as silence.
    divisor = maxabs.copy()
    divisor[divisor == 0] = 1

    # broadcast the per-window divisor along the sample axis
    processed /= np.expand_dims(divisor, 1)
    processed *= 0.98

    return processed, (maxabs,)

def unpreprocessWindows(windows, params):
    """Undo `preprocessWindows`: rescale each window by its stored peak.

    Parameters
    ----------
    windows : array-like, shape (num_windows, ...)
        Windows in the normalised domain (peak magnitude 0.98).  The input is
        never modified.
    params : tuple
        The parameter tuple returned by `preprocessWindows`; `params[0]`
        holds one scale per window.

    Returns
    -------
    ndarray
        Floating-point copy of `windows`, divided by 0.98 and multiplied by
        the matching per-window scale.
    """
    scl = params[0]

    windows = np.asarray(windows)
    # floating-point copy so the in-place arithmetic below is always legal
    unprocessed = np.array(windows, dtype = np.result_type(windows.dtype, np.float32))
    unprocessed /= 0.98

    # one scale per window; surplus scales (if any) are ignored
    scale = np.asarray(scl)[:unprocessed.shape[0]]

    # Insert singleton axes after the window axis so that scale[i] broadcasts
    # against windows[i] exactly as a per-window multiplication would.
    missing_axes = unprocessed.ndim - scale.ndim
    if missing_axes > 0:
        scale = scale.reshape(scale.shape[:1] + (1,) * missing_axes + scale.shape[1:])

    unprocessed *= scale

    return unprocessed

In [6]:
# waveform preprocessing: work on a copy so rawWaveforms stays untouched
processedWaveforms = np.copy(rawWaveforms)

# run every waveform through preprocessWaveform and discard the returned params
# (preprocessWaveform is currently an identity, so no volume change is applied)
for i in xrange(0, len(processedWaveforms)):
    processedWaveforms[i], _ = preprocessWaveform(processedWaveforms[i])

In [7]:
# extract windows from all waveforms
# (the printed shape below shows one row per window -- presumably that is what
#  collapse = True does; confirm in windowingFunctions)
rawWindows = extractWindowsMultiple(processedWaveforms, STEP_SIZE, OVERLAP_SIZE,
                                    collapse = True)

# randomly shuffle data: np.random.permutation returns a copy shuffled along
# the first axis, i.e. the order of the windows
if (RANDOM_SHUFFLE):
    rawWindows = np.random.permutation(rawWindows)

# sanity check: overall shape and value range of the raw windows
print "Raw windows shape: ", rawWindows.shape
print "Max: ", np.amax(rawWindows)
print "Min: ", np.amin(rawWindows)

Raw windows shape:  (203086, 512)
Max:  17885.0
Min:  -17139.0


In [8]:
# data augmentation goes here, at some point
# (placeholder: for now the "augmented" set is a plain copy of the raw windows)
augWindows = np.copy(rawWindows)

print "Aug windows shape: ", augWindows.shape

Aug windows shape:  (203086, 512)


In [9]:
# normalise every window to a peak of 0.98; pwParams keeps the per-window scales
processedWindows, pwParams = preprocessWindows(augWindows)

In [10]:
# reshape into vector form: add a trailing axis of size 1 so each window is
# (WINDOW_SIZE, 1), the input shape the models below are built with
processedWindows = np.reshape(processedWindows, (processedWindows.shape[0], WINDOW_SIZE, 1))

In [11]:
# sanity check on the normalised training data: shape, then global
# mean / standard deviation / minimum / maximum over all samples
print processedWindows.shape

print np.mean(processedWindows, axis=None)
print np.std(processedWindows, axis=None)
print np.min(processedWindows, axis = None)
print np.max(processedWindows, axis = None)

(203086, 512, 1)
0.0184928
0.28096
-0.98
0.98


In [12]:
class Binarize(T.Op):
    """Theano op that maps every element to -1 (negative) or +1 (otherwise).

    The backward pass hands the incoming gradient through unchanged.
    """
    # no constructor parameters distinguish one instance from another
    __props__ = ()

    def __init__(self):
        super(Binarize, self).__init__()

    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        tensor_in = T.as_tensor_variable(x)
        # the output has the same type as the input
        return theano.Apply(self, [tensor_in], [tensor_in.type()])

    def perform(self, node, inputs, output_storage):
        (values,) = inputs
        (out_cell,) = output_storage

        # TODO: learn threshold per parameter?
        signs = np.copy(values)
        # both masks are taken before any write, so they cannot interfere
        negative = signs < 0
        non_negative = signs >= 0
        signs[negative] = -1
        signs[non_negative] = 1
        out_cell[0] = signs

    def grad(self, input, output_gradients):
        # pass through gradients unchanged
        # (the original author was unsure there is a mathematical justification)
        upstream = output_gradients[0]
        return [upstream]

    def infer_shape(self, node, i0_shapes):
        # element-wise op: output shape equals input shape
        return i0_shapes



In [13]:
class StochasticBinarize(T.Op):
    """Theano op that randomly rounds each element to -1 or +1.

    An element x becomes +1 with probability (x + 1) / 2 and -1 otherwise, so
    for x in [-1, 1] the expected output equals x.  The backward pass hands
    the incoming gradient through unchanged.

    Randomness comes from NumPy's global RNG (`np.random`).
    """
    # no constructor parameters distinguish one instance from another
    __props__ = ()

    def __init__(self):
        super(StochasticBinarize, self).__init__()

    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        x = T.as_tensor_variable(x)
        # the output is declared with the same type (and dtype) as the input
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage):
        x, = inputs
        z, = output_storage

        # probability of emitting +1 for every element
        prob_plus_one = (x + 1.0) / 2.0
        draws = np.random.random_sample(x.shape)

        # A uniform draw above the threshold yields -1, otherwise +1.
        # Cast to the input dtype: the node declares its output with the
        # input's type, so a hard-coded float32 would be wrong for any other
        # input dtype.
        # TODO: learn threshold per parameter?
        z[0] = np.where(draws > prob_plus_one, -1.0, 1.0).astype(x.dtype)

    def grad(self, input, output_gradients):
        # pass through gradients unchanged (since expected value
        # is just x)
        return [output_gradients[0]]

    def infer_shape(self, node, i0_shapes):
        # output shape is same as input shape
        return i0_shapes

In [14]:
class QuantizeProbabilities(T.Op):
    """Theano op that snaps values to `num_bins` evenly spaced levels.

    Every element x is replaced by round(x * (num_bins - 1)) / (num_bins - 1);
    for x in [0, 1] the levels are 0, 1/(num_bins-1), ..., 1.  The backward
    pass uses the "straight-through" estimator (gradient passed unchanged).
    """
    # `num_bins` changes what the op computes, so it must be part of the op's
    # identity.  With an empty __props__ every instance compares equal and
    # ops built with different bin counts could be treated as the same op.
    __props__ = ("num_bins",)

    def __init__(self, num_bins):
        super(QuantizeProbabilities, self).__init__()
        # with fewer than two bins the step 1 / (num_bins - 1) is undefined
        if num_bins < 2:
            raise ValueError("num_bins must be at least 2, got %r" % (num_bins,))
        self.num_bins = num_bins

    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        x = T.as_tensor_variable(x)
        # the output has the same type as the input
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage):
        x, = inputs
        z, = output_storage

        # number of intervals between the first and the last level
        # TODO: learn threshold per parameter?
        steps = self.num_bins - 1
        z[0] = np.round(x * steps) / float(steps)

    def grad(self, input, output_gradients):
        # pass through gradients unchanged
        #     "straight-through" estimator
        g = output_gradients[0]

        return [g]

    def infer_shape(self, node, i0_shapes):
        # output shape is same as input shape
        return i0_shapes

In [15]:
class OneHotArgmax(T.Op):
    """Theano op that marks the maximum along the last axis of a 3-D tensor.

    Entries equal to the maximum of their axis-2 slice become 1, all others
    become 0 (ties produce several ones).  The backward pass hands the
    incoming gradient through unchanged.
    """
    # no constructor parameters distinguish one instance from another
    __props__ = ()

    def __init__(self, n):
        # NOTE: `n` is accepted but neither stored nor used
        super(OneHotArgmax, self).__init__()

    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        tensor_in = T.as_tensor_variable(x)
        return theano.Apply(self, [tensor_in], [tensor_in.type()])

    def perform(self, node, inputs, output_storage):
        (scores,) = inputs
        (out_cell,) = output_storage

        # per-(batch, position) maximum, kept as a size-1 last axis so that it
        # broadcasts against `scores`
        peak = np.max(scores, axis = 2).reshape(scores.shape[0], scores.shape[1], 1)
        delta = scores - peak

        # both masks are taken before any write, so they cannot interfere
        at_peak = delta >= 0
        below_peak = delta < 0
        delta[at_peak] = 1
        delta[below_peak] = 0
        out_cell[0] = delta

    def grad(self, input, output_gradients):
        # pass through gradients unchanged
        upstream = output_gradients[0]
        return [upstream]

    def infer_shape(self, node, i0_shapes):
        # output shape equals input shape
        return i0_shapes

In [16]:
class PhaseShift1D(Layer):
    """ PhaseShift1D
    Takes vector of size: B x S x nF
    And returns vector: B x nS x F

    The channel axis is split into F groups of n consecutive channels; the n
    channels of each group are then laid out along the time axis, so every
    input step becomes n output steps with F channels each.  The number of
    input channels must be divisible by `n`.
    """
    def __init__(self, n, **kwargs):
        super(PhaseShift1D, self).__init__(**kwargs)
        self.n = n

    def build(self, input_shape):
        # no trainable parameters
        self.trainable_weights = []

    def call(self, x, mask=None):
        batch, steps, channels = x.shape[0], x.shape[1], x.shape[2]
        # explicit floor division: shape entries must stay integers no matter
        # which division semantics are in effect
        out_channels = channels // self.n

        # (B, S, n*F) -> (B, S, F, n) -> (B, S, n, F) -> (B, n*S, F)
        r = T.reshape(x, (batch, steps, out_channels, self.n))
        r = T.transpose(r, (0, 1, 3, 2))
        r = T.reshape(r, (batch, steps * self.n, out_channels))
        return r

    def get_output_shape_for(self, input_shape):
        # an unknown (None) number of steps stays unknown
        steps = input_shape[1]
        out_steps = None if steps is None else steps * self.n
        return (input_shape[0], out_steps, input_shape[2] // self.n)

    def get_config(self):
        # expose `n` so the layer can be re-created from its config
        config = {'n' : self.n}
        base_config = super(PhaseShift1D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [30]:
from keras.activations import softmax, sigmoid

# lambda to compute MSE between 2 vectors
def mse_lambda(vects):
    """Return the mean squared difference of the two tensors in `vects`.

    `vects` must unpack into exactly two tensors; the mean is taken over all
    elements.
    """
    first, second = vects
    squared_error = K.square(first - second)
    return K.mean(squared_error)

# freeze weights for stacked training
def make_trainable(net, val):
    """Set the `trainable` flag of `net` and of each entry in `net.layers`.

    Only the direct children listed in `net.layers` are touched; nothing is
    returned.
    """
    net.trainable = val
    for layer in net.layers:
        layer.trainable = val

# we generate a new optimizer of the same kind for every model
# we train
def opti():
    """Return a brand-new Adam optimizer built with its default arguments."""
    fresh_optimizer = Adam()
    return fresh_optimizer

# shape of one network input: WINDOW_SIZE samples, one channel
input_dim = (WINDOW_SIZE, 1)
# total number of values in one input window
input_size = np.prod(input_dim)
# number of code positions in the bottleneck
bottleneck_size = 128
# number of categories per code position (the softmax in `sampling` runs over this axis)
num_cats = 2


# softmax "upsampling" initialization
# identity matrix repeated, plus uniform noise
def categorical_upsampling(input_dim, var_dim):
    """Build a weight initializer for a Dense layer that fans every input
    unit out to `var_dim` consecutive output units.

    The returned `init(shape, name=None)` expects
    `shape == (input_dim, input_dim * var_dim)` and produces an identity
    matrix whose columns are each repeated `var_dim` times, perturbed by
    multiplicative Gaussian noise (mean 1, std 0.1) and additive uniform noise
    in [-0.1, 0.1).
    """
    def init(shape, name = None):
        assert(shape[-1] == input_dim * var_dim)

        # noise is drawn from NumPy's global RNG (uniform first, then normal)
        random_additive = np.random.uniform(-0.1, 0.1, shape)
        ident = np.eye(input_dim).repeat(var_dim, axis = 1)
        random_multiplicative = np.random.normal(1.0, 0.1, shape)

        # forward the name supplied by the caller so the weight is not left
        # anonymous
        return K.variable(ident * random_multiplicative + random_additive,
                          name = name)

    return init






# Softmax temperature and its annealing schedule.
# `tau` divides the logits in `sampling`; the training loop multiplies it by
# exp(-anneal_rate * (epoch + 1)) and never lets it drop below min_temperature.
# (The heading used to read "Gumbel-Max sampling"; the Gumbel-noise variant is
#  the disabled string literal below.)
tau = K.variable(2.0, name="temperature")
anneal_rate = 0.01
min_temperature = 0.01

# disabled earlier variant (kept as a bare string literal, never executed)
'''
def sampling(logits_y):
    u = K.random_uniform(K.shape(logits_y), 0, 1)
    gumbel_noise = -K.log(-K.log(u + 1e-20) + 1e-20)

    probs = K.sigmoid((gumbel_noise + logits_y) / tau)
    return probs
'''

#discrete_values = K.variable([0.0, 1.0])
def sampling(logits_y):
    """Turn flat logits into per-position category probabilities.

    The logits are regrouped to (-1, bottleneck_size, num_cats), divided by
    the temperature `tau`, and passed through `softmax`.  No Gumbel noise is
    added, and the probabilities themselves are returned (not an expected
    value).
    """
    grouped_logits = K.reshape(logits_y, (-1, bottleneck_size, num_cats))
    tempered = grouped_logits / tau
    return softmax(tempered)



def encoder_residual_block(output_dim = 64, filt_size = 5, subsample = True):
    """Return a function that applies one encoder residual block to a tensor.

    Residual branch: conv (stride 2 when `subsample`, else 1) -> LeakyReLU ->
    conv.  Shortcut branch: width-1 conv with the same stride.  The two
    branches are summed and passed through a final LeakyReLU.  Every
    convolution has `output_dim` filters.
    """
    stride = 2 if subsample else 1

    # options shared by all three convolutions of the block
    conv_options = dict(border_mode = 'same', init = 'he_uniform',
                        activation = 'linear', bias = True)

    def f(input):
        # residual branch
        strided = Convolution1D(output_dim, filt_size,
                                subsample_length = stride, **conv_options)(input)
        activated = LeakyReLU(0.3)(strided)
        residual = Convolution1D(output_dim, filt_size, **conv_options)(activated)

        # shortcut branch: matches the residual's length and channel count
        shortcut = Convolution1D(output_dim, 1,
                                 subsample_length = stride, **conv_options)(input)

        summed = merge([shortcut, residual], mode = 'sum')
        return LeakyReLU(0.3)(summed)

    return f

def decoder_residual_block(output_dim = 64, filt_size = 5, upsample = True):
    """Return a function that applies one decoder residual block to a tensor.

    Residual branch: conv -> LeakyReLU -> conv.  Shortcut branch: width-1
    conv.  The branches are summed and passed through a LeakyReLU.  When
    `upsample` is set the convolutions use twice `output_dim` filters and the
    result goes through `PhaseShift1D(2)`.
    """
    # an upsampling block needs twice the filters; PhaseShift1D(2) then turns
    # them back into `output_dim` channels
    nfilts = output_dim * 2 if upsample else output_dim

    # options shared by all three convolutions of the block
    conv_options = dict(border_mode = 'same', init = 'he_uniform',
                        activation = 'linear', bias = True)

    def f(input):
        # residual branch
        first = Convolution1D(nfilts, filt_size, **conv_options)(input)
        activated = LeakyReLU(0.3)(first)
        residual = Convolution1D(nfilts, filt_size, **conv_options)(activated)

        # shortcut branch
        shortcut = Convolution1D(nfilts, 1, **conv_options)(input)

        summed = merge([shortcut, residual], mode = 'sum')
        block_out = LeakyReLU(0.3)(summed)

        if not upsample:
            return block_out
        return PhaseShift1D(2)(block_out)

    return f
#'''

def hard_tanh(x):
    """Clamp every element of `x` to the interval [-1, 1]."""
    lower, upper = -1.0, 1.0
    return K.clip(x, lower, upper)

# ---------------------------------------------------------------------------
# autoencoder: takes an audio window, compresses it, and tries to reconstruct it
# ---------------------------------------------------------------------------
def autoencoder_structure(dim):
    """Build the two halves of the autoencoder as separate Keras models.

    Parameters
    ----------
    dim : tuple
        Shape of one input window (without the batch axis).

    Returns
    -------
    (enc, dec)
        `enc` maps a window to a (bottleneck_size, num_cats) tensor of
        category probabilities (see `sampling`); `dec` maps such a tensor back
        to a window.

    The shape comments below assume dim == (512, 1), the value used in this
    notebook.  The encoder must reduce the time axis to exactly
    `bottleneck_size` steps, otherwise the Reshape after its last convolution
    fails.
    """
    enc_input = Input(shape = dim)

    # corrupt input slightly as a form of regularization
    #enc = GaussianDropout(0.05, input_shape = dim)(enc_input)

    # (512x1) => (512x48)
    enc = encoder_residual_block(48, 9, False)(enc_input)

    # (512x48) => (256x48)
    enc = encoder_residual_block(48, 9, True)(enc)

    # (256x48) => (256x48)
    enc = encoder_residual_block(48, 9, False)(enc)

    # (256x48) => (128x48)
    enc = encoder_residual_block(48, 9, True)(enc)

    # (128x48) => (128x1)
    enc = Convolution1D(1, 9, border_mode = 'same',
                              init = 'he_uniform', activation = 'linear',
                              bias = True)(enc)
    enc = LeakyReLU(0.3)(enc)

    # (128x1) => (128)
    enc = Reshape((bottleneck_size,))(enc)

    # (128) => (256): num_cats logits for every code position
    enc = Dense(bottleneck_size * num_cats,
                init = categorical_upsampling(bottleneck_size, num_cats),
                activation = 'linear')(enc)

    # (256) => (128x2): tempered softmax over the categories of each position
    enc = (Lambda(sampling, output_shape=(bottleneck_size, num_cats)))(enc)
    #enc = (Lambda(lambda x : QuantizeProbabilities(16)(x), output_shape=(bottleneck_size, num_cats)))(enc)

    enc = Model(input = enc_input, output = enc)


    dec_input = Input(shape = (bottleneck_size, num_cats))

    # (128x2) => (128x1): width-1 conv mixes the categories into one value
    dec = Convolution1D(1, 1, border_mode = 'same',
                              init = 'he_uniform', activation = 'linear',
                              bias = True)(dec_input)

    # (128x1) => (128) => (128) => (128x1)
    # sizes follow `bottleneck_size` instead of a hard-coded 128; the
    # identity-initialised Dense layer needs a square weight matrix
    dec = Reshape((bottleneck_size,))(dec)
    dec = Dense(bottleneck_size, activation = 'linear', init = 'identity')(dec)
    dec = LeakyReLU(0.3)(dec)
    dec = Reshape((bottleneck_size, 1,))(dec)

    # (128x1) => (256x32)
    dec = decoder_residual_block(32, 9, True)(dec)

    # (256x32) => (256x32)
    dec = decoder_residual_block(32, 9, False)(dec)

    # (256x32) => (512x32)
    dec = decoder_residual_block(32, 9, True)(dec)

    # (512x32) => (512x32)
    dec = decoder_residual_block(32, 9, False)(dec)

    # (512x32) => (512x1), tanh keeps the output in [-1, 1]
    dec = Convolution1D(1, 9, border_mode = 'same',
                              init = 'he_uniform', activation = 'tanh',
                              bias = True)(dec)

    dec = Model(input = dec_input, output = dec)

    return enc, dec

# ---------------------------------------------------------------------------
# discriminator: tries to differentiate between original and reconstructed samples
# ---------------------------------------------------------------------------
def discriminator_structure(dim):
    """Build the discriminator as a Sequential model.

    Five Convolution1D layers (32 filters, width 5), each followed by
    LeakyReLU(0.3), then Flatten, Dense(48) + LeakyReLU(0.3) and a final
    sigmoid unit.  `dim` is the shape of one input window (without the batch
    axis).
    """
    # (border_mode, stride) of the five convolutions, in order
    conv_plan = [('same', 2), ('same', 1), ('valid', 2), ('same', 1), ('valid', 2)]

    dsc = Sequential()

    for border, stride in conv_plan:
        dsc.add(Convolution1D(32, 5, border_mode = border, input_shape = dim,
                              init = 'uniform',
                              subsample_length = stride, activation = 'linear'))
        dsc.add(LeakyReLU(0.3))

    dsc.add(Flatten())

    # classifier head: one hidden layer, then a single sigmoid output
    dsc.add(Dense(48, activation = 'linear'))
    dsc.add(LeakyReLU(0.3))
    dsc.add(Dense(1, activation = 'sigmoid'))

    return dsc


# construct autoencoder to be used in adversarial training
# (the "aac" prefix was meant to abbreviate "Adversarial AutoenCoder")
aac_input = Input(shape = input_dim)
aac_enc, aac_dec = autoencoder_structure(input_dim)
# chain encoder and decoder on the shared input tensor
aac_embedding = aac_enc(aac_input)
#aac_embedding_quant = Lambda(lambda x : QuantizeProbabilities(4)(x))(aac_embedding)
aac_reconstructed = aac_dec(aac_embedding)

# stand-alone autoencoder; used for prediction / evaluation in later cells
aac_autoencoder = Model(input = [aac_input], output = [aac_reconstructed])
aac_autoencoder.compile(loss = 'mean_squared_error', optimizer = opti())



# construct discriminator: regular
regdsc_input_dim = (WINDOW_SIZE, 1)
# note: the Input uses input_dim, which has the same value as regdsc_input_dim
regdsc_input = Input(shape = input_dim)
regdsc_struct = discriminator_structure(regdsc_input_dim)

# the same discriminator instance is applied twice: to a fresh input
# (regdsc_label) and to the autoencoder's reconstruction (aac_reg_label)
regdsc_label = regdsc_struct(regdsc_input)
aac_reg_label = regdsc_struct(aac_reconstructed)



def prob_quantization_loss(placeholder, code):
    """Penalise code values that sit far from both 0 and 1.

    The penalty is 10 * min(code, 1 - code); it is switched on only while the
    temperature `tau` is below 0.25 and is all zeros otherwise.  `placeholder`
    fills the target slot of the Keras loss signature and is ignored.
    """
    weight = 10.0
    distance_to_extreme = K.minimum(code, 1.0 - code)
    penalty = distance_to_extreme * weight

    return K.switch(tau < 0.25, penalty, K.zeros_like(code))





# compile model
# loss_weights has one entry per output of the combined model:
# [reconstruction MSE, discriminator cross-entropy]
loss_weights = [350.0, 1.0]
n_discrim = 1
n_code = 0
# number of reconstruction outputs = outputs that are neither discriminator
# labels nor code outputs (here 2 - 1 - 0 = 1)
lmult = len(loss_weights) - n_discrim - n_code




make_trainable(aac_autoencoder, False)

# stand-alone discriminator: trained on real vs. reconstructed windows
aac_discrim_reg = Model(input = [regdsc_input], output = [regdsc_label])
aac_discrim_reg.compile(loss = ['binary_crossentropy'], optimizer = opti())
aac_discrim_reg.summary()

aac_autoencoder.summary()

# combined model: autoencoder followed by the discriminator.  The
# discriminator is flagged non-trainable before compiling so that only the
# autoencoder is meant to be updated through this model.
# NOTE(review): confirm the installed Keras version freezes the nested
# discriminator layers when only the wrappers' `trainable` flags are set.
make_trainable(aac_discrim_reg, False)
make_trainable(aac_autoencoder, True)
model = Model(input = [aac_input], output = [aac_reconstructed] * lmult + \
                                            [aac_reg_label] + \
                                            [aac_embedding] * n_code)
model.compile(loss = ['mean_squared_error', \
                      'binary_crossentropy', \
                     ],#  prob_quantization_loss],
              loss_weights = loss_weights,
              optimizer = opti())
model.summary()

# training data: private copy of the normalised windows
X_train = np.copy(processedWindows)
ntrain = X_train.shape[0]



____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_10 (InputLayer)            (None, 512, 1)        0                                            
____________________________________________________________________________________________________
sequential_2 (Sequential)        (None, 1)             114593      input_10[0][0]                   
Total params: 114593
____________________________________________________________________________________________________
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_7 (InputLayer)             (None, 512, 1)        0                                            
______________________________________________________________________

In [31]:
def test_discriminator(discriminator, autoencoder, X, y, verbose = True):
    # verify discriminator was trained properly
    y_hat = discriminator.predict(X)
    y_hat[y_hat >= 0.5] = 1
    y_hat[y_hat < 0.5] = 0

    n_total = y.shape[0]
    n_correct = np.sum(np.ravel(y_hat) == y)

    acc = n_correct * 100.0 / n_total
    if (verbose):
        print "Discriminator accuracy: %0.02f pct (%d of %d) right"%(acc, n_correct, n_total)
    return acc

In [32]:
def autoencoderTest(waveFilename, prefix, autoencoder, verbose = True):
    """Run a WAV file through the autoencoder and score the reconstruction.

    The file is windowed and normalised, passed through `autoencoder`,
    de-normalised and overlap-added back into a waveform, which is written to
    `prefix + "_output.wav"`.  The same pipeline without the network gives the
    "desired" reference waveform.

    Returns
    -------
    list
        [max(desired), min(desired), max(recons), min(recons),
         mse(recons, desired), avgErr(recons, desired)]
    """
    [rate, data] = sciwav.read(waveFilename)
    processedWave, wparams = preprocessWaveform(data)
    windows = extractWindows(processedWave, STEP_SIZE, OVERLAP_SIZE)

    # Normalise the windows once; the result feeds both the reference path
    # and the network path (unpreprocessWindows works on a copy, so
    # `transformed` is not modified in between).
    transformed, tparams = preprocessWindows(windows)
    if (verbose):
        print(transformed.shape)

    # reference: what a perfect autoencoder would reconstruct
    desired = unpreprocessWindows(transformed, tparams)
    desired = reconstructFromWindows(desired, OVERLAP_SIZE, OVERLAP_FUNC)
    desired = unpreprocessWaveform(desired, wparams)
    #sciwav.write(prefix + "_res_desired.wav", rate, desired.astype(np.int16))

    # then, run NN on transformed windows
    net_input = np.reshape(transformed, (transformed.shape[0], WINDOW_SIZE, 1))
    autoencOutput = autoencoder.predict(net_input, batch_size = BATCH_SIZE, verbose = (1 if verbose else 0))
    if (verbose):
        print(autoencOutput.shape)
    autoencOutput = np.reshape(autoencOutput, (autoencOutput.shape[0], WINDOW_SIZE))

    if (verbose):
        print(autoencOutput.shape)
    recons = unpreprocessWindows(autoencOutput, tparams)
    recons = reconstructFromWindows(recons, OVERLAP_SIZE, OVERLAP_FUNC)
    recons = unpreprocessWaveform(recons, wparams)

    # Clip to the int16 range before casting: a reconstruction that overshoots
    # would otherwise wrap around and produce loud clicks in the written file.
    # The metrics below still use the unclipped signal.
    int16_limits = np.iinfo(np.int16)
    clipped = np.clip(recons, int16_limits.min, int16_limits.max)
    sciwav.write(prefix + "_output.wav", rate, clipped.astype(np.int16))

    results = [
        np.max(desired),
        np.min(desired),
        np.max(recons),
        np.min(recons),
        mse(recons, desired),
        avgErr(recons, desired)
    ]

    if (verbose):
        print("Max/min desired: %s %s" % (results[0], results[1]))
        print("Max/min recons:  %s %s" % (results[2], results[3]))
        print("%s  mse:  %s" % (waveFilename, results[4]))
        print("%s  avg err:  %s" % (waveFilename, results[5]))

    return results

In [33]:
# interleave two numpy arrays of the same size along the first axis
def interleave(a, b):
    """Return a new array whose rows alternate between `a` and `b`.

    Row 2*i of the result is a[i] and row 2*i + 1 is b[i].  The result is
    allocated with NumPy's default dtype and has twice as many rows as `a`.
    """
    doubled_shape = (2 * a.shape[0],) + tuple(a.shape[1:])
    merged = np.empty(doubled_shape)

    merged[0::2] = a   # even rows
    merged[1::2] = b   # odd rows
    return merged

In [34]:
np.set_printoptions(formatter={'float_kind':'{:4f}'.format})

BATCH_SIZE = 128
NUM_BATCHES = ntrain / BATCH_SIZE
NUM_EPOCHS = 10

lead = "    "
d_loss = 0.0
a_losses = []
d_acc = 0.0
discrim_train_y = np.concatenate((np.ones(ntrain), np.zeros(ntrain)))


for epoch in range(NUM_EPOCHS):
    print "Epoch " + str(epoch + 1) + ":"

    # present batches randomly each epoch
    lis = range(0, ntrain, BATCH_SIZE)
    random.shuffle(lis)
    
    # keep track of start time and current batch #
    i = 0
    startTime = time.time()
    for idx in lis:
        batch = X_train[idx:idx+BATCH_SIZE, :,  :]
        nbatch = batch.shape[0]
        
        a_losses = ["autoencoder not training"]
        d_loss = "discriminator not training"
        
        # train autoencoder, if discriminator accuracy is greater than 70%
        if (epoch >= 0):
            make_trainable(aac_autoencoder, True)
            make_trainable(aac_discrim_reg, False)
            
            a_y = [batch] * lmult + \
                  [np.ones(nbatch)] * n_discrim + \
                  [np.zeros((nbatch, bottleneck_size, num_cats))] * n_code
            a_losses = model.train_on_batch(batch, a_y)
        
        # train discriminator(s) on what the autoencoder now generates
        generated = aac_autoencoder.predict(batch)
        discrim_batch_X = interleave(batch, generated)
        discrim_batch_y = interleave(np.ones(nbatch), np.zeros(nbatch))
        
        make_trainable(aac_autoencoder, False)
        make_trainable(aac_discrim_reg, True)
        d_loss = aac_discrim_reg.train_on_batch(discrim_batch_X, discrim_batch_y)
        
        if (epoch < 0 and d_loss < 0.2):
            print ""
            print lead + "Terminating epoch early (don't wanna overfit!)"
            break
        
        # print statistics every 10 batches so we know stuff is still going down
        if (i % 10 == 0):
            printStr = "        \r" + lead + str(i * BATCH_SIZE) + ": " + str(d_loss) + " "
            print printStr,
            
            loss_arr = np.asarray(a_losses)
            print loss_arr,
            
            if (len(loss_arr) > 1):
                for i in xrange(0, len(loss_weights)):
                    loss_arr[i + 1] *= loss_weights[i]
                print loss_arr,
            
            print K.get_value(tau),
            K.set_value(tau, np.max([K.get_value(tau) * np.exp(-anneal_rate * (epoch + 1)), min_temperature]))
            #K.set_value(tau, np.min([K.get_value(tau) * (1 + anneal_rate), max_temperature]))
            
        i += 1
    print ""
    
    # print elapsed time for epoch
    elapsed = time.time() - startTime
    print lead + "Total time for epoch: " + str(elapsed) + "s"
    
    
    # ---------------------------------------------------------
    # evaluate discriminator on random samples every epoch
    # ---------------------------------------------------------
    startTime = time.time()
    print lead + "----------------"

    NUM = 200
    rows = np.random.randint(X_train.shape[0], size = NUM)
    generated = aac_autoencoder.predict(X_train[rows, :], verbose = 0)
    d_X = np.concatenate((X_train[rows, :], generated))
    d_y = np.concatenate((np.ones(NUM), np.zeros(NUM)))
    d_acc = test_discriminator(aac_discrim_reg, aac_autoencoder,
                               d_X, d_y, verbose = False)

    print lead + "Evaluated the discriminator: " + str(d_acc) + "% d_acc"
    elapsed = time.time() - startTime
    print lead + "Total time for evaluation: " + str(elapsed) + "s"
    
    
    # ---------------------------------------------------------
    # evaluate autoencoder on real data every epoch
    # ---------------------------------------------------------
    startTime = time.time()
    print lead + "----------------"
    
    print lead + "Evaluating autoencoder..."
    metrics = autoencoderTest("./SA1.WAV", "SA1_res_reg_train_epoch" + str(epoch+1), aac_autoencoder, verbose = False)
    
    print lead + "Max/min desired:", metrics[0], metrics[1]
    print lead + "Max/min recons: ", metrics[2], metrics[3]
    print lead + "MSE:     ", metrics[4]
    print lead + "Avg err: ", metrics[5]
    elapsed = time.time() - startTime
    print lead + "Total time for evaluation: " + str(elapsed) + "s"
    
    print ""

Epoch 1:
    1280: 0.501149356365  [7.408620 0.016820 1.521719] [7.408620 5.886900 1.521719] 0.344089895487 
    Total time for epoch: 815.398489952s
    ----------------
    Evaluated the discriminator: 71.5% d_acc
    Total time for evaluation: 2.15358710289s
    ----------------
    Evaluating autoencoder...
    Max/min desired: 4899.0 -4013.0
    Max/min recons:  4177.64 -2923.8
    MSE:      12605.4
    Avg err:  58.0959
    Total time for evaluation: 0.133603096008s

Epoch 2:
    1280: 0.556713700294  [5.926343 0.013644 1.151003] [5.926343 4.775341 1.151003] 0.0541037097573

KeyboardInterrupt: 

In [35]:
# Disabled cell: the save calls are wrapped in a string literal, so nothing is
# written to disk when this cell runs.
'''model.save('model_reg_adversary.h5')
aac_autoencoder.save('auto_reg_adversary.h5')
aac_discrim_reg.save('discrim_reg_adversary.h5')
'''

"model.save('model_reg_adversary.h5')\naac_autoencoder.save('auto_reg_adversary.h5')\naac_discrim_reg.save('discrim_reg_adversary.h5')\n"




In [36]:
# Disabled cell: the load calls are wrapped in a string literal and never run.
# NOTE(review): BinarizeLayer, code_binary_loss and code_maximize_variance are
# not defined anywhere in the visible notebook -- the custom-object map would
# need updating before this cell can be re-enabled.
'''
from keras.models import load_model

objs = {'PhaseShift1D' : PhaseShift1D,
        'BinarizeLayer' : BinarizeLayer,
        'code_binary_loss' : code_binary_loss,
        'code_maximize_variance' : code_maximize_variance}

model = load_model('model_reg_adversary.h5', objs)
aac_autoencoder = load_model('auto_reg_adversary.h5', objs)
aac_discrim_reg = load_model('discrim_reg_adversary.h5', objs)
'''

"\nfrom keras.models import load_model\n\nobjs = {'PhaseShift1D' : PhaseShift1D,\n        'BinarizeLayer' : BinarizeLayer,\n        'code_binary_loss' : code_binary_loss,\n        'code_maximize_variance' : code_maximize_variance}\n\nmodel = load_model('model_reg_adversary.h5', objs)\naac_autoencoder = load_model('auto_reg_adversary.h5', objs)\naac_discrim_reg = load_model('discrim_reg_adversary.h5', objs)\n"

In [37]:
# layer lists of the two nested sub-models of the combined model
# (index 1 is used as the encoder and index 2 as the decoder in the cells below)
enc = model.layers[1].layers
dec = model.layers[2].layers


In [38]:
# evaluate the discriminator on NUM random training windows (label 1) and
# their reconstructions (label 0); same procedure as the per-epoch check in
# the training loop
NUM = 400
rows = np.random.randint(X_train.shape[0], size = NUM)
generated = aac_autoencoder.predict(X_train[rows, :], verbose = 0)
d_X = np.concatenate((X_train[rows, :], generated))
d_y = np.concatenate((np.ones(NUM), np.zeros(NUM)))
d_acc = test_discriminator(aac_discrim_reg, aac_autoencoder,
                           d_X, d_y, verbose = False)

print lead + "Evaluated the discriminator: " + str(d_acc) + "% d_acc"

    Evaluated the discriminator: 72.625% d_acc


In [39]:
# reconstruct three test files; each call writes "<prefix>_output.wav" and
# prints its metrics (the last call's return value is the cell output)
autoencoderTest("./SA1.WAV", "SA1_aac_reg_", aac_autoencoder)
autoencoderTest("./SX383.WAV", "SX383_aac_reg_", aac_autoencoder)
autoencoderTest("./fiveYears.wav", "fy_aac_reg_", aac_autoencoder)

(112, 512)
(112, 512, 1)
(112, 512)
Max/min desired: 4899.0 -4013.0
Max/min recons:  4528.44 -2987.4
./SA1.WAV  mse:  12414.9
./SA1.WAV  avg err:  55.9701
(93, 512)
(93, 512, 1)
(93, 512)
Max/min desired: 2961.0 -3057.0
Max/min recons:  2859.69 -2652.09
./SX383.WAV  mse:  5752.58
./SX383.WAV  avg err:  40.7908
(181, 512)
(181, 512, 1)
(181, 512)
Max/min desired: 24636.0 -20122.0
Max/min recons:  21431.6 -17516.6
./fiveYears.wav  mse:  2.95299e+06
./fiveYears.wav  avg err:  1219.3


[24636.0, -20122.0, 21431.572, -17516.637, 2952988.2, 1219.2957]

In [40]:
# encode one file by hand: window it, normalise the windows, and run only the
# encoder half to obtain the per-position category probabilities
[rate, data] = sciwav.read("./SA1.WAV")
processedWave, wparams = preprocessWaveform(data)
windows = extractWindows(processedWave, STEP_SIZE, OVERLAP_SIZE)

# tparams keeps the per-window scales needed to undo the normalisation later
transformed, tparams = preprocessWindows(windows)

transformed = np.reshape(transformed, (transformed.shape[0], WINDOW_SIZE, 1))
embed = aac_enc.predict(transformed, batch_size = BATCH_SIZE, verbose = 1)



In [41]:
# harden the code: mark, for every code position, the category with the
# highest probability (entries equal to the per-position maximum become true)
r = K.max(K.reshape(embed, (-1, bottleneck_size, 2)), axis=-1, keepdims=True)
r = K.equal(K.reshape(embed, (-1, bottleneck_size, 2)), r)
r = r.eval()

#r = np.round(embed)
# show the soft probabilities of the first window for reference
print embed[0]
#print r[0]

# decode the hardened code and rebuild the waveform
autoencOutput = aac_dec.predict(r, batch_size = BATCH_SIZE, verbose = 1)
print autoencOutput.shape
autoencOutput = np.reshape(autoencOutput, (autoencOutput.shape[0], WINDOW_SIZE))

print autoencOutput.shape
recons = unpreprocessWindows(autoencOutput, tparams)

wav = reconstructFromWindows(recons, OVERLAP_SIZE, OVERLAP_FUNC)
wav = unpreprocessWaveform(wav, wparams)
    
# NOTE(review): the cast to int16 is not preceded by clipping
sciwav.write("tst_output_reg.wav", rate, wav.astype(np.int16))

# NOTE(review): idx is not used by any code visible in this notebook
idx = 45

[[0.649480 0.350520]
 [0.486485 0.513515]
 [0.561945 0.438055]
 [0.464455 0.535545]
 [0.599743 0.400257]
 [0.613211 0.386789]
 [0.655272 0.344728]
 [0.691473 0.308527]
 [0.463022 0.536978]
 [0.452715 0.547285]
 [0.528035 0.471965]
 [0.505902 0.494098]
 [0.555262 0.444738]
 [0.598373 0.401627]
 [0.525736 0.474264]
 [0.531259 0.468741]
 [0.565394 0.434606]
 [0.433938 0.566062]
 [0.423601 0.576399]
 [0.676604 0.323396]
 [0.690718 0.309282]
 [0.500715 0.499285]
 [0.557301 0.442699]
 [0.635570 0.364430]
 [0.483987 0.516013]
 [0.553902 0.446098]
 [0.507907 0.492093]
 [0.496926 0.503074]
 [0.627263 0.372737]
 [0.394495 0.605505]
 [0.562014 0.437986]
 [0.446421 0.553579]
 [0.495109 0.504891]
 [0.598407 0.401593]
 [0.570480 0.429520]
 [0.595832 0.404168]
 [0.658765 0.341235]
 [0.498896 0.501104]
 [0.626392 0.373608]
 [0.511221 0.488779]
 [0.474161 0.525840]
 [0.516441 0.483559]
 [0.575016 0.424984]
 [0.661143 0.338857]
 [0.579046 0.420954]
 [0.551811 0.448189]
 [0.607478 0.392522]
 [0.557611 0.

In [42]:
# backend function from the encoder's input to the output of its
# second-to-last layer, i.e. the logits right before the `sampling` Lambda
logit_output = K.function([model.layers[1].layers[0].input], [model.layers[1].layers[-2].output])

# logits of the first window (wrapped in a list to form a batch of one)
logits = logit_output([[transformed[0]]])
print logits

[array([[0.026407, -0.006301, -0.005199, -0.002332, 0.003140, -0.010068,
        0.002300, 0.009853, 0.001155, -0.020291, 0.020618, -0.003820,
        0.038830, 0.004768, -0.023819, -0.066617, -0.029880, -0.022022,
        -0.029591, -0.019530, -0.022398, -0.028352, -0.003694, -0.004946,
        -0.021778, -0.033549, -0.000278, -0.021422, 0.004698, -0.000766,
        -0.010947, -0.017586, -0.026243, -0.040195, -0.018468, -0.004372,
        0.023586, 0.039920, 0.018397, -0.020752, -0.001003, -0.043614,
        -0.021103, -0.021255, -0.007967, -0.020176, -0.011804, -0.041300,
        -0.023727, -0.020328, -0.021384, -0.032863, -0.000224, -0.001901,
        -0.027614, -0.026962, -0.013652, -0.041255, -0.029561, -0.006839,
        -0.032146, -0.045369, -0.017231, -0.005822, -0.011916, -0.010878,
        -0.010292, -0.031443, -0.006313, -0.021364, -0.009375, -0.029958,
        0.009647, -0.025238, -0.005584, -0.005350, 0.001364, -0.026042,
        -0.025537, -0.027918, -0.038595, -0.033109,

In [43]:
# kernel and bias of layer 1 of the decoder sub-model
# (per the printed shapes: the width-1 convolution that mixes the two
#  categories into a single value)
print model.layers[2].layers[1].weights[0].eval()
print model.layers[2].layers[1].weights[1].eval()

CudaNdarray([[[[1.493956]
   [-1.301944]]]])
CudaNdarray([0.006829])


In [44]:
# weight matrix of layer 3 of the decoder sub-model
# (per the printed values: the identity-initialised Dense layer after training)
print model.layers[2].layers[3].weights[0].eval()

CudaNdarray([[1.365355 0.247365 -0.044743 ..., -0.018078 0.034547 -0.006794]
 [0.007955 1.120362 0.207719 ..., 0.090497 0.065630 -0.182380]
 [0.006915 0.130459 1.002491 ..., -0.080762 -0.144568 0.013299]
 ..., 
 [0.019009 -0.093426 0.008402 ..., 0.933239 0.125922 0.153007]
 [-0.062852 -0.015699 0.083822 ..., 0.049544 1.143798 0.045510]
 [0.121907 0.136193 -0.016438 ..., 0.055595 0.129919 1.182899]])
