In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import random
import h5py
from keras.datasets import cifar10
from keras.models import *
from keras.layers import *
from keras.layers.core import *
from keras.layers.normalization import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
from keras.regularizers import *
import theano.tensor as T
import theano
from theano.tensor.shared_randomstreams import RandomStreams
from sklearn import metrics
from skimage.measure import compare_ssim
from scipy.misc import toimage
from sklearn.preprocessing import *

import os
import random
import time
from skimage import io, exposure, feature, color, transform
import matplotlib
import matplotlib.pyplot as plt
import glob

import scipy.signal as sig
import operator
import math

# for reproducibility
np.random.seed(1337) 
random.seed(1337)

Using Theano backend.
Using gpu device 0: GeForce GTX 690 (CNMeM is disabled, cuDNN 5105)


In [2]:
# parameters for sliding window, and window function (Hann)
STEP_SIZE = 480
OVERLAP_SIZE = 32
WINDOW_SIZE = STEP_SIZE + OVERLAP_SIZE
OVERLAP_FUNC = sig.hann(OVERLAP_SIZE * 2)

# directory that contains TIMIT files
TIMIT_DIR = "/home/sri/Desktop/timit"

# directory that contains .wav files to process
NUM_EPOCHS = 1
BATCH_SIZE = 64

# randomly shuffle data before partitioning into training/validation?
RANDOM_SHUFFLE = True

# sample rate of input file (used in MFCC calculation)
SAMPLE_RATE = 16000

In [3]:
from load_TIMIT import *
from windowingFunctions import *
from utility import *

In [4]:
# read in 100 WAVs from TIMIT training set
rawWaveforms = load_TIMIT_train(TIMIT_DIR, 2000)

Reading in .wav files...


In [5]:
s = np.array([[-2, 2], [-3, 4], [-12, 6]]).astype('float32')

mn = np.min(s, axis = 1)
mx = np.max(s, axis = 1)

maxabs = np.maximum(np.abs(mn), np.abs(mx))

for i in xrange(0, s.shape[0]):
    s[i] /= maxabs[i]

print s

[[-1.    1.  ]
 [-0.75  1.  ]
 [-1.    0.5 ]]


In [6]:
# waveform preprocessing
def preprocessWaveform(waveform):   
    return waveform, ()
   
def unpreprocessWaveform(waveform, params):
    return waveform



# window preprocessing
def preprocessWindows(windows):
    # scale window between -1 and 1
    processed = np.copy(windows)
   
    mn = np.min(processed, axis = 1)
    mx = np.max(processed, axis = 1)

    maxabs = np.maximum(np.abs(mn), np.abs(mx))

    for i in xrange(0, processed.shape[0]):
        processed[i] /= maxabs[i]
    processed *= 0.98
   
    #processed = (processed + 1.0) / 2.0
   
    return processed, (maxabs,)

def unpreprocessWindows(windows, params):
    # scale window from [-1, 1] to [-32768, 32768]
    scl = params[0]
   
    unprocessed = np.copy(windows)
    unprocessed /= 0.98
   
    #nprocessed = (unprocessed * 2.0) - 1.0
   
    for i in xrange(0, unprocessed.shape[0]):
        unprocessed[i] *= scl[i]

    return unprocessed

In [7]:
# waveform preprocessing
processedWaveforms = np.copy(rawWaveforms)

# we maximize the volume of every waveform
for i in xrange(0, len(processedWaveforms)):
    processedWaveforms[i], _ = preprocessWaveform(processedWaveforms[i])

In [8]:
# extract windows
rawWindows = extractWindowsMultiple(processedWaveforms, STEP_SIZE, OVERLAP_SIZE,
                                    collapse = True)

# randomly shuffle data
if (RANDOM_SHUFFLE):
    rawWindows = np.random.permutation(rawWindows)

print "Raw windows shape: ", rawWindows.shape
print "Max: ", np.amax(rawWindows)
print "Min: ", np.amin(rawWindows)

Raw windows shape:  (203086, 512)
Max:  17885.0
Min:  -17139.0


In [9]:
# data augmentation goes here, at some point
augWindows = np.copy(rawWindows)

print "Aug windows shape: ", augWindows.shape

Aug windows shape:  (203086, 512)


In [10]:
processedWindows, pwParams = preprocessWindows(augWindows)

In [11]:
# reshape into vector form
processedWindows = np.reshape(processedWindows, (processedWindows.shape[0], WINDOW_SIZE, 1))

In [12]:
print processedWindows.shape

print np.mean(processedWindows, axis=None)
print np.std(processedWindows, axis=None)
print np.min(processedWindows, axis = None)
print np.max(processedWindows, axis = None)

(203086, 512, 1)
0.0184928
0.28096
-0.98
0.98


In [13]:
class Binarize(T.Op):
    # properties attribute
    __props__ = ()
    
    def __init__(self):
        super(Binarize, self).__init__()
        
    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        x = T.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])
    
    def perform(self, node, inputs, output_storage):
        x, = inputs
        z, = output_storage
        
        # TODO: learn threshold per parameter?
        z[0] = np.copy(x)
        z[0][z[0] < 0] = -1
        z[0][z[0] >= 0] = 1
    
    def grad(self, input, output_gradients):
        # pass through gradients unchanged
        # (i don't think there's a mathematical justification for this?)
        g = output_gradients[0]
        
        return [g]
        
    def infer_shape(self, node, i0_shapes):
        # output shape is same as input shape
        return i0_shapes



In [16]:
class StochasticBinarize(T.Op):
    # properties attribute
    __props__ = ()
    
    def __init__(self):
        super(StochasticBinarize, self).__init__()
        
    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        x = T.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])
    
    def perform(self, node, inputs, output_storage):
        x, = inputs
        z, = output_storage
        
        prob_thresh = (x + 1.0) / 2.0
        probs = np.random.random_sample(x.shape)
        res = np.greater(probs, prob_thresh)
        res = res.astype('float32') * 2.0 - 1.0
        res = -res
        
        # TODO: learn threshold per parameter?
        z[0] = np.copy(res)
    
    def grad(self, input, output_gradients):
        # pass through gradients unchanged (since expected value
        # is just x)
        return [output_gradients[0]]
        
    def infer_shape(self, node, i0_shapes):
        # output shape is same as input shape
        return i0_shapes

In [153]:
a = np.array([0.1, 0.5, 0.6, 0.9])

a[(a > 0.2) & (a < 0.6)] = 0
print a

[0.100000 0.000000 0.600000 0.900000]


In [202]:
class Quantize(T.Op):
    # properties attribute
    __props__ = ()
    
    def __init__(self):
        super(Quantize, self).__init__()
        
    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        x = T.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])
    
    def perform(self, node, inputs, output_storage):
        x, = inputs
        z, = output_storage
        
        # TODO: learn threshold per parameter?
        z[0] = np.copy(x)
        z[0][z[0] < -0.5] = -1
        z[0][z[0] > 0.5] = 1
        z[0][(z[0] > -0.5) & (z[0] < 0)] = 0.0
        z[0][(z[0] < 0.5) & (z[0] >= 0)] = 0.0
    
    def grad(self, input, output_gradients):
        # pass through gradients unchanged
        #     "straight-through" estimator
        g = output_gradients[0]
        g = (g * (T.gt(g, -1) * T.lt(g, 1)))
        
        return [g]
        
    def infer_shape(self, node, i0_shapes):
        # output shape is same as input shape
        return i0_shapes



In [203]:
# verify Binarize op works
x = T.matrix()
f = theano.function([x], Quantize()(x))
inp = np.random.uniform(high = 1, low = -1, size = (6, 6)).astype('float32')

print(inp)
print(f(inp))

[[-0.982349 0.307907 -0.776469 -0.997820 0.691836 -0.615329]
 [0.112066 0.760906 -0.640342 -0.275526 -0.027226 -0.034462]
 [-0.942956 0.583716 0.611643 -0.457133 -0.214127 -0.681998]
 [0.630336 -0.317181 0.288090 0.894566 -0.352252 0.619418]
 [-0.174802 -0.091286 0.082802 0.272072 -0.439580 0.150305]
 [0.789294 0.785343 0.113989 -0.466960 -0.510564 -0.956677]]
[[-1.000000 0.000000 -1.000000 -1.000000 1.000000 -1.000000]
 [0.000000 1.000000 -1.000000 0.000000 0.000000 0.000000]
 [-1.000000 1.000000 1.000000 0.000000 0.000000 -1.000000]
 [1.000000 0.000000 0.000000 1.000000 0.000000 1.000000]
 [0.000000 0.000000 0.000000 0.000000 0.000000 0.000000]
 [1.000000 1.000000 0.000000 0.000000 -1.000000 -1.000000]]


In [204]:
from scipy.fftpack import dct as scidct

# ====================================================================
#  DCT (Discrete Cosine Transform)
# ====================================================================

# generate square dct matrix
#     how to use: generate n-by-n matrix M. then, if you have a signal w, then:
#                 dct(w) = M * w
#     where w must be n-by-1
#
#     backed by scipy
def generate_dct_mat(n, norm = 'ortho'):
    return (scidct(np.eye(n), norm = norm))

# DCT matrix is precomputed at start of program
dctMat = generate_dct_mat(WINDOW_SIZE)
th_dctMat = K.variable(dctMat)

# given a (symbolic Theano) array of size M x WINDOW_SIZE
#     this returns an array M x WINDOW_SIZE where every one of the M samples has been independently
#     filtered by the DCT
def theano_dct(x, dctMat = None):
    global th_dctMat
    
    if (dctMat is None):
        dctMat = th_dctMat
        
    # reshape x into 2D array, and perform appropriate matrix operation
    reshaped_x = x.reshape((1, x.shape[0], x.shape[1]))

    result = T.tensordot(dctMat, reshaped_x, [[0], [2]])
    result = result.reshape((result.shape[0], result.shape[2])).T

    return result

# M x WINDOW_SIZE x 1 => M x WINDOW_SIZE x 1
def theano_batch_dct(x):
    result = theano_dct(x)
    reshaped_result = result.reshape((result.shape[0], result.shape[1], 1))

    return reshaped_result

In [205]:
class PhaseShift1D(Layer):
    """ PhaseShift1D
    Takes vector of size: B x S x nF
    And returns vector: B x nS x F
    """
    def __init__(self, n, **kwargs):
        super(PhaseShift1D, self).__init__(**kwargs)
        self.n = n
    
    def build(self, input_shape):
        # no trainable parameters
        self.trainable_weights = []
    
    def call(self, x, mask=None):
        r = T.reshape(x, (x.shape[0], x.shape[1], x.shape[2] / self.n, self.n))
        r = T.transpose(r, (0, 1, 3, 2))
        r = T.reshape(r, (x.shape[0], x.shape[1] * self.n, x.shape[2] / self.n))
        return r

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[1] * self.n, input_shape[2] / self.n)
    
    def get_config(self):
        config = {'n' : self.n}
        base_config = super(PhaseShift1D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [206]:
# lambda to compute MSE between 2 vectors
def mse_lambda(vects):
    x, y = vects
    return K.mean(K.square(x - y))

# freeze weights for stacked training
def make_trainable(net, val):
    net.trainable = val
    for l in net.layers:
        l.trainable = val

# we generate a new optimizer of the same kind for every model
# we train
def opti():
    return Adam()

input_dim = (WINDOW_SIZE, 1)
input_size = np.prod(input_dim)
bottleneck_size = 128

def encoder_residual_block(output_dim = 64, filt_size = 5, subsample = True):
    def f(input):
        stride = 1
        if (subsample):
            stride = 2
        
        conv1 = Convolution1D(output_dim, filt_size, border_mode = 'same',
                          init = 'he_uniform', activation = 'linear',
                          subsample_length = stride, bias = True)(input)
        #if (subsample):
        #    conv1 = MaxPooling1D(2)(conv1)
        conv1 = SpatialDropout1D(0.1)(conv1)
        act1 = LeakyReLU(0.3)(conv1)
        
        conv2 = Convolution1D(output_dim, filt_size, border_mode = 'same',
                          init = 'he_uniform', activation = 'linear',
                          bias = True)(act1)
        conv2 = SpatialDropout1D(0.1)(conv2)
        
        residual = conv2
        shortcut = Convolution1D(output_dim, 1, border_mode = 'same',
                                 init = 'he_uniform', activation = 'linear',
                                 subsample_length = stride, bias = True)(input)
        #if (subsample):
        #    shortcut = MaxPooling1D(2)(shortcut)
        
        m = merge([shortcut, residual], mode = 'sum')
        return LeakyReLU(0.3)(m)
    
    return f

'''
def decoder_residual_block(output_dim = 64, filt_size = 5, upsample = True):
    def f(input):
        x = input
        if (upsample):
            x = UpSampling1D(2)(x)

        conv1 = Convolution1D(output_dim, filt_size, border_mode = 'same',
                          init = 'he_uniform', activation = 'linear',
                          bias = True)(x)
        conv1 = SpatialDropout1D(0.1)(conv1)
        act1 = LeakyReLU(0.2)(conv1)
        
        conv2 = Convolution1D(output_dim, filt_size, border_mode = 'same',
                          init = 'he_uniform', activation = 'linear',
                          bias = True)(act1)
        conv2 = SpatialDropout1D(0.1)(conv2)
        
        residual = conv2
        shortcut = Convolution1D(output_dim, 1, border_mode = 'same',
                                 init = 'he_uniform', activation = 'linear',
                                 bias = True)(x)
        
        m = merge([shortcut, residual], mode = 'sum')
        return LeakyReLU(0.2)(m)
    
    return f

'''
def decoder_residual_block(output_dim = 64, filt_size = 5, upsample = True):
    def f(input):
        nfilts = output_dim
        if (upsample):
            nfilts = output_dim * 2

        conv1 = Convolution1D(nfilts, filt_size, border_mode = 'same',
                          init = 'he_uniform', activation = 'linear',
                          bias = True)(input)
        act1 = LeakyReLU(0.3)(conv1)
        
        conv2 = Convolution1D(nfilts, filt_size, border_mode = 'same',
                          init = 'he_uniform', activation = 'linear',
                          bias = True)(act1)
        
        residual = conv2
        shortcut = Convolution1D(nfilts, 1, border_mode = 'same',
                                 init = 'he_uniform', activation = 'linear',
                                 bias = True)(input)
        
        m = merge([shortcut, residual], mode = 'sum')
        r = LeakyReLU(0.3)(m)
        if (upsample):
            return PhaseShift1D(2)(r)
        else:
            return r
    
    return f
#'''

def hard_tanh(x):
    return K.clip(x, -1.0, 1.0)

# ---------------------------------------------------------------------------
# autoencoder: takes an audio window, compresses it, and tries to reconstruct it
# ---------------------------------------------------------------------------
def autoencoder_structure(dim):
    enc_input = Input(shape = dim)
    
    # corrupt input slightly as a form of regularization
    #enc = GaussianDropout(0.05, input_shape = dim)(enc_input)

    # (512x1) => (256x48)
    enc = encoder_residual_block(48, 9, True)(enc_input)
    
    # (256x48) => (256x48)
    enc = encoder_residual_block(48, 9, False)(enc)

    # (256x48) => (128x48)
    enc = encoder_residual_block(48, 9, True)(enc)
    
    # (128x48) => (128x48)
    enc = encoder_residual_block(48, 9, False)(enc)

    # (128x48) => (128x48)
    enc = encoder_residual_block(48, 9, False)(enc)

    # (64x64) => (64)
    enc = Convolution1D(1, 9, border_mode = 'same',
                              init = 'he_uniform', activation = 'tanh',
                              bias = True)(enc)
    enc = Reshape((bottleneck_size,))(enc)
    enc = Lambda(lambda x : Quantize()(x))(enc)
    
    enc = Model(input = enc_input, output = enc)
    
    
    
    
    dec_input = Input(shape = (bottleneck_size,))
    
    dec = Reshape((128, 1,), input_shape = (bottleneck_size,))(dec_input)
    
    # (128x1) => (128x32)
    dec = decoder_residual_block(32, 9, False)(dec)
    
    # (128x32) => (128x32)
    dec = decoder_residual_block(32, 9, False)(dec)
    
    # (128x32) => (256x32)
    dec = decoder_residual_block(32, 9, True)(dec)
    
    # (256x32) => (256x32)
    dec = decoder_residual_block(32, 9, False)(dec)
    
    # (256x32) => (512x32)
    dec = decoder_residual_block(32, 9, True)(dec)
    
    # (512x32) => (512x32)
    #dec = decoder_residual_block(32, 9, False)(dec)

    # (512x32) => (512x1)
    dec = Convolution1D(1, 9, border_mode = 'same',
                              init = 'he_uniform', activation = 'tanh',
                              bias = True)(dec)
    
    dec = Model(input = dec_input, output = dec)
    
    return enc, dec

# ---------------------------------------------------------------------------
# discriminator: tries to differentiate between original and reconstructed samples
# ---------------------------------------------------------------------------
def discriminator_structure(dim):
    dsc = Sequential()
    
    dsc.add(Convolution1D(32, 5, border_mode='same', input_shape = dim,
                                    init = 'uniform',
                                    subsample_length = 2, activation = 'linear'))
    dsc.add(LeakyReLU(0.3))
    
    dsc.add(Convolution1D(32, 5, border_mode='same', input_shape = dim,
                                    init = 'uniform',
                                    activation = 'linear'))
    dsc.add(LeakyReLU(0.3))

    dsc.add(Convolution1D(32, 5, border_mode='valid', input_shape = dim,
                                    init = 'uniform',
                                    subsample_length = 2, activation = 'linear'))
    dsc.add(LeakyReLU(0.3))
    
    dsc.add(Convolution1D(32, 5, border_mode='same', input_shape = dim,
                                    init = 'uniform',
                                    activation = 'linear'))
    dsc.add(LeakyReLU(0.3))
    
    dsc.add(Convolution1D(32, 5, border_mode='valid', input_shape = dim,
                                    init = 'uniform',
                                    subsample_length = 2, activation = 'linear'))
    dsc.add(LeakyReLU(0.3))
    
    dsc.add(Flatten())
    
    dsc.add(Dense(48, activation = 'linear'))
    dsc.add(LeakyReLU(0.3))
    dsc.add(Dense(1, activation = 'sigmoid'))
    
    return dsc


# plain autoencoder
'''
plain_input = Input(shape = input_dim)
plain_enc, plain_dec = autoencoder_structure(input_dim)
plain_embedding = plain_enc(plain_input)
plain_reconstructed = plain_dec(plain_embedding)
plain_autoencoder = Model(input = [plain_input], output = [plain_reconstructed])
plain_autoencoder.compile(loss = 'mean_squared_error', optimizer = opti())
'''
# construct autoencoder to be used in adversarial training (AAC - Adversarial AutoenCoder)
# uhhhh... whoops i screwed up the acronym
aac_input = Input(shape = input_dim)
aac_enc, aac_dec = autoencoder_structure(input_dim)
aac_embedding = aac_enc(aac_input)
#aac_embedding_binarized = BinarizeLayer()(aac_embedding)
aac_reconstructed = aac_dec(aac_embedding)

aac_autoencoder = Model(input = [aac_input], output = [aac_reconstructed])
aac_autoencoder.compile(loss = 'mean_squared_error', optimizer = opti())



# construct discriminator 1: fft-based
'''
fft_transform = Lambda(theano_batch_dft,
                       output_shape = lambda x : (x[0], x[1] * 2, 1))

fftdsc_input_dim = (WINDOW_SIZE * 2, 1)
fftdsc_input = Input(shape = input_dim)
fftdsc_struct = discriminator_structure(fftdsc_input_dim)

fftdsc_label = fftdsc_struct(fft_transform(fftdsc_input))
aac_fft_label = fftdsc_struct(fft_transform(aac_reconstructed))
'''


# construct discriminator 2: regular
regdsc_input_dim = (WINDOW_SIZE, 1)
regdsc_input = Input(shape = input_dim)
regdsc_struct = discriminator_structure(regdsc_input_dim)

regdsc_label = regdsc_struct(regdsc_input)
aac_reg_label = regdsc_struct(aac_reconstructed)


'''
nlvls = 8
haar_transform = Lambda(lambda x: haar_multilevel(x, nlvls, True),
                        output_shape = lambda x : (x[0], x[1], nlvls + 1))
regdsc_input_dim = (WINDOW_SIZE, nlvls + 1)
regdsc_input = Input(shape = input_dim)
regdsc_struct = discriminator_structure(regdsc_input_dim)

regdsc_label = regdsc_struct(haar_transform(regdsc_input))
aac_reg_label = regdsc_struct(haar_transform(aac_reconstructed))
'''

'''
dct_transform = Lambda(theano_batch_dct,
                       output_shape = lambda x : (x[0], x[1], 1))
regdsc_input_dim = (WINDOW_SIZE, 1)
regdsc_input = Input(shape = input_dim)
regdsc_struct = discriminator_structure(regdsc_input_dim)

regdsc_label = regdsc_struct(dct_transform(regdsc_input))
aac_reg_label = regdsc_struct(dct_transform(aac_reconstructed))
'''


def code_binary_loss(placeholder, code):
    #minim = K.minimum(1.0 - K.abs(code), K.abs(code))
    return 1.0 - K.abs(code)

def code_variance_loss(placeholder, code):
    variance = T.var(code + 0.0001, axis = 1)
    inverse_var = 1.0 / (variance + 0.0001)
    return inverse_var


def upper_dct_loss(y_true, y_pred):
    dct_true = theano_batch_dct(y_true)
    dct_pred = theano_batch_dct(y_pred)
    cutoff_idx = WINDOW_SIZE / 4
    
    # compute MSE in frequency domain
    error = T.mean(T.sqr(dct_pred[:, cutoff_idx:] - dct_true[:, cutoff_idx:]), axis = 1)
    return error


# compile model
loss_weights = [100.0, 2.0]
n_discrim = 1
n_code = 0
lmult = len(loss_weights) - n_discrim - n_code




make_trainable(aac_autoencoder, False)

aac_discrim_reg = Model(input = [regdsc_input], output = [regdsc_label])
aac_discrim_reg.compile(loss = ['binary_crossentropy'], optimizer = opti())
aac_discrim_reg.summary()

aac_autoencoder.summary()

make_trainable(aac_discrim_reg, False)
make_trainable(aac_autoencoder, True)
model = Model(input = [aac_input], output = [aac_reconstructed] * lmult + \
                                            [aac_reg_label] + \
                                            [aac_embedding] * n_code)
model.compile(loss = ['mean_squared_error', \
                      'binary_crossentropy', \
                     ],
              loss_weights = loss_weights,
              optimizer = opti())
model.summary()

X_train = np.copy(processedWindows)
ntrain = X_train.shape[0]



____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_36 (InputLayer)            (None, 512, 1)        0                                            
____________________________________________________________________________________________________
sequential_8 (Sequential)        (None, 1)             114593      input_36[0][0]                   
Total params: 114593
____________________________________________________________________________________________________
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_33 (InputLayer)            (None, 512, 1)        0                                            
______________________________________________________________________

In [207]:
def test_discriminator(discriminator, autoencoder, X, y, verbose = True):
    # verify discriminator was trained properly
    y_hat = discriminator.predict(X)
    y_hat[y_hat >= 0.5] = 1
    y_hat[y_hat < 0.5] = 0

    n_total = y.shape[0]
    n_correct = np.sum(np.ravel(y_hat) == y)

    acc = n_correct * 100.0 / n_total
    if (verbose):
        print "Discriminator accuracy: %0.02f pct (%d of %d) right"%(acc, n_correct, n_total)
    return acc

In [208]:
def autoencoderTest(waveFilename, prefix, autoencoder, verbose = True):
    [rate, data] = sciwav.read(waveFilename)
    processedWave, wparams = preprocessWaveform(data)
    windows = extractWindows(processedWave, STEP_SIZE, OVERLAP_SIZE)
    

    # first, write desired reconstruction
    transformed, tparams = preprocessWindows(windows)
    if (verbose):
        print transformed.shape
    
    desired = unpreprocessWindows(transformed, tparams)
    desired = reconstructFromWindows(desired, OVERLAP_SIZE, OVERLAP_FUNC)
    desired = unpreprocessWaveform(desired, wparams)
    #sciwav.write(prefix + "_res_desired.wav", rate, desired.astype(np.int16))
    
    # then, run NN on transformed windows
    transformed, tparams = preprocessWindows(windows)
    
    transformed = np.reshape(transformed, (transformed.shape[0], WINDOW_SIZE, 1))
    autoencOutput = autoencoder.predict(transformed, batch_size = BATCH_SIZE, verbose = (1 if verbose else 0))
    if (verbose):
        print autoencOutput.shape
    autoencOutput = np.reshape(autoencOutput, (autoencOutput.shape[0], WINDOW_SIZE))
    
    if (verbose):
        print autoencOutput.shape
    recons = unpreprocessWindows(autoencOutput, tparams)
    recons = reconstructFromWindows(recons, OVERLAP_SIZE, OVERLAP_FUNC)
    recons = unpreprocessWaveform(recons, wparams)
    
    sciwav.write(prefix + "_output.wav", rate, recons.astype(np.int16))
    
    metrics = [
        np.max(desired),
        np.min(desired),
        np.max(recons),
        np.min(recons),
        mse(recons, desired),
        avgErr(recons, desired)
    ]
    
    if (verbose):
        print "Max/min desired:", metrics[0], metrics[1]
        print "Max/min recons: ", metrics[2], metrics[3]
        print waveFilename, " mse: ", metrics[4]
        print waveFilename, " avg err: ", metrics[5]
        
    return metrics

In [209]:
# interleave two numpy arrays of the same size along the first axis
def interleave(a, b):    
    r = np.empty(a.shape)
    r = np.repeat(r, 2, axis = 0)
    
    r[::2] = a
    r[1::2] = b
    return r

In [210]:
np.set_printoptions(formatter={'float_kind':'{:4f}'.format})

BATCH_SIZE = 128
NUM_BATCHES = ntrain / BATCH_SIZE
NUM_EPOCHS = 10

lead = "    "
d_loss = 0.0
a_losses = []
d_acc = 0.0
discrim_train_y = np.concatenate((np.ones(ntrain), np.zeros(ntrain)))


for epoch in range(NUM_EPOCHS):
    print "Epoch " + str(epoch + 1) + ":"

    # present batches randomly each epoch
    lis = range(0, ntrain, BATCH_SIZE)
    random.shuffle(lis)
    
    # keep track of start time and current batch #
    i = 0
    startTime = time.time()
    for idx in lis:
        batch = X_train[idx:idx+BATCH_SIZE, :,  :]
        nbatch = batch.shape[0]
        
        a_losses = ["autoencoder not training"]
        d_loss = "discriminator not training"
        
        # train autoencoder, if discriminator accuracy is greater than 70%
        if (epoch >= 0):
            make_trainable(aac_autoencoder, True)
            make_trainable(aac_discrim_reg, False)
            
            a_y = [batch] * lmult + \
                  [np.ones(nbatch)] * n_discrim + \
                  [np.zeros((nbatch, bottleneck_size))] * n_code
            a_losses = model.train_on_batch(batch, a_y)
        
        # train discriminator(s) on what the autoencoder now generates
        generated = aac_autoencoder.predict(batch)
        discrim_batch_X = interleave(batch, generated)
        discrim_batch_y = interleave(np.ones(nbatch), np.zeros(nbatch))
        
        make_trainable(aac_autoencoder, False)
        make_trainable(aac_discrim_reg, True)
        d_loss = aac_discrim_reg.train_on_batch(discrim_batch_X, discrim_batch_y)
        
        if (epoch < 0 and d_loss < 0.2):
            print ""
            print lead + "Terminating epoch early (don't wanna overfit!)"
            break
        
        # print statistics every 10 batches so we know stuff is still going down
        if (i % 10 == 0):
            printStr = "        \r" + lead + str(i * BATCH_SIZE) + ": " + str(d_loss) + " "
            print printStr,
            
            loss_arr = np.asarray(a_losses)
            print loss_arr,
            
            if (len(loss_arr) > 1):
                for i in xrange(0, len(loss_weights)):
                    loss_arr[i + 1] *= loss_weights[i]
                print loss_arr,
            
        i += 1
    print ""
    
    # print elapsed time for epoch
    elapsed = time.time() - startTime
    print lead + "Total time for epoch: " + str(elapsed) + "s"
    
    
    # ---------------------------------------------------------
    # evaluate discriminator on random samples every epoch
    # ---------------------------------------------------------
    startTime = time.time()
    print lead + "----------------"

    NUM = 200
    rows = np.random.randint(X_train.shape[0], size = NUM)
    generated = aac_autoencoder.predict(X_train[rows, :], verbose = 0)
    d_X = np.concatenate((X_train[rows, :], generated))
    d_y = np.concatenate((np.ones(NUM), np.zeros(NUM)))
    d_acc = test_discriminator(aac_discrim_reg, aac_autoencoder,
                               d_X, d_y, verbose = False)

    print lead + "Evaluated the discriminator: " + str(d_acc) + "% d_acc"
    elapsed = time.time() - startTime
    print lead + "Total time for evaluation: " + str(elapsed) + "s"
    
    
    # ---------------------------------------------------------
    # evaluate autoencoder on real data every epoch
    # ---------------------------------------------------------
    startTime = time.time()
    print lead + "----------------"
    
    print lead + "Evaluating autoencoder..."
    metrics = autoencoderTest("./SA1.WAV", "SA1_res_reg_train_epoch" + str(epoch+1), aac_autoencoder, verbose = False)
    
    print lead + "Max/min desired:", metrics[0], metrics[1]
    print lead + "Max/min recons: ", metrics[2], metrics[3]
    print lead + "MSE:     ", metrics[4]
    print lead + "Avg err: ", metrics[5]
    elapsed = time.time() - startTime
    print lead + "Total time for evaluation: " + str(elapsed) + "s"
    
    print ""

Epoch 1:
    1280: 0.595627307892  [4.599453 0.029016 0.848931] [4.599453 2.901591 1.697861] 
    Total time for epoch: 750.664128065s
    ----------------
    Evaluated the discriminator: 61.75% d_acc
    Total time for evaluation: 1.05848002434s
    ----------------
    Evaluating autoencoder...
    Max/min desired: 4899.0 -4013.0
    Max/min recons:  4699.44 -3884.86
    MSE:      28102.0
    Avg err:  84.6243
    Total time for evaluation: 0.116758108139s

Epoch 2:
    1280: 0.647861242294  [3.860081 0.023943 0.732881] [3.860081 2.394320 1.465761] 
    Total time for epoch: 713.930218935s
    ----------------
    Evaluated the discriminator: 63.5% d_acc
    Total time for evaluation: 0.260028839111s
    ----------------
    Evaluating autoencoder...
    Max/min desired: 4899.0 -4013.0
    Max/min recons:  4432.8 -3018.13
    MSE:      21910.1
    Avg err:  77.7952
    Total time for evaluation: 0.116559028625s

Epoch 3:
    1280: 0.650050580502  [4.734525 0.028285 0.953011] [4.7345

KeyboardInterrupt: 

In [211]:
model.save('model_reg_adversary.h5')
aac_autoencoder.save('auto_reg_adversary.h5')
aac_discrim_reg.save('discrim_reg_adversary.h5')




In [229]:
'''
from keras.models import load_model

objs = {'PhaseShift1D' : PhaseShift1D,
        'BinarizeLayer' : BinarizeLayer,
        'code_binary_loss' : code_binary_loss,
        'code_maximize_variance' : code_maximize_variance}

model = load_model('model_reg_adversary.h5', objs)
aac_autoencoder = load_model('auto_reg_adversary.h5', objs)
aac_discrim_reg = load_model('discrim_reg_adversary.h5', objs)
'''

"\nfrom keras.models import load_model\n\nobjs = {'PhaseShift1D' : PhaseShift1D,\n        'BinarizeLayer' : BinarizeLayer,\n        'code_binary_loss' : code_binary_loss,\n        'code_maximize_variance' : code_maximize_variance}\n\nmodel = load_model('model_reg_adversary.h5', objs)\naac_autoencoder = load_model('auto_reg_adversary.h5', objs)\naac_discrim_reg = load_model('discrim_reg_adversary.h5', objs)\n"

In [213]:
enc = model.layers[1].layers
dec = model.layers[2].layers


In [214]:
NUM = 400
rows = np.random.randint(X_train.shape[0], size = NUM)
generated = aac_autoencoder.predict(X_train[rows, :], verbose = 0)
d_X = np.concatenate((X_train[rows, :], generated))
d_y = np.concatenate((np.ones(NUM), np.zeros(NUM)))
d_acc = test_discriminator(aac_discrim_reg, aac_autoencoder,
                           d_X, d_y, verbose = False)

print lead + "Evaluated the discriminator: " + str(d_acc) + "% d_acc"

    Evaluated the discriminator: 63.875% d_acc


In [215]:
autoencoderTest("./SA1.WAV", "SA1_aac_reg_", aac_autoencoder)
autoencoderTest("./SX383.WAV", "SX383_aac_reg_", aac_autoencoder)
autoencoderTest("./fiveYears.wav", "fy_aac_reg_", aac_autoencoder)

(112, 512)
(112, 512, 1)
(112, 512)
Max/min desired: 4899.0 -4013.0
Max/min recons:  4676.88 -3760.37
./SA1.WAV  mse:  20706.7
./SA1.WAV  avg err:  77.9277
(93, 512)
(93, 512, 1)
(93, 512)
Max/min desired: 2961.0 -3057.0
Max/min recons:  2939.12 -2749.06
./SX383.WAV  mse:  12240.2
./SX383.WAV  avg err:  63.4815
(181, 512)
(181, 512, 1)
(181, 512)
Max/min desired: 24636.0 -20122.0
Max/min recons:  24965.3 -17999.2
./fiveYears.wav  mse:  5.50814e+06
./fiveYears.wav  avg err:  1735.32


[24636.0, -20122.0, 24965.309, -17999.203, 5508137.0, 1735.325]

In [216]:
[rate, data] = sciwav.read("./SA1.WAV")
processedWave, wparams = preprocessWaveform(data)
windows = extractWindows(processedWave, STEP_SIZE, OVERLAP_SIZE)

transformed, tparams = preprocessWindows(windows)

transformed = np.reshape(transformed, (transformed.shape[0], WINDOW_SIZE, 1))
embed = aac_enc.predict(transformed, batch_size = BATCH_SIZE, verbose = 1)



In [217]:
idx = 45

e = np.sign(embed)

for i in xrange(0, embed.shape[0]):
    num_pos = np.count_nonzero(e[idx] > 0)
    num_neg = e[idx].shape[0] - num_pos
    
print embed[idx]

[1.000000 0.000000 1.000000 1.000000 1.000000 0.000000 -1.000000 -1.000000
 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 1.000000 0.000000
 0.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 0.000000
 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 -1.000000
 -1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000
 0.000000 -1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000
 -1.000000 -1.000000 -1.000000 -1.000000 0.000000 1.000000 1.000000
 1.000000 0.000000 -1.000000 -1.000000 -1.000000 0.000000 0.000000 0.000000
 1.000000 1.000000 1.000000 0.000000 -1.000000 -1.000000 -1.000000 0.000000
 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000
 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000
 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000
 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
 0.000000 0.000000 0.000000 0.000000 0.000000 0.0000

In [218]:
all_embed = aac_enc.predict(X_train, batch_size = BATCH_SIZE, verbose = 1)



KeyboardInterrupt: 

In [134]:
from pyclustering.cluster.kmedoids import kmedoids

# generate 300 random points
data = np.random.normal(0.0, 2.0, (300, 2))

clust = kmedoids(data, [0, 1])
clust.process()
clust.get_medoids()

[array([-0.636428, -1.654416]), array([0.115110, 1.393575])]

In [135]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.neighbors import KDTree, NearestNeighbors

skip = 1
codebooks = []
for i in xrange(0, all_embed.shape[1], skip):
    #km = MiniBatchKMeans(n_clusters = 16, batch_size = 128, verbose = 0)
    #km.fit(all_embed[:, i:i+skip])
    #kmeans.append(km)
    dat = all_embed[:, i:i+skip]
    rnd = dat[np.random.choice(dat.shape[0], 500, replace=False), :]
    init = np.random.choice(500, 4, replace = False)
    
    clust = kmedoids(rnd, init)
    clust.process()
    medoids = clust.get_medoids()
    
    nn = KDTree(medoids)
    codebooks.append(nn)
    print i


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127


In [136]:
print codebooks[0].data.shape
print np.asarray(codebooks[0].data)

quantized_embed = None

for i in xrange(0, embed.shape[1], skip):
    nn = codebooks[i / skip]
    
    part = embed[:, i:i+skip]
    cluster_idxs = nn.query(part, 1, False)
    cluster_idxs = list(cluster_idxs.flatten())
    
    quantized_part = [nn.data[j] for j in cluster_idxs]
    quantized_part = np.array(quantized_part)
    
    if (quantized_embed is None):
        quantized_embed = quantized_part
    else:
        quantized_embed = np.concatenate((quantized_embed, quantized_part), axis = 1)
    
print quantized_embed.shape

(1, 1)
[[0.989762]]
(112, 128)


In [137]:
autoencOutput = aac_dec.predict(quantized_embed, batch_size = BATCH_SIZE, verbose = 1)
print autoencOutput.shape
autoencOutput = np.reshape(autoencOutput, (autoencOutput.shape[0], WINDOW_SIZE))

print autoencOutput.shape
recons = unpreprocessWindows(autoencOutput, tparams)

wav = reconstructFromWindows(recons, OVERLAP_SIZE, OVERLAP_FUNC)
wav = unpreprocessWaveform(wav, wparams)
    
sciwav.write("tst_output_reg_quant.wav", rate, wav.astype(np.int16))

(112, 512, 1)
(112, 512)


In [138]:
quantized_embed = np.round(embed * 8.0) / 8.0

autoencOutput = aac_dec.predict(quantized_embed, batch_size = BATCH_SIZE, verbose = 1)
print autoencOutput.shape
autoencOutput = np.reshape(autoencOutput, (autoencOutput.shape[0], WINDOW_SIZE))

print autoencOutput.shape
recons = unpreprocessWindows(autoencOutput, tparams)

wav = reconstructFromWindows(recons, OVERLAP_SIZE, OVERLAP_FUNC)
wav = unpreprocessWaveform(wav, wparams)
    
sciwav.write("tst_output_reg.wav", rate, wav.astype(np.int16))

(112, 512, 1)
(112, 512)
