In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import random
import h5py
from keras.datasets import cifar10
from keras.models import *
from keras.layers import *
from keras.layers.core import *
from keras.layers.normalization import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
from keras.regularizers import *
import theano.tensor as T
import theano
from theano.tensor.shared_randomstreams import RandomStreams
from sklearn import metrics
from skimage.measure import compare_ssim
from scipy.misc import toimage
from sklearn.preprocessing import *

import os
import random
import time
from skimage import io, exposure, feature, color, transform
import matplotlib
import matplotlib.pyplot as plt
import glob

import scipy.signal as sig
import operator
import math

# for reproducibility
np.random.seed(1337) 
random.seed(1337)

Using Theano backend.
Using gpu device 1: GeForce GTX 690 (CNMeM is disabled, cuDNN 5105)


In [2]:
# parameters for sliding window, and window function (Hann)
STEP_SIZE = 480
OVERLAP_SIZE = 32
WINDOW_SIZE = STEP_SIZE + OVERLAP_SIZE
OVERLAP_FUNC = sig.hann(OVERLAP_SIZE * 2)

# directory that contains TIMIT files
TIMIT_DIR = "/home/sri/Desktop/timit"

# directory that contains .wav files to process
NUM_EPOCHS = 1
BATCH_SIZE = 64

# randomly shuffle data before partitioning into training/validation?
RANDOM_SHUFFLE = True

# sample rate of input file (used in MFCC calculation)
SAMPLE_RATE = 16000

In [3]:
from load_TIMIT import *
from windowingFunctions import *
from utility import *

In [4]:
# read in 100 WAVs from TIMIT training set
rawWaveforms = load_TIMIT_train(TIMIT_DIR, 1000)

Reading in .wav files...
0: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SA1.WAV1: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX339.WAV2: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SI1059.WAV3: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SI1689.WAV4: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX429.WAV5: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX69.WAV6: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX159.WAV7: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SI2319.WAV8: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SA2.WAV9: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX249.WAV10: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SX221.WAV11: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SA1.WAV12: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SX41.WAV13: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SI1751.WAV14: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SI1725.WAV15: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SX401.WAV16: /home/sri/Desktop/timit/

In [22]:
# waveform preprocessing
def preprocessWaveform(waveform):
    mn = np.min(waveform)
    mx = np.max(waveform)
    
    maxabs = max(abs(mn), abs(mx))
    scl = 32768.0 / maxabs    
    
    processed = waveform * scl
    
    return processed, (scl,)
    
def unpreprocessWaveform(waveform, params):
    unprocessed = np.copy(waveform)
    
    #unprocessed = (waveform + 32768.0) / 65536.0
    #unprocessed = (unprocessed * (mx - mn)) + mn
    unprocessed = waveform / params[0]
    return unprocessed



# window preprocessing
def preprocessWindows(windows):
    # scale window between -1 and 1
    processed = np.copy(windows)
    processed /= 32768.0
    
    return processed, ()

def unpreprocessWindows(windows, params):
    # scale window from [-1, 1] to [-32768, 32768]
    unprocessed *= 32768.0
    return unprocessed

In [23]:
# waveform preprocessing
processedWaveforms = np.copy(rawWaveforms)

# we maximize the volume of every waveform
for i in xrange(0, len(processedWaveforms)):
    processedWaveforms[i], _ = preprocessWaveform(processedWaveforms[i])

In [24]:
# extract windows
rawWindows = extractWindowsMultiple(processedWaveforms, STEP_SIZE, OVERLAP_SIZE,
                                    collapse = True)

# randomly shuffle data
if (RANDOM_SHUFFLE):
    rawWindows = np.random.permutation(rawWindows)

print "Raw windows shape: ", rawWindows.shape
print "Max: ", np.amax(rawWindows)
print "Min: ", np.amin(rawWindows)

Raw windows shape:  (101135, 512)
Max:  32768.0
Min:  -32768.0


In [25]:
# data augmentation goes here, at some point
augWindows = np.copy(rawWindows)

print "Aug windows shape: ", augWindows.shape

Aug windows shape:  (101135, 512)


In [26]:
processedWindows, pwParams = preprocessWindows(augWindows)

In [27]:
# reshape into vector form
processedWindows = np.reshape(processedWindows, (processedWindows.shape[0], WINDOW_SIZE, 1))

In [28]:
print processedWindows.shape

print np.mean(processedWindows, axis=None)
print np.std(processedWindows, axis=None)
print np.min(processedWindows, axis = None)
print np.max(processedWindows, axis = None)

(101135, 512, 1)
9.77606e-07
0.0997659
-1.0
1.0


In [29]:
# operations for binarization layer (THEANO ONLY)

class Binarize(T.Op):
    # properties attribute
    __props__ = ()
    
    def __init__(self):
        super(Binarize, self).__init__()
        
    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        x = T.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])
    
    def perform(self, node, inputs, output_storage):
        x, = inputs
        z, = output_storage
        
        # TODO: learn threshold per parameter?
        z[0] = np.copy(x)
        z[0][z[0] < 0] = -1
        z[0][z[0] >= 0] = 1
    
    def grad(self, input, output_gradients):
        # pass through gradients unchanged
        # (i don't think there's a mathematical justification for this?)
        return [output_gradients[0]]
        
    def infer_shape(self, node, i0_shapes):
        # output shape is same as input shape
        return i0_shapes

    
class BinarizeLayer(Layer):
    """ Binarizes input 
    <feedforward> binarizes output of tanh to -1 and 1
    <backward> returns delta unchanged
    """
    def __init__(self, **kwargs):
        super(BinarizeLayer, self).__init__(**kwargs)
        self.op = Binarize()
    
    def build(self, input_shape):
        # no trainable parameters
        self.trainable_weights = []
    
    def call(self, x, mask=None):
        return self.op(x)

    #def get_output_shape_for(self, input_shape):
    #    return input_shape
    
    def get_config(self):
        config = {"name": self.__class__.__name__}
        base_config = super(BinarizeLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [30]:
# verify Binarize op works
x = T.matrix()
f = theano.function([x], Binarize()(x))
inp = np.random.uniform(high = 1, low = -1, size = (6, 6)).astype('float32')
out = f(inp)

print(inp)
print(out)

[[-0.869698 0.257294 0.316507 0.147893 0.970888 -0.417560]
 [-0.107174 0.022471 0.502539 0.983777 0.514540 0.311905]
 [-0.162142 0.274707 0.984470 0.712251 -0.171646 -0.755081]
 [0.361122 -0.705993 -0.830186 -0.887585 0.643208 0.032562]
 [-0.020691 -0.900804 -0.963156 0.919230 -0.351911 -0.974039]
 [-0.339275 -0.842296 0.299581 -0.810932 0.196863 -0.626178]]
[[-1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000]
 [-1.000000 1.000000 1.000000 1.000000 1.000000 1.000000]
 [-1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000]
 [1.000000 -1.000000 -1.000000 -1.000000 1.000000 1.000000]
 [-1.000000 -1.000000 -1.000000 1.000000 -1.000000 -1.000000]
 [-1.000000 -1.000000 1.000000 -1.000000 1.000000 -1.000000]]


In [31]:
from scipy.fftpack import dct as scidct

# ====================================================================
#  DCT (Discrete Cosine Transform)
# ====================================================================

# generate square dct matrix
#     how to use: generate n-by-n matrix M. then, if you have a signal w, then:
#                 dct(w) = M * w
#     where w must be n-by-1
#
#     backed by scipy
def generate_dct_mat(n, norm = 'ortho'):
    return (scidct(np.eye(n), norm = norm))

# DCT matrix is precomputed at start of program
dctMat = generate_dct_mat(WINDOW_SIZE)
th_dctMat = theano.shared(dctMat)

# given a (symbolic Theano) array of size M x WINDOW_SIZE
#     this returns an array M x WINDOW_SIZE where every one of the M samples has been independently
#     filtered by the DCT
def theano_dct(x, dctMat = None):
    global th_dctMat
    
    if (dctMat is None):
        dctMat = th_dctMat
        
    # reshape x into 2D array, and perform appropriate matrix operation
    reshaped_x = x.reshape((1, x.shape[0], x.shape[1]))

    result = T.tensordot(dctMat, reshaped_x, [[0], [2]])
    result = result.reshape((result.shape[0], result.shape[2])).T

    return result


# ====================================================================
#  DFT (Discrete Fourier Transform)
# ====================================================================

# generate square dft matrix (similar to how we generate the DFT one)
#     note that this matrix will have real and imaginary components
def generate_dft_mat(n):
    return (np.fft.fft(np.eye(n)))

# we compute both the real and imaginary part of the FFT separately, at program start
dftMat = generate_dft_mat(WINDOW_SIZE)

th_dftMat_imag = theano.shared(np.imag(dftMat))
th_dftMat_real = theano.shared(np.real(dftMat))

# given a (symbolic Theano) array of size M x WINDOW_SIZE
#     this returns an array M x WINDOW_SIZE where every one of the M samples has been replaced by
#     its DFT magnitude
def theano_dft_mag(x):
    global th_dftMat_imag
    global th_dftMat_real

    reshaped_x = x.reshape((1, x.shape[0], x.shape[1]))

    imag = T.tensordot(th_dftMat_imag, reshaped_x, [[0], [2]])
    imag = imag.reshape((imag.shape[0], imag.shape[2])).T

    real = T.tensordot(th_dftMat_real, reshaped_x, [[0], [2]])
    real = real.reshape((real.shape[0], real.shape[2])).T

    result = T.sqrt(T.sqr(real) + T.sqr(imag))

    return result

# given a (symbolic Theano) array of size M x WINDOW_SIZE
#     this returns an array M x (2 * WINDOW_SIZE) where coefficients are alternating real and imaginary
#     FFT coeffs
def theano_dft(x):
    global th_dftMat_imag
    global th_dftMat_real

    reshaped_x = x.reshape((1, x.shape[0], x.shape[1]))

    imag = T.tensordot(th_dftMat_imag, reshaped_x, [[0], [2]])
    imag = imag.reshape((imag.shape[0], imag.shape[2])).T

    real = T.tensordot(th_dftMat_real, reshaped_x, [[0], [2]])
    real = real.reshape((real.shape[0], real.shape[2])).T

    result = T.concatenate([imag, real], axis=1)

    return result


In [32]:
# based on a combination of this article:
#     http://practicalcryptography.com/miscellaneous/machine-learning/...
#         guide-mel-frequency-cepstral-coefficients-mfccs/
# and some of this code:
#     http://stackoverflow.com/questions/5835568/...
#         how-to-get-mfcc-from-an-fft-on-a-signal

from numpy.fft import fft
from scipy.fftpack import idct, dct

NUM_MFCC_COEFFS = 32

def freqToMel(freq):
    return 1127.01048 * math.log(1 + freq / 700.0)

def melToFreq(mel):
    return 700 * (math.exp(mel / 1127.01048) - 1)

def melFilterBank(numCoeffs):
    minHz = 0
    maxHz = SAMPLE_RATE / 2            # by Nyquist theorem
    numFFTBins = WINDOW_SIZE

    maxMel = freqToMel(maxHz)
    minMel = freqToMel(minHz)

    # we need (numCoeffs + 2) points to create (numCoeffs) filterbanks
    melRange = np.array(xrange(numCoeffs + 2))
    melRange = melRange.astype(np.float32)

    # create (numCoeffs + 2) points evenly spaced between minMel and maxMel
    melCenterFilters = melRange * (maxMel - minMel) / (numCoeffs + 1) + minMel

    
    for i in xrange(numCoeffs + 2):
        # mel domain => frequency domain
        melCenterFilters[i] = melToFreq(melCenterFilters[i])

        # frequency domain => FFT bins
        melCenterFilters[i] = math.floor(numFFTBins * melCenterFilters[i] / maxHz)       

    # create matrix of filters (one row is one filter)
    filterMat = np.zeros((numCoeffs, numFFTBins))

    # generate filters (in frequency domain) and plot
    for i in range(1, numCoeffs + 1):
        filter = np.zeros(numFFTBins)
        
        startRange = melCenterFilters[i - 1]
        midRange   = melCenterFilters[i]
        endRange   = melCenterFilters[i + 1]

        for j in range(startRange, midRange):
            filter[j] = (float(j) - startRange) / (midRange - startRange)
        for j in range(midRange, endRange):
            filter[j] = 1 - ((float(j) - midRange) / (endRange - midRange))
        
        filterMat[i - 1] = filter
        #plt.plot(filter)
    #plt.show()

    # return filterbank as matrix
    return filterMat



# precomputed Mel filterbank
#     (transpose so we can do dot products with the power spectrum)
FILTERBANK = melFilterBank(NUM_MFCC_COEFFS).transpose()
th_filterbank = theano.shared(FILTERBANK)

# we also need to precompute another DCT matrix
th_mfcc_dct = theano.shared(generate_dct_mat(NUM_MFCC_COEFFS, None))



# given a (symbolic Theano) array of size M x WINDOW_SIZE
#     this returns an array M x NUM_MFCC_COEFFS where each window has been replaced
#     by its MFCC coeffs
def theano_mfcc(x):
    powerSpectrum = T.pow(theano_dft_mag(x), 2)
    
    filteredSpectrum = T.tensordot(powerSpectrum, th_filterbank, axes = 1)
    
    # replace places where filtered spectrum is zero
    filteredSpectrum = T.switch(T.eq(filteredSpectrum, 0), np.finfo(float).eps, \
                                filteredSpectrum)
    
    logSpectrum = T.log(filteredSpectrum)
    mfccs = theano_dct(logSpectrum, th_mfcc_dct)
    return mfccs
    
    





# compute MFCC for single window
def mfcc(signal):
    # preemphasize signal
    #preemphasizedSignal = np.copy(signal)
    #for i in xrange(1, len(signal)):
    #    preemphasizedSignal[i] = signal[i] - 0.9 * signal[i - 1]

    complexSpectrum = fft(signal)
    
    powerSpectrum = abs(complexSpectrum) ** 2
    filteredSpectrum = np.dot(powerSpectrum, FILTERBANK)

    # replace places where filtered spectrum is zero
    filteredSpectrum = np.where(filteredSpectrum == 0, np.finfo(float).eps, \
                                filteredSpectrum)

    # get log spectrum and take DCT to get MFCC
    logSpectrum = np.log(filteredSpectrum)
    mfcc = dct(logSpectrum, type=2)

    return mfcc

# compute MFCC for list of windows
def getMFCCsForWindows(windows):
    numWindows = windows.shape[0]

    mfccs = np.zeros((numWindows, NUM_MFCC_COEFFS))

    i = 0
    for window in windows:
        windowMFCC = mfcc(window)
        windowMFCC = np.reshape(np.array(windowMFCC), (1, len(windowMFCC)))

        mfccs[i, :] = windowMFCC

        i += 1
        if (VERBOSE):
            if (i % 500 == 0):
                print i, "/", numWindows
    
    return mfccs

In [33]:
# verification for MFCCs
for i in xrange(9, 11):
    w = np.copy(processedWindows[i])
    w = np.reshape(w, (512,))

    powerSpectrum = abs(fft(w)) ** 2
    filteredSpectrum = np.dot(powerSpectrum, FILTERBANK)
    
    logSpectrum = np.log(filteredSpectrum)
    mfcc = dct(logSpectrum, type=2)
    print mfcc


x = np.copy(processedWindows[9:11])
print theano_mfcc(x).eval()



[-40.888092 95.528517 80.803546 -44.299588 24.235716 -37.670973 1.076368
 -34.372020 7.667968 -28.145428 3.836016 -1.038048 -1.100543 -21.599239
 -6.775992 2.794840 14.299310 -6.146914 1.315986 -3.737948 4.881260
 -7.178720 5.685290 1.735394 1.576460 -2.189510 4.126370 3.696754 0.417652
 -0.883408 0.199139 0.120875]
[-21.321678 23.137954 38.696428 -10.110935 12.181227 -21.755430 7.749690
 9.311823 5.828579 -15.321066 -1.887780 1.538241 0.365915 2.562277 7.512800
 -3.456027 3.078921 -7.209214 2.330733 -6.579783 5.335410 2.946226
 -0.105487 -3.409612 -0.188387 -0.519416 -3.413348 3.765252 0.446035
 0.646355 -0.794289 1.009097]
[[-40.888092 95.528517 80.803546 -44.299588 24.235716 -37.670973 1.076368
  -34.372020 7.667968 -28.145428 3.836016 -1.038048 -1.100543 -21.599239
  -6.775992 2.794840 14.299310 -6.146914 1.315986 -3.737948 4.881260
  -7.178720 5.685290 1.735394 1.576460 -2.189510 4.126370 3.696754 0.417652
  -0.883408 0.199139 0.120875]
 [-21.321678 23.137954 38.696428 -10.110935 

In [34]:
# lambda to compute MSE between 2 vectors
def mse_lambda(vects):
    x, y = vects
    return K.mean(K.square(x - y))

# freeze weights for stacked training
def make_trainable(net, val):
    net.trainable = val
    for l in net.layers:
        l.trainable = val

# we generate a new optimizer of the same kind for every model
# we train
def opti():
    return Adam()

input_dim = (WINDOW_SIZE, 1)
input_size = np.prod(input_dim)
bottleneck_size = 320

# ---------------------------------------------------------------------------
# autoencoder: takes an audio window, compresses it, and tries to reconstruct it
# ---------------------------------------------------------------------------
def autoencoder_structure(dim):
    enc = Sequential()
    dec = Sequential()
    
    # based on architecture in this paper:
    #     http://arxiv.org/pdf/1602.02644.pdf
    # adapted to a 32x32 image instead of 64x64
    
    # dropout at input layer
    #enc.add(GaussianDropout(0.1, input_shape = dim))
    
    # (512x1) => (256x64) [9]    
    enc.add(Convolution1D(64, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          input_shape = dim, activation = 'relu',
                          init = 'uniform', bias = True))
    #enc.add(PReLU())
    enc.add(MaxPooling1D(2))
    
    # (256x64) => (128x64) [9]    
    enc.add(Convolution1D(64, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          activation = 'relu',
                          init = 'uniform', bias = True))
    #enc.add(PReLU())
    enc.add(MaxPooling1D(2))
    
    # (128x64) => (64x64) [9]
    enc.add(Convolution1D(64, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          activation = 'relu',
                          init = 'uniform', bias = True))
    #enc.add(PReLU())
    enc.add(MaxPooling1D(2))
    
    # (64x64) => (32x10) [9]
    enc.add(Convolution1D(10, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          activation = 'tanh',
                          init = 'uniform', bias = True))
    enc.add(MaxPooling1D(2))
    
    # binarize   
    enc.add(Reshape((bottleneck_size,)))
    enc.add(BinarizeLayer())
    
    dec.add(Reshape((32, 10,), input_shape = (bottleneck_size,)))
    
    # (32x10) => (64x64) [5x5]
    dec.add(UpSampling1D(2))
    dec.add(Convolution1D(64, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          init = 'uniform', activation = 'relu',
                          bias = True))
    
    # (64x64) => (128x64) [5x5]
    dec.add(UpSampling1D(2))
    dec.add(Convolution1D(64, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          init = 'uniform', activation = 'relu',
                          bias = True))
    #dec.add(PReLU())
    
    # (128x64) => (256x64) [5x5]
    dec.add(UpSampling1D(2))
    dec.add(Convolution1D(64, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          init = 'uniform', activation = 'relu',
                          bias = True))
    #dec.add(PReLU())
    
    # (256x64) => (512x64) [5x5]
    dec.add(UpSampling1D(2))
    dec.add(Convolution1D(64, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          init = 'uniform', activation = 'relu',
                          bias = True))
    #dec.add(PReLU())
    
    # (512x64) => (512x64) [5x5]
    dec.add(Convolution1D(64, 16, W_regularizer = l2(0.001), border_mode = 'same',
                          init = 'uniform', activation = 'relu',
                          bias = True))
    #dec.add(PReLU())
    
    # (512x64) => (512x1)
    dec.add(Convolution1D(1, 1, W_regularizer = l2(0.001), border_mode = 'same',
                          init = 'uniform', activation = 'tanh',
                          bias = True))

    return enc, dec


# ---------------------------------------------------------------------------
# discriminator: tries to differentiate between original and reconstructed samples
# ---------------------------------------------------------------------------
def discriminator_structure(dim):
    dsc = Sequential()
    dsc.add(Convolution1D(64, 12, border_mode='valid', input_shape = dim,
                                    W_regularizer = l2(0.001), init = 'uniform',
                                    activation = 'relu'))

    dsc.add(Convolution1D(64, 8, border_mode='same',
                                    W_regularizer = l2(0.001), init = 'uniform',
                                    activation = 'relu'))

    dsc.add(AtrousConvolution1D(64, 12, border_mode='valid',
                                    W_regularizer = l2(0.001), init = 'uniform',
                                    atrous_rate = 2, activation = 'relu'))

    dsc.add(AtrousConvolution1D(64, 8, border_mode='same',
                                    W_regularizer = l2(0.001), init = 'uniform',
                                    atrous_rate = 2, activation = 'relu'))
    
    dsc.add(AtrousConvolution1D(64, 12, border_mode='valid',
                                    W_regularizer = l2(0.001), init = 'uniform',
                                    atrous_rate = 4, activation = 'relu'))

    dsc.add(AtrousConvolution1D(64, 8, border_mode='same',
                                    W_regularizer = l2(0.001), init = 'uniform',
                                    atrous_rate = 8, activation = 'relu'))

    dsc.add(Flatten())
    #dsc.add(MinibatchDiscrimination())
    dsc.add(Dense(1, activation = 'sigmoid'))
    
    return dsc


# plain autoencoder
plain_input = Input(shape = input_dim)
plain_enc, plain_dec = autoencoder_structure(input_dim)
plain_embedding = plain_enc(plain_input)
plain_reconstructed = plain_dec(plain_embedding)
plain_autoencoder = Model(input = [plain_input], output = [plain_reconstructed])
plain_autoencoder.compile(loss = 'mean_squared_error', optimizer = opti())

# construct autoencoder to be used in adversarial training (AAC - Adversarial AutoenCoder)
# uhhhh... whoops i screwed up the acronym
aac_input = Input(shape = input_dim)
aac_enc, aac_dec = autoencoder_structure(input_dim)
aac_embedding = aac_enc(aac_input)
aac_reconstructed = aac_dec(aac_embedding)

aac_autoencoder = Model(input = [aac_input], output = [aac_reconstructed])
aac_autoencoder.compile(loss = 'mean_squared_error', optimizer = opti())

# construct discriminator
dsc_input = Input(shape = input_dim)
dsc_struct = discriminator_structure(input_dim)

# output: activation on original image (should be 1) or reconstruction (should be 0)
dsc_label = dsc_struct(dsc_input)

# also compute label of reconstruction, for autoencoder feedback
aac_recons_discrim = dsc_struct(aac_reconstructed)



def dft_loss(y_true, y_pred):
    # transfer signals from time to frequency domain
    dft_true = theano_dft(y_true)
    dft_pred = theano_dft(y_pred)
    
    # compute MSE in frequency domain
    error = T.sum(T.sqr(dft_true - dft_pred)) / dft_true.shape[0]
    #error = T.sqrt(T.mean(T.sqr(dft_pred - dft_true)))
    return error


def mfcc_loss(y_true, y_pred):
    # transfer signals from time to frequency domain
    mfcc_true = theano_mfcc(y_true)
    mfcc_pred = theano_mfcc(y_pred)
    
    # compute MSE in frequency domain
    error = T.mean(T.sqr(mfcc_true - mfcc_pred))
    return error




def sum_squared_error(y_true, y_pred):
    return T.sum(T.abs_(y_true - y_pred)) / y_true.shape[0]
    #return T.sqrt(T.mean(T.abs_(y_true - y_pred)))
    #return K.mean(K.abs(y_pred - y_true), axis=-1)
    
def rmse(y_true, y_pred):
    return T.sqrt(T.mean(T.sqr(y_true - y_pred)))

def ulaw_rmse(y_true, y_pred):
    # transformation from wavenet paper
    ulaw_true = K.sign(y_true) * K.log(1.0 + 255.0 * K.abs(y_true)) / K.log(1.0 + 255.0)
    ulaw_pred = K.sign(y_pred) * K.log(1.0 + 255.0 * K.abs(y_pred)) / K.log(1.0 + 255.0)
    
    return T.sqrt(T.mean(T.sqr(ulaw_true - ulaw_pred)))




# compile model
loss_weights = [350.0, 1.0 / 10.0, 5.0]



make_trainable(aac_autoencoder, False)
aac_discriminator = Model(input = [dsc_input], output = [dsc_label])
aac_discriminator.compile(loss = ['binary_crossentropy'], optimizer = opti())
aac_discriminator.summary()
aac_autoencoder.summary()

make_trainable(aac_discriminator, False)
make_trainable(aac_autoencoder, True)
model = Model(input = [aac_input], output = [aac_reconstructed, aac_reconstructed, aac_recons_discrim])
model.compile(loss = [ulaw_rmse, mfcc_loss, 'binary_crossentropy'],
              loss_weights = loss_weights,
              optimizer = opti())
model.summary()

X_train = np.copy(processedWindows)
ntrain = X_train.shape[0]



____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_6 (InputLayer)             (None, 512, 1)        0                                            
____________________________________________________________________________________________________
sequential_10 (Sequential)       (None, 1)             225601      input_6[0][0]                    
Total params: 225601
____________________________________________________________________________________________________
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 512, 1)        0                                            
______________________________________________________________________

In [35]:
s = theano_dft(X_train).eval()
print X_train.shape
print s.shape

print np.linalg.norm(s[3])
print np.linalg.norm(X_train[3])


(101135, 512, 1)
(101135, 1024)
65.0342547698
2.87414


In [36]:
def test_discriminator(discriminator, autoencoder, X, y, verbose = True):
    # verify discriminator was trained properly
    y_hat = discriminator.predict(X)
    y_hat[y_hat >= 0.5] = 1
    y_hat[y_hat < 0.5] = 0

    n_total = y.shape[0]
    n_correct = np.sum(np.ravel(y_hat) == y)

    acc = n_correct * 100.0 / n_total
    if (verbose):
        print "Discriminator accuracy: %0.02f pct (%d of %d) right"%(acc, n_correct, n_total)
    return acc

In [37]:
def autoencoderTest(waveFilename, prefix, autoencoder, verbose = True):
    [rate, data] = sciwav.read(waveFilename)
    processedWave, wparams = preprocessWaveform(data)
    windows = extractWindows(processedWave, STEP_SIZE, OVERLAP_SIZE)
    

    # first, write desired reconstruction
    transformed, tparams = preprocessWindows(windows)
    if (verbose):
        print transformed.shape
    desired = unpreprocessWindows(transformed, tparams)
    desired = reconstructFromWindows(desired, OVERLAP_SIZE, OVERLAP_FUNC)
    desired = unpreprocessWaveform(desired, wparams)
    sciwav.write(prefix + "desired.wav", rate, desired.astype(np.int16))
    
    # then, run NN on transformed windows
    transformed, tparams = preprocessWindows(windows)
    
    transformed = np.reshape(transformed, (transformed.shape[0], WINDOW_SIZE, 1))
    autoencOutput = autoencoder.predict(transformed, batch_size = BATCH_SIZE, verbose = (1 if verbose else 0))
    if (verbose):
        print autoencOutput.shape
    autoencOutput = np.reshape(autoencOutput, (autoencOutput.shape[0], WINDOW_SIZE))
    
    if (verbose):
        print autoencOutput.shape
    recons = unpreprocessWindows(autoencOutput, tparams)
    recons = reconstructFromWindows(recons, OVERLAP_SIZE, OVERLAP_FUNC)
    recons = unpreprocessWaveform(recons, wparams)
    
    sciwav.write(prefix + "output.wav", rate, recons.astype(np.int16))
    
    metrics = [
        np.max(desired),
        np.min(desired),
        np.max(recons),
        np.min(recons),
        mse(recons, desired),
        avgErr(recons, desired)
    ]
    
    if (verbose):
        print "Max/min desired:", metrics[0], metrics[1]
        print "Max/min recons: ", metrics[2], metrics[3]
        print waveFilename, " mse: ", metrics[4]
        print waveFilename, " avg err: ", metrics[5]
        
    return metrics

In [38]:
np.set_printoptions(formatter={'float_kind':'{:4f}'.format})

BATCH_SIZE = 64
NUM_BATCHES = ntrain / BATCH_SIZE
NUM_EPOCHS = 300

lead = "    "
d_loss = 0.0
a_losses = []
d_acc = 0.0
discrim_train_y = np.concatenate((np.ones(ntrain), np.zeros(ntrain)))


for epoch in range(NUM_EPOCHS):
    print "Epoch " + str(epoch + 1) + ":"

    # present batches randomly each epoch
    lis = range(0, ntrain, BATCH_SIZE)
    random.shuffle(lis)
    
    # keep track of start time and current batch #
    i = 0
    startTime = time.time()
    for idx in lis:
        batch = X_train[idx:idx+BATCH_SIZE, :,  :]
        nbatch = batch.shape[0]
        
        a_losses = ["autoencoder not training"]
        d_loss = "discriminator not training"
        
        # train autoencoder, if discriminator accuracy is greater than 70%
        if (epoch > 0):
            make_trainable(aac_discriminator, False)
            make_trainable(aac_autoencoder, True)
            a_losses = model.train_on_batch(batch, [batch, batch, np.ones(nbatch)])
        
        # train discriminator on what the autoencoder now generates
        make_trainable(aac_discriminator, True)
        make_trainable(aac_autoencoder, False)
        generated = aac_autoencoder.predict(batch)
        discrim_batch_X = np.concatenate((batch, generated))
        discrim_batch_y = np.concatenate((np.ones(nbatch), np.zeros(nbatch)))
        d_loss = aac_discriminator.train_on_batch(discrim_batch_X, discrim_batch_y)
        
        # print statistics every 10 batches so we know stuff is still going down
        if (i % 10 == 0):
            printStr = "        \r" + lead + str(i * BATCH_SIZE) + ": " + str(d_loss) + " "
            print printStr,
            
            loss_arr = np.asarray(a_losses)
            print loss_arr,
            
            if (len(loss_arr) > 1):
                for i in xrange(0, len(loss_weights)):
                    loss_arr[i + 1] *= loss_weights[i]
                print loss_arr,
            
        i += 1
    print ""
    
    # print elapsed time
    elapsed = time.time() - startTime
    print lead + "Total time for epoch: " + str(elapsed) + "s"
    
    # evaluate on full training set every 10 epochs
    if (epoch == 0 or (epoch + 1) % 10 == 0):
        startTime = time.time()
        
        a_losses = model.evaluate(X_train, [X_train, X_train, np.ones(ntrain)], verbose = 0)
        generated = aac_autoencoder.predict(X_train, verbose = 0)
        discrim_train_X = np.concatenate((X_train, generated))
        d_acc = test_discriminator(aac_discriminator, aac_autoencoder,
                                   discrim_train_X, discrim_train_y, verbose = False)
    
        print lead + "Evaluated on full training set: " + str(d_acc) + "% d_acc -- a_losses ",
        print np.asarray(a_losses)
        elapsed = time.time() - startTime
        print lead + "Total time for evaluation: " + str(elapsed) + "s"
        
    # evaluate on real data every 5 epochs
    if (epoch % 5 == 0):
        startTime = time.time()
        print lead + "Evaluating..."
        metrics = autoencoderTest("./SA1.WAV", "SA1_reg_eval_", aac_autoencoder, verbose = False)
        
        print lead + "Max/min desired:", metrics[0], metrics[1]
        print lead + "Max/min recons: ", metrics[2], metrics[3]
        print lead + "MSE:     ", metrics[4]
        print lead + "Avg err: ", metrics[5]
        
        elapsed = time.time() - startTime
        print lead + "Total time for evaluation: " + str(elapsed) + "s"
        
    print ""

Epoch 1:
    101120: 0.0303344223648  ['autoencoder not training'] 
    Total time for epoch: 697.655938864s
    Evaluated on full training set: 99.3330696594% d_acc -- a_losses  [411.331134 0.440947 2048.557748 10.428798]
    Total time for evaluation: 429.555788994s
    Evaluating...


UnboundLocalError: local variable 'unprocessed' referenced before assignment

In [None]:
autoencoderTest("./SA1.WAV", "SA1_aac_", aac_autoencoder)
autoencoderTest("./SX383.WAV", "SX383_aac_", aac_autoencoder)
autoencoderTest("./fiveYears.wav", "fy_aac_", aac_autoencoder)