In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import random
import h5py
from keras.datasets import cifar10
from keras.models import *
from keras.layers import *
from keras.layers.core import *
from keras.layers.normalization import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
from keras.regularizers import *
import theano.tensor as T
import theano
from theano.tensor.shared_randomstreams import RandomStreams
from sklearn import metrics
from skimage.measure import compare_ssim
from scipy.misc import toimage
from sklearn.preprocessing import *

import os
import random
import time
from skimage import io, exposure, feature, color, transform
import matplotlib
import matplotlib.pyplot as plt
import glob

import scipy.signal as sig
import operator
import math

# for reproducibility: seed NumPy's global RNG and Python's `random` module
# NOTE(review): no Theano random stream is seeded in this cell -- confirm
# whether anything downstream draws from one.
np.random.seed(1337) 
random.seed(1337)

Using Theano backend.
Using gpu device 0: GeForce GTX 690 (CNMeM is disabled, cuDNN 5105)


In [2]:
# parameters for sliding window, and window function (Hann)
# WINDOW_SIZE is the step plus the overlap (480 + 32 = 512 samples)
STEP_SIZE = 480
OVERLAP_SIZE = 32
WINDOW_SIZE = STEP_SIZE + OVERLAP_SIZE
# Hann window of length 2 * OVERLAP_SIZE, passed to reconstructFromWindows
OVERLAP_FUNC = sig.hann(OVERLAP_SIZE * 2)

# directory that contains TIMIT files
TIMIT_DIR = "/home/sri/Desktop/timit"

# training hyperparameters
# NOTE(review): both names are assigned again in the training-loop cell
NUM_EPOCHS = 1
BATCH_SIZE = 64

# randomly shuffle data before partitioning into training/validation?
# NOTE(review): no validation split is visible in this notebook
RANDOM_SHUFFLE = True

# sample rate of input file (used in MFCC calculation)
# NOTE(review): no MFCC computation is visible in this notebook
SAMPLE_RATE = 16000

In [3]:
from load_TIMIT import *
from windowingFunctions import *
from utility import *

In [4]:
# read in 100 WAVs from TIMIT training set
# (later cells index the result like a sequence of 1-D sample arrays,
#  e.g. rawWaveforms[55])
rawWaveforms = load_TIMIT_train(TIMIT_DIR, 100)

Reading in .wav files...
0: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SA1.WAV1: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX339.WAV2: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SI1059.WAV3: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SI1689.WAV4: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX429.WAV5: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX69.WAV6: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX159.WAV7: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SI2319.WAV8: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SA2.WAV9: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MKLR0/SX249.WAV10: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SX221.WAV11: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SA1.WAV12: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SX41.WAV13: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SI1751.WAV14: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SI1725.WAV15: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/MCRE0/SX401.WAV16: /home/sri/Desktop/timit/

In [5]:
# waveform preprocessing
def preprocessWaveform(waveform):
    """Scale a waveform so its largest-magnitude sample lands on +/-32768.

    Parameters
    ----------
    waveform : numpy array of samples (any numeric dtype).

    Returns
    -------
    processed : array
        ``waveform * scl``.
    params : tuple
        ``(scl,)`` -- hand it to ``unpreprocessWaveform`` to undo the scaling.
    """
    # Measure the peak in float64: abs() of an int16 sample equal to -32768
    # wraps around to -32768 again, which made the peak estimate wrong for
    # full-scale recordings.
    peak = float(np.max(np.abs(np.asarray(waveform, dtype=np.float64))))

    # A silent (all-zero) waveform has nothing to scale; a factor of 1.0
    # avoids the division by zero that would otherwise fill the result
    # with NaN.
    scl = 32768.0 / peak if peak > 0 else 1.0

    processed = waveform * scl

    return processed, (scl,)
    
def unpreprocessWaveform(waveform, params):
    """Undo ``preprocessWaveform`` by dividing out the stored scale factor.

    ``params`` is the tuple returned by ``preprocessWaveform``; only its
    first entry (the scale factor) is read.
    """
    scale = params[0]
    return waveform / scale



# window preprocessing
def preprocessWindows(windows):
    """Quantize audio windows to 8-bit mu-law codes.

    Parameters
    ----------
    windows : array-like of samples, nominally in [-32768, 32768].

    Returns
    -------
    processed : int16 array with the shape of ``windows``, values in [0, 255].
    params : empty tuple (there is nothing to remember for the inverse).
    """
    # Work on a floating-point copy so the caller's array is untouched.
    # Integer input is promoted first: dividing an integer array in place
    # raised a TypeError.  Existing float dtypes are kept as they are.
    processed = np.array(windows)
    if not np.issubdtype(processed.dtype, np.floating):
        processed = processed.astype(np.float64)

    # scale window between -1 and 1; clip so that samples beyond the nominal
    # range cannot turn into codes outside [0, 255]
    processed = np.clip(processed / 32768.0, -1.0, 1.0)

    # apply u-law transformation (mu = 255)
    processed = np.sign(processed) * np.log(1.0 + 255.0 * np.abs(processed)) / np.log(1.0 + 255.0)

    # scale from 0 to 255
    processed = ((processed + 1.0) / 2.0) * 255.0

    # quantize to the 256 integer levels 0..255
    processed = np.round(processed)
    processed = processed.astype(np.int16)

    return processed, ()

def unpreprocessWindows(windows, params):
    """Invert ``preprocessWindows``: mu-law codes back to sample amplitudes.

    ``windows`` holds integer codes; codes in [0, 255] map to float32 values
    in [-32768, 32768].  ``params`` is accepted for symmetry with
    ``preprocessWindows`` but is not read.
    """
    # de-quantize: codes [0, 255] -> floats in [-1.0, 1.0]
    codes = np.array(windows, dtype=np.float32)
    companded = (codes / 255.0) * 2.0 - 1.0

    # inverse u-law: sign(y) * (256^|y| - 1) / 255
    magnitude = np.power(256, np.abs(companded)) - 1
    expanded = np.sign(companded) * magnitude / 255.0

    # [-1, 1] -> [-32768, 32768]
    return expanded * 32768.0

In [6]:
# test data preprocessing and loss
data = np.copy(rawWaveforms[55])

processedWave, wparams = preprocessWaveform(data)
windows = extractWindows(processedWave, STEP_SIZE, OVERLAP_SIZE)
transformed, tparams = preprocessWindows(windows)

print np.max(transformed)
print np.min(transformed)

desired = unpreprocessWindows(transformed, tparams)
desired = reconstructFromWindows(desired, OVERLAP_SIZE, OVERLAP_FUNC)
desired = unpreprocessWaveform(desired, wparams)

# trim desired down to length of data
if (desired.shape[0] > data.shape[0]):
    desired = desired[:data.shape[0]]
desired = np.round(desired)
    
sciwav.write("orig.wav", 16000, data.astype(np.int16))
sciwav.write("processed.wav", 16000, desired.astype(np.int16))

print "mean l1:", np.mean(np.abs(data - desired))
print "max l1: ", np.max(np.abs(data - desired))
print "means:", np.mean(data), np.mean(desired)
print "stds: ", np.std(data), np.std(desired)
print "max/min orig:  ", np.max(data), np.min(data)
print "max/min recons:", np.max(desired), np.min(desired)

255
7
mean l1: 1.56021363606
max l1:  38.0
means: 0.471988200854 0.466357
stds:  210.915280373 210.674
max/min orig:   1379.0 -1023.0
max/min recons: 1379.0 -1016.0


In [7]:
# waveform preprocessing
# we maximize the volume of every waveform: each one is peak-normalized on
# its own and the per-waveform scale factor is discarded
processedWaveforms = np.copy(rawWaveforms)
for i, wave in enumerate(processedWaveforms):
    processedWaveforms[i], _ = preprocessWaveform(wave)

In [8]:
# extract windows from every normalized waveform
# (the recorded run produced a single (10116, 512) array)
rawWindows = extractWindowsMultiple(processedWaveforms, STEP_SIZE, OVERLAP_SIZE,
                                    collapse = True)

# randomly shuffle data
# np.random.permutation returns a shuffled copy (shuffled along the first
# axis), so window order is randomized but each window stays intact
if (RANDOM_SHUFFLE):
    rawWindows = np.random.permutation(rawWindows)

# quick look at shape and value range
print "Raw windows shape: ", rawWindows.shape
print "Max: ", np.amax(rawWindows)
print "Min: ", np.amin(rawWindows)

Raw windows shape:  (10116, 512)
Max:  32768.0
Min:  -32768.0


In [9]:
# data augmentation goes here, at some point
# for now this is only a copy of the shuffled windows, so later cells can
# keep using the name augWindows
augWindows = np.copy(rawWindows)

print "Aug windows shape: ", augWindows.shape

Aug windows shape:  (10116, 512)


In [10]:
# quantize every window to integer mu-law codes (see preprocessWindows)
processedWindows, pwParams = preprocessWindows(augWindows)

# reshape into (WINDOW_SIZE x 1) vector form for training
# X_train therefore holds the raw integer codes, one channel per sample
X_train = np.reshape(processedWindows, (processedWindows.shape[0], WINDOW_SIZE, 1))
ntrain = X_train.shape[0]

print processedWindows.shape

# summary statistics of the codes over the whole training set
print np.mean(processedWindows, axis=None)
print np.std(processedWindows, axis=None)
print np.min(processedWindows, axis = None)
print np.max(processedWindows, axis = None)

(10116, 512)
124.970972655
50.8355320904
0
255


In [11]:
# operations for binarization layer (THEANO ONLY)
class Binarize(T.Op):
    """Theano op that replaces every element by -1 or +1.

    Forward pass: values below zero become -1, values at or above zero
    become +1.  Backward pass: the incoming gradient is handed back
    unchanged.
    """
    # properties attribute: empty, the op carries no parameters
    __props__ = ()

    def __init__(self):
        super(Binarize, self).__init__()

    def make_node(self, x):
        """Create the Apply node: one input, one output of the same type."""
        # '_props' (single underscores) is deliberate; per the assertion
        # message its presence signals a Theano recent enough for __props__.
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        tensor_in = T.as_tensor_variable(x)
        return theano.Apply(self, [tensor_in], [tensor_in.type()])

    def perform(self, node, inputs, output_storage):
        """NumPy implementation of the forward pass."""
        (values,) = inputs
        (out_cell,) = output_storage

        # TODO: learn threshold per parameter?
        # The masks are taken from the untouched input, so every element is
        # written at most once; anything matching neither mask (NaN) is
        # left as it was.
        signs = np.copy(values)
        signs[values < 0] = -1
        signs[values >= 0] = 1
        out_cell[0] = signs

    def grad(self, input, output_gradients):
        """Return the upstream gradient as the gradient w.r.t. the input."""
        # pass through gradients unchanged
        # (i don't think there's a mathematical justification for this?)
        upstream = output_gradients[0]
        return [upstream]

    def infer_shape(self, node, i0_shapes):
        """Output shape equals input shape."""
        return i0_shapes

    
class BinarizeLayer(Layer):
    """ Binarizes input 
    <feedforward> binarizes output of tanh to -1 and 1
    <backward> returns delta unchanged

    The layer owns no weights.  It does not override output-shape
    inference, so that is left to the base ``Layer``.
    """
    def __init__(self, **kwargs):
        super(BinarizeLayer, self).__init__(**kwargs)
        # the Theano op that does the actual work
        self.op = Binarize()

    def build(self, input_shape):
        # no trainable parameters
        self.trainable_weights = []

    def call(self, x, mask=None):
        return self.op(x)

    def get_config(self):
        """Return the serializable configuration of this layer.

        The layer adds no options of its own, so the base configuration is
        returned as-is.  Previously its ``name`` entry was overwritten with
        the class name, which discarded the instance's own name and would
        give every BinarizeLayer the same name after a save/load round trip.
        """
        return dict(super(BinarizeLayer, self).get_config())


In [18]:
# shape of one network input: WINDOW_SIZE samples x 1 channel
input_dim = (WINDOW_SIZE, 1)
# total number of values in one input
input_size = np.prod(input_dim)
# length of the flattened code between encoder and decoder
# (the decoder reshapes it to 32 x 10, so it must stay 320 unless that
#  reshape is changed too)
bottleneck_size = 320

# ---------------------------------------------------------------------------
# autoencoder: takes an audio window, compresses it, and tries to reconstruct it
# ---------------------------------------------------------------------------
def autoencoder_structure(dim):
    """Build the encoder and decoder halves of the convolutional autoencoder.

    based on architecture in this paper:
        http://arxiv.org/pdf/1602.02644.pdf
    adapted here to 1-D windows of audio codes.

    ``dim`` is the input shape of the encoder.  The shape comments below
    assume ``dim == (512, 1)``; the module-level ``bottleneck_size`` and the
    decoder's fixed (32, 10) reshape only line up for that input length.

    Returns ``(enc, dec)``: two ``Sequential`` models.  ``enc`` maps a window
    to a flat binarized code of length ``bottleneck_size``; ``dec`` maps
    that code to a (512 x 256) array of per-sample softmax distributions.
    """
    def conv(n_filters, width, activation, **extra):
        # every convolution shares regularizer strength, padding and init;
        # each call builds its own l2 instance, as the unrolled code did
        return Convolution1D(n_filters, width, W_regularizer = l2(0.001),
                             border_mode = 'same', activation = activation,
                             init = 'uniform', bias = True, **extra)

    enc = Sequential()
    dec = Sequential()

    # ---- encoder ----
    # (512x1) => (256x64)
    enc.add(conv(64, 16, 'relu', input_shape = dim))
    enc.add(MaxPooling1D(2))

    # (256x64) => (128x64) => (64x64)
    for _ in range(2):
        enc.add(conv(64, 16, 'relu'))
        enc.add(MaxPooling1D(2))

    # (64x64) => (32x10), tanh so the values are centred before binarizing
    enc.add(conv(10, 16, 'tanh'))
    enc.add(MaxPooling1D(2))

    # flatten and binarize
    enc.add(Reshape((bottleneck_size,)))
    enc.add(BinarizeLayer())

    # ---- decoder ----
    dec.add(Reshape((32, 10,), input_shape = (bottleneck_size,)))

    # (32x10) => (64x64) => (128x64) => (256x64) => (512x64)
    for _ in range(4):
        dec.add(UpSampling1D(2))
        dec.add(conv(64, 16, 'relu'))

    # (512x64) => (512x64)
    dec.add(conv(64, 16, 'relu'))

    # (512x64) => (512x256) where each 256-len vector is a softmax
    dec.add(conv(256, 1, 'softmax'))

    return enc, dec




# NOTE(review): this rebinds the name `metrics`, which the first cell bound
# to sklearn's metrics module; from here on `metrics` is keras.metrics.
from keras import metrics
# extra quantities reported by train_on_batch next to the loss
all_metrics = [
    metrics.categorical_accuracy,
    metrics.mean_absolute_error
]
    
# plain autoencoder
# encoder and decoder are chained on one Input tensor
inp = Input(shape = input_dim)
enc, dec = autoencoder_structure(input_dim)
embedding = enc(inp)
recons = dec(embedding)
autoencoder = Model(input = inp, output = recons)
autoencoder.compile(loss = 'categorical_crossentropy', optimizer = Adam())
autoencoder.summary()

# compile another model (in case i want to do other stuff during training)
# it is built from the same `inp` and `recons` tensors as `autoencoder`;
# the only difference visible here is the added metrics
model = Model(input = inp, output = recons)
model.compile(loss = 'categorical_crossentropy',
              optimizer = Adam(), metrics = all_metrics)



____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 512, 1)        0                                            
____________________________________________________________________________________________________
sequential_3 (Sequential)        (None, 320)           142538      input_2[0][0]                    
____________________________________________________________________________________________________
sequential_4 (Sequential)        (None, 512, 256)      289344      sequential_3[1][0]               
Total params: 431882
____________________________________________________________________________________________________


In [19]:
import scipy

# turns quantized windows to one-hot vectors
#     (optimized function ;) 
def windows_to_one_hot(windows, soft = False, sz = 9, std = 1.0):
    """Turn quantized windows into (soft) one-hot target vectors.

    Parameters
    ----------
    windows : integer array of shape (num_windows, window_len) holding
        codes in [0, 255].
    soft : if True, each target is a Gaussian bump centred on the code
        instead of a single 1.
    sz : number of kernel taps; tap ``k`` is written at ``code + k - sz // 2``
        (odd values give a kernel centred on the code).
    std : standard deviation of the Gaussian, in code units.

    Returns
    -------
    float64 array of shape (num_windows, window_len, 256).  In soft mode the
    taps that would fall outside [0, 255] are dropped, so rows for codes
    close to either end sum to less than 1.
    """
    windows = np.asarray(windows)
    num_windows, window_len = windows.shape[0], windows.shape[1]
    one_hot = np.zeros((num_windows, window_len, 256))

    if soft:
        # normalized Gaussian kernel, same formula as scipy.signal.gaussian
        taps = np.arange(sz) - (sz - 1) / 2.0
        kernel = np.exp(-0.5 * (taps / float(std)) ** 2)
        kernel /= kernel.sum()

        half = sz // 2
        codes = windows.astype(np.int64)
        for k in range(sz):
            shifted = codes + (k - half)

            # Only skip the individual samples whose shifted code leaves the
            # valid range.  Previously one such sample raised an IndexError
            # that was silently swallowed, dropping this tap for every
            # sample of the window.
            inside = (shifted >= 0) & (shifted < 256)
            win_idx, pos_idx = np.nonzero(inside)
            one_hot[win_idx, pos_idx, shifted[inside]] = kernel[k]
    else:
        # broadcast row/column indices so all windows are filled in one go
        win_idx = np.arange(num_windows)[:, np.newaxis]
        pos_idx = np.arange(window_len)[np.newaxis, :]
        one_hot[win_idx, pos_idx, windows] = 1

    return one_hot
    
# turns one-hot-vectors into quantized (by taking argmax)
def one_hot_to_windows(one_hot):
    """Collapse (soft) one-hot vectors to integer codes.

    Takes the argmax over the last axis, so an input of shape
    (num_windows, window_len, 256) gives (num_windows, window_len).  A single
    vectorized call replaces the per-sample Python loop.
    """
    return np.argmax(np.asarray(one_hot), axis = -1)
    
print X_train.shape

# sanity check: soft one-hot followed by argmax should give back the codes
batch = X_train[:128, :,  0]
batch_onehot = windows_to_one_hot(batch, True)
batch_rec = one_hot_to_windows(batch_onehot)

# a mean difference of 0.0 is what a lossless round trip prints
print np.mean(batch_rec - batch)


# time 80 soft one-hot conversions of a 128-window batch
t0 = time.time()
for i in xrange(0, 80):
    windows_to_one_hot(batch, True)
t1 = time.time()
print t1-t0, "secs"

# for comparison, time only the construction of the Gaussian kernel
t0 = time.time()
for i in xrange(0, 80):
    sz = 9
    std = 1.0
    kernel = scipy.signal.gaussian(sz, std)
    kernel /= sum(kernel)
t1 = time.time()
print t1-t0, "secs"


#one_hot = np.eye(16)[[5, 4]]
#print one_hot
#print [np.convolve(w, kernel, mode = 'same') / ksum for w in one_hot]


(10116, 512, 1)
0.0
1.88981103897 secs
0.000838994979858 secs


In [20]:
def autoencoderTest(waveFilename, prefix, autoencoder):
    [rate, data] = sciwav.read(waveFilename)
    processedWave, wparams = preprocessWaveform(data)
    windows = extractWindows(processedWave, STEP_SIZE, OVERLAP_SIZE)
    

    # first, write desired reconstruction
    transformed, tparams = preprocessWindows(windows)
    print transformed.shape
    desired = unpreprocessWindows(transformed, tparams)
    desired = reconstructFromWindows(desired, OVERLAP_SIZE, OVERLAP_FUNC)
    desired = unpreprocessWaveform(desired, wparams)
    sciwav.write(prefix + "desired.wav", rate, desired.astype(np.int16))
    
    # then, run NN on transformed windows
    transformed, tparams = preprocessWindows(windows)
    
    transformed = np.reshape(transformed, (transformed.shape[0], WINDOW_SIZE, 1))
    autoencOutput = autoencoder.predict(transformed, batch_size = 64, verbose = 1)
    print autoencOutput.shape
    autoencOutput = np.reshape(autoencOutput, (autoencOutput.shape[0], WINDOW_SIZE, 256))
    autoencOutput = one_hot_to_windows(autoencOutput)
    
    print autoencOutput.shape
    recons = unpreprocessWindows(autoencOutput, tparams)
    recons = reconstructFromWindows(recons, OVERLAP_SIZE, OVERLAP_FUNC)
    recons = unpreprocessWaveform(recons, wparams)
    
    print "Max desired:", np.max(desired)
    print "Min desired:", np.min(desired)
    print "Max recons: ", np.max(recons)
    print "Min recons: ", np.min(recons)
    
    sciwav.write(prefix + "output.wav", rate, recons.astype(np.int16))

    print waveFilename, " mse: ", mse(recons, desired)
    print waveFilename, " avg err: ", avgErr(recons, desired)

In [None]:
np.set_printoptions(formatter={'float_kind':'{:4f}'.format})

BATCH_SIZE = 128
NUM_BATCHES = ntrain / BATCH_SIZE
NUM_EPOCHS = 100

lead = "    "
d_loss = 0.0
a_losses = []
d_acc = 0.0
discrim_train_y = np.concatenate((np.ones(ntrain), np.zeros(ntrain)))


for epoch in range(NUM_EPOCHS):
    print "Epoch " + str(epoch + 1) + ":"

    # present batches randomly each epoch
    lis = range(0, ntrain, BATCH_SIZE)
    random.shuffle(lis)
    
    # keep track of start time and current batch #
    i = 0
    startTime = time.time()
    totalLosses = []
    for idx in lis:
        batch = X_train[idx:idx+BATCH_SIZE, :,  :]
        nbatch = batch.shape[0]
        
        batch_onehot = windows_to_one_hot(batch[:, :, 0], True)
        
        losses = model.train_on_batch(batch, batch_onehot)
        if (totalLosses == []):
            totalLosses = np.array(losses)
        else:
            totalLosses += np.array(losses)
           
        avgLosses = totalLosses / (i + 1)
        
        # print statistics every 10 batches so we know stuff is still going down
        if (i % 10 == 0):
            printStr = "        \r" + lead + str(i * BATCH_SIZE) + ": " + str(avgLosses)
            print printStr,
            
        i += 1
    print ""
    
    # print elapsed time
    elapsed = time.time() - startTime
    print lead + "Total time for epoch: " + str(elapsed) + "s"
    
    print ""

Epoch 1:


KeyboardInterrupt: 

In [None]:
# run the evaluation on three local WAV files; each pair is
# (input path, prefix for the files that autoencoderTest writes)
for wavPath, outPrefix in [("./SA1.WAV", "SA1_aac_"),
                           ("./SX383.WAV", "SX383_aac_"),
                           ("./fiveYears.wav", "fy_aac_")]:
    autoencoderTest(wavPath, outPrefix, autoencoder)

(112, 512)
