In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import random
import h5py
from keras.datasets import cifar10
from keras.models import *
from keras.layers import *
from keras.layers.core import *
from keras.layers.normalization import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
from keras.regularizers import *
from keras.models import load_model
import theano.tensor as T
import theano
from theano.tensor.shared_randomstreams import RandomStreams
from sklearn import metrics
from skimage.measure import compare_ssim
from scipy.misc import toimage
from sklearn.preprocessing import *
from scipy.fftpack import dct, idct

import os
import random
import time
from skimage import io, exposure, feature, color, transform
import matplotlib
import matplotlib.pyplot as plt
import glob

import scipy.signal as sig
import operator
import math
import re

# for reproducibility: seed both NumPy's global RNG and Python's `random` module,
# since later cells draw from each (e.g. np.random.permutation, random.sample)
np.random.seed(1337) 
random.seed(1337)

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 690 (CNMeM is disabled, cuDNN 5105)


In [2]:
# external custom code I wrote
from load_TIMIT import *
from windowingFunctions import *
from utility import *

In [3]:
# parameters for sliding window, and window function (Hann)
# WINDOW_SIZE (480 + 32 = 512) is the per-window sample count the networks consume
STEP_SIZE = 480
OVERLAP_SIZE = 32
WINDOW_SIZE = STEP_SIZE + OVERLAP_SIZE
# Hann taper of length 2 * OVERLAP_SIZE; handed to reconstructFromWindows whenever
# windows are stitched back into a waveform
OVERLAP_FUNC = sig.hann(OVERLAP_SIZE * 2)

# sample rate of input files
SAMPLE_RATE = 16000

# randomly shuffle data before partitioning into training/validation?
RANDOM_SHUFFLE = True

# number of speech files for train, val, and test
TRAIN_SIZE = 1000
VAL_SIZE = 100
TEST_SIZE = 500

# during training, we evaluate PESQ and RMSE and such on full speech files every epoch, which
# is kind of expensive. so instead of selecting the full training and validation set, we
# randomly select this many waveforms
TRAIN_EVALUATE = 50
VAL_EVALUATE = 50

In [4]:
# generate train/test split and load waveforms
# timit_train_test_val and load_raw_waveforms come from the external helper modules
# imported above; the paths are kept because the AMR baseline cell samples from them
train_paths, val_paths, test_paths = \
    timit_train_test_val(TRAIN_SIZE, VAL_SIZE, TEST_SIZE)

# one raw waveform per path, in the same order as the path lists
# (NOTE(review): ordering is assumed from how the lists are indexed later -- confirm)
train_waveforms = load_raw_waveforms(train_paths)
val_waveforms = load_raw_waveforms(val_paths)
test_waveforms = load_raw_waveforms(test_paths)

0: /home/sri/Desktop/timit/TIMIT/TRAIN/DR1/FSJK1/SI1025.WAV1: /home/sri/Desktop/timit/TIMIT/TRAIN/DR2/MPPC0/SX152.WAV2: /home/sri/Desktop/timit/TIMIT/TRAIN/DR3/FLAC0/SI1339.WAV3: /home/sri/Desktop/timit/TIMIT/TRAIN/DR4/MLBC0/SX339.WAV4: /home/sri/Desktop/timit/TIMIT/TRAIN/DR5/FLOD0/SX117.WAV5: /home/sri/Desktop/timit/TIMIT/TRAIN/DR6/FSDJ0/SA1.WAV6: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/FCJS0/SA1.WAV7: /home/sri/Desktop/timit/TIMIT/TRAIN/DR8/MCXM0/SI721.WAV8: /home/sri/Desktop/timit/TIMIT/TRAIN/DR1/FDML0/SA1.WAV9: /home/sri/Desktop/timit/TIMIT/TRAIN/DR2/FLMC0/SI1372.WAV10: /home/sri/Desktop/timit/TIMIT/TRAIN/DR3/MAKR0/SX272.WAV11: /home/sri/Desktop/timit/TIMIT/TRAIN/DR4/FJWB1/SI2055.WAV12: /home/sri/Desktop/timit/TIMIT/TRAIN/DR5/FLOD0/SA1.WAV13: /home/sri/Desktop/timit/TIMIT/TRAIN/DR6/FSDJ0/SX305.WAV14: /home/sri/Desktop/timit/TIMIT/TRAIN/DR7/FLEH0/SI2311.WAV15: /home/sri/Desktop/timit/TIMIT/TRAIN/DR8/MKRG0/SX411.WAV16: /home/sri/Desktop/timit/TIMIT/TRAIN/DR1/MMRP0/SX5

In [5]:
# waveform preprocessing functions
def preprocess_waveform(waveform):
    """Peak-normalize a waveform so its largest absolute sample becomes 1.

    Args:
        waveform: array of samples.

    Returns:
        A pair ``(scaled, params)`` where ``scaled`` is a new array (the input
        is not modified) and ``params`` is a 1-tuple holding the scale factor
        that ``unpreprocess_waveform`` multiplies back in.

    An all-zero (silent) waveform has no peak to normalize by; it is returned
    unchanged with a scale of 1.0 instead of dividing by zero and producing
    NaNs.
    """
    # largest magnitude over all samples (same value as max(|min|, |max|))
    maxabs = np.max(np.abs(waveform))

    if maxabs == 0:
        # silent input: keep it silent and make the round trip a no-op
        maxabs = 1.0

    return np.copy(waveform) / maxabs, (maxabs,)

def unpreprocess_waveform(waveform, params):
    """Undo ``preprocess_waveform``: scale a copy of `waveform` by ``params[0]``."""
    restored = np.copy(waveform)
    scale = params[0]
    return restored * scale

In [6]:
# waveform preprocessing in action
# work on copies so the raw waveform containers are left untouched
train_procwave = np.copy(train_waveforms)
val_procwave = np.copy(val_waveforms)
test_procwave = np.copy(test_waveforms)

# one parameter tuple per waveform, filled in by the loop below
train_wparams = [()] * len(train_procwave)
val_wparams = [()] * len(val_procwave)
test_wparams = [()] * len(test_procwave)


# we maximize the volume of every waveform, split by split (train, val, test)
for _procwaves, _wparams in ((train_procwave, train_wparams),
                             (val_procwave, val_wparams),
                             (test_procwave, test_wparams)):
    for i in xrange(0, len(_procwaves)):
        _procwaves[i], _wparams[i] = preprocess_waveform(_procwaves[i])

In [7]:
# turn each waveform into a corresponding list of windows
# extractWindowsMultiple comes from the external windowing helpers.
# NOTE(review): collapse = False presumably keeps the windows grouped per waveform --
# the results are later indexed per waveform and flattened explicitly -- confirm
train_windows = extractWindowsMultiple(train_procwave, STEP_SIZE, OVERLAP_SIZE,
                                       collapse = False)
val_windows = extractWindowsMultiple(val_procwave, STEP_SIZE, OVERLAP_SIZE,
                                     collapse = False)
test_windows = extractWindowsMultiple(test_procwave, STEP_SIZE, OVERLAP_SIZE,
                                      collapse = False)

In [8]:
# flatten all of the train windows into vectors
# concatenate the windows of every training waveform into one array, then add a
# trailing channel axis so each sample has shape (WINDOW_SIZE, 1)
train_processed = np.array([i for z in train_windows for i in z])
train_processed = np.reshape(train_processed, (train_processed.shape[0], WINDOW_SIZE, 1))

# randomly shuffle data, if we want to
# (np.random.permutation returns a shuffled copy along the first axis)
if (RANDOM_SHUFFLE):
    train_processed = np.random.permutation(train_processed)
    
# quick sanity statistics: shape, mean, std, min, max over all samples
print train_processed.shape
print np.mean(train_processed, axis=None)
print np.std(train_processed, axis=None)
print np.min(train_processed, axis = None)
print np.max(train_processed, axis = None)

(101814, 512, 1)
6.41179e-06
0.103588
-1.0
1.0


In [9]:
class CodeRound(T.Op):
    """Theano Op that snaps values in [-1, 1] onto `nbins` evenly spaced levels.

    Forward pass: map [-1, 1] to [0, 1], round to the nearest of `nbins`
    levels, and map back to [-1, 1].
    Backward pass: the incoming gradient is passed through unchanged, so the
    rounding step does not block training of the layers before it.
    """
    # `nbins` changes what perform() computes, so it has to be part of the
    # properties Theano uses for Op equality and hashing; with an empty tuple,
    # Ops built with different bin counts would compare equal
    __props__ = ('nbins',)
    
    def __init__(self, nbins):
        # with fewer than two levels the (nbins - 1) divisor in perform() is zero
        if nbins < 2:
            raise ValueError("CodeRound needs nbins >= 2, got %r" % (nbins,))
        self.nbins = nbins
        super(CodeRound, self).__init__()
        
    def make_node(self, x):
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        x = T.as_tensor_variable(x)
        # the output has the same type as the input
        return theano.Apply(self, [x], [x.type()])
    
    def perform(self, node, inputs, output_storage):
        x, = inputs
        z, = output_storage
        
        # number of intervals between the first and last level
        steps = float(self.nbins - 1)
        
        s = (x + 1.0) / 2.0             # [-1, 1] -> [0, 1]
        s = np.round(s * steps) / steps # snap to the nearest level
        s = (s * 2.0) - 1.0             # [0, 1] -> [-1, 1]
        
        z[0] = s
    
    def grad(self, inputs, output_gradients):
        # pass through gradients unchanged
        g, = output_gradients
        return [g]
        
    def infer_shape(self, node, i0_shapes):
        # output shape is same as input shape
        return i0_shapes

In [10]:
class PhaseShiftUp1D(Layer):
    """ PhaseShiftUp1D
    Takes vector of size: B x S x nF
    And returns vector: B x nS x F

    The channel axis is split into groups of `n` which are moved onto the
    time axis, so the sequence gets `n` times longer and the channel count
    `n` times smaller. The layer has no trainable weights.
    """
    def __init__(self, n, **kwargs):
        super(PhaseShiftUp1D, self).__init__(**kwargs)
        self.n = n
    
    def build(self, input_shape):
        # no trainable parameters
        self.trainable_weights = []
        super(PhaseShiftUp1D, self).build(input_shape)
        
    def call(self, x, mask=None):
        batch, steps, chans = x.shape[0], x.shape[1], x.shape[2]
        # explicit floor division: shapes must stay integers regardless of the
        # Python version or Theano's integer-division setting
        out_chans = chans // self.n
        
        r = T.reshape(x, (batch, steps, out_chans, self.n))
        r = T.transpose(r, (0, 1, 3, 2))
        r = T.reshape(r, (batch, steps * self.n, out_chans))
        return r
    
    def compute_output_shape(self, input_shape):
        # unknown (None) dimensions stay unknown instead of raising a TypeError
        steps = None if input_shape[1] is None else input_shape[1] * self.n
        chans = None if input_shape[2] is None else input_shape[2] // self.n
        return (input_shape[0], steps, chans)
    
    def get_config(self):
        # include `n` so the layer can be rebuilt from its config
        config = {'n' : self.n}
        base_config = super(PhaseShiftUp1D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [11]:
# ------------------------------------------------------------------
# RMSE LOSS FUNCTION (NaN-safe)
# ------------------------------------------------------------------
def rmse(y_true, y_pred):
    """Root-mean-square error over the last axis.

    K.epsilon() is added under the square root so the value passed to the
    square root is never exactly zero.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error, axis=-1) + K.epsilon())

In [12]:
# freeze weights for stacked training
def make_trainable(net, val):
    """Set the `trainable` flag of `net` and of every layer in `net.layers` to `val`."""
    net.trainable = val
    for layer in net.layers:
        layer.trainable = val

In [13]:
# ====================================================
# PARAMETERS FOR AUTOENCODER STRUCTURE
# ====================================================
# number of quantization levels for the code (used by CodeRound and the entropy bins)
NBINS = 201
# how many 2x downsampling blocks the encoder applies (and upsampling blocks the
# decoder applies)
TIMES_DOWNSAMPLE = 0

# channel count and filter length shared by the convolutional blocks
NCHAN = 32
FILT_SIZE = 7
# explicit floor division so the value stays an integer under any division mode
FILT_MID = FILT_SIZE // 2 + 1

NUM_RES_BLOCKS = 4

input_dim = (WINDOW_SIZE, 1)
input_size = np.prod(input_dim)
# length of the quantized code; it feeds Reshape/Input shapes, so it must be an int
bottleneck_size = WINDOW_SIZE // int(2 ** TIMES_DOWNSAMPLE)

w_init = 'glorot_normal'

def activation():
    """Return a fresh instance of the nonlinearity used between convolutions."""
    return LeakyReLU(0.3)

In [14]:
# ----------------------------------------------------
# blocks of network
# ----------------------------------------------------

# residual block, going from NCHAN to NCHAN channels
def residual_block(num_chans = NCHAN, filt_size = FILT_SIZE,
                   dilation = 1):
    """Return a function applying conv -> activation -> conv plus an identity skip.

    Both convolutions use 'same' padding and `num_chans` filters, so the
    input must already have `num_chans` channels for the final Add to work.
    """
    def f(tensor):
        # keyword arguments shared by the two convolutions of the branch
        conv_kwargs = dict(padding = 'same',
                           kernel_initializer = w_init,
                           activation = 'linear',
                           use_bias = True,
                           dilation_rate = dilation)
        
        branch = Conv1D(num_chans, filt_size, **conv_kwargs)(tensor)
        branch = activation()(branch)
        branch = Conv1D(num_chans, filt_size, **conv_kwargs)(branch)
        
        # identity shortcut added to the convolutional branch
        return Add()([tensor, branch])
    
    return f


# increase number of channels from 1 to NCHAN via convolution
def channel_increase_block(num_chans = NCHAN, filt_size = FILT_SIZE):
    """Return a function that widens a single-channel signal to `num_chans` channels.

    The result is the sum of a shortcut (the one input channel repeated
    `num_chans` times) and a conv -> activation -> conv branch with
    `num_chans` filters.
    """
    def f(input):
        # shortcut: swap time/channel axes, repeat along the (now second) channel
        # axis, and swap back. The repeat count must follow `num_chans` -- not the
        # global NCHAN -- or the Add below fails for any other width.
        shortcut = Permute((2, 1))(input)
        shortcut = UpSampling1D(num_chans)(shortcut)
        shortcut = Permute((2, 1))(shortcut)
        
        res = Conv1D(num_chans, filt_size, padding = 'same',
                     kernel_initializer = w_init,
                     activation = 'linear')(input)
        res = activation()(res)
        res = Conv1D(num_chans, filt_size, padding = 'same',
                     kernel_initializer = w_init,
                     activation = 'linear')(res)
        
        m = Add()([shortcut, res])
        return m
        
    return f


# downsample the signal 2x
def downsample_block(num_chans = NCHAN, filt_size = FILT_SIZE):
    """Return a function that halves the time axis with a residual connection.

    Shortcut: average pooling with pool size 2.
    Branch: stride-2 convolution -> activation -> convolution.
    """
    def f(tensor):
        pooled = AveragePooling1D(2)(tensor)
        
        branch = Conv1D(num_chans, filt_size,
                        padding = 'same',
                        kernel_initializer = w_init,
                        activation = 'linear',
                        strides = 2)(tensor)
        branch = activation()(branch)
        branch = Conv1D(num_chans, filt_size,
                        padding = 'same',
                        kernel_initializer = w_init,
                        activation = 'linear')(branch)
        
        return Add()([pooled, branch])
    
    return f


# upsample the signal 2x
def upsample_block(num_chans = NCHAN, filt_size = FILT_SIZE):
    """Return a function that doubles the time axis with a residual connection.

    Shortcut: UpSampling1D(2).
    Branch: convolution to 2 * num_chans channels, PhaseShiftUp1D(2) to trade
    those channels for time steps, activation, then a final convolution.
    """
    def f(tensor):
        repeated = UpSampling1D(2)(tensor)
        
        branch = Conv1D(num_chans * 2, filt_size,
                        padding = 'same',
                        kernel_initializer = w_init,
                        activation = 'linear')(tensor)
        branch = PhaseShiftUp1D(2)(branch)
        branch = activation()(branch)
        branch = Conv1D(num_chans, filt_size,
                        padding = 'same',
                        kernel_initializer = w_init,
                        activation = 'linear')(branch)
        
        return Add()([repeated, branch])
    
    return f


# decrease number of channels from NCHAN to 1 via convolution
def channel_decrease_block(num_chans = NCHAN, filt_size = FILT_SIZE):
    """Return a function that collapses a multi-channel signal to one channel.

    Shortcut: the mean over channels (axes swapped, globally average-pooled,
    then reshaped back to a trailing channel axis of size 1).
    Branch: convolution -> activation -> single-filter convolution.
    """
    def f(tensor):
        channel_mean = Permute((2, 1))(tensor)
        channel_mean = GlobalAveragePooling1D()(channel_mean)
        channel_mean = Reshape((-1, 1))(channel_mean)
        
        branch = Conv1D(num_chans, filt_size,
                        padding = 'same',
                        kernel_initializer = w_init,
                        activation = 'linear')(tensor)
        branch = activation()(branch)
        branch = Conv1D(1, filt_size,
                        padding = 'same',
                        kernel_initializer = w_init,
                        activation = 'linear')(branch)

        return Add()([channel_mean, branch])
        
    return f

In [15]:
# ---------------------------------------------------------------------------
# autoencoder: takes an audio window, compresses it, and tries to reconstruct it
# ---------------------------------------------------------------------------
def autoencoder_structure(dim):
    """Build the encoder and decoder halves of the quantizing autoencoder.

    Args:
        dim: shape of one input window, without the batch axis.

    Returns:
        ``(enc, dec)``: two Keras models. `enc` maps a window to a flat,
        quantized code of length `bottleneck_size`; `dec` maps such a code
        back to a window.
    """
    # - - - - - - - - - - - - - - - - - - - - -
    # encoder
    # - - - - - - - - - - - - - - - - - - - - -
    enc_input = Input(shape = dim)
    # reshape to the same shape (no change to the data)
    enc = Reshape(dim, input_shape = dim)(enc_input)
    
    # increase number of channels via convolution
    enc = channel_increase_block()(enc)
    
    # downsampling blocks
    for _ in xrange(0, TIMES_DOWNSAMPLE): 
        enc = downsample_block()(enc)
        
    # residual blocks
    for _ in xrange(0, NUM_RES_BLOCKS):
        enc = residual_block()(enc)
    
    # decrease back down to 1 channel; tanh keeps the code inside [-1, 1]
    enc = channel_decrease_block()(enc)
    enc = Activation('tanh')(enc)
    
    enc = Reshape((bottleneck_size,))(enc)    
    
    # quantize the code onto NBINS levels (gradients pass straight through)
    #enc = Lambda(lambda x : K.clip(x, -1.0, 1.0))(enc)
    enc = Lambda(lambda x : CodeRound(NBINS)(x))(enc)
    
    # Keras 2 keyword names (the Keras 1 `input`/`output` spellings are deprecated)
    enc = Model(inputs = enc_input, outputs = enc)
    
    # - - - - - - - - - - - - - - - - - - - - -
    # decoder
    # - - - - - - - - - - - - - - - - - - - - -
    dec_input = Input(shape = (bottleneck_size,))    
    dec = Reshape((bottleneck_size, 1))(dec_input)
    
    # increase number of channels via convolution
    dec = channel_increase_block()(dec)
    
    # residual blocks
    for _ in xrange(0, NUM_RES_BLOCKS):
        dec = residual_block()(dec)
    
    # upsampling blocks
    for _ in xrange(0, TIMES_DOWNSAMPLE):
        dec = upsample_block()(dec)
    
    # decrease back down to 1 channel
    dec = channel_decrease_block()(dec)
    dec = Activation('tanh')(dec)
    #dec = Lambda(lambda x : K.clip(x, -1.0, 1.0))(dec)
    
    dec = Model(inputs = dec_input, outputs = dec)
    
    # return both encoder and decoder
    return enc, dec

In [16]:
def pesq_net_structure():
    """Build the siamese network that scores a (clean, degraded) window pair.

    Both inputs go through one shared convolutional sub-model; the two
    feature vectors are concatenated and reduced by two dense layers to a
    single sigmoid output.

    Returns:
        A Keras model with inputs ``[original, degraded]`` and one output.
    """
    NFILTS = 32
    FSIZE = 5
    DENSE_SIZE = 16
    
    def siamese_half():
        """Feature extractor shared by both inputs of the siamese pair."""
        inp = Input(shape = (WINDOW_SIZE, 1))
        ret = Reshape((WINDOW_SIZE, 1))(inp)
        
        ret = channel_increase_block(NFILTS, FSIZE)(ret)
        
        ret = downsample_block(NFILTS, FSIZE)(ret)
        ret = downsample_block(NFILTS, FSIZE)(ret)

        ret = residual_block(NFILTS, FSIZE)(ret)
        ret = residual_block(NFILTS, FSIZE)(ret)
        
        ret = channel_decrease_block(NFILTS, FSIZE)(ret)

        ret = Flatten()(ret)
        # `kernel_initializer` is the Keras 2 name of the deprecated `init`
        # keyword, matching the Conv1D calls elsewhere in this file
        ret = Dense(DENSE_SIZE, activation = 'linear',
                    kernel_initializer = w_init)(ret)

        return Model(inputs = inp, outputs = ret)

    input_orig = Input(shape = (WINDOW_SIZE, 1))
    input_dirty = Input(shape = (WINDOW_SIZE, 1))
    
    # one sub-model instance applied twice, so both branches share weights
    base_network = siamese_half()
    processed_a = base_network(input_orig)
    processed_b = base_network(input_dirty)
    
    out = Concatenate()([processed_a, processed_b])
    out = Dense(DENSE_SIZE, activation = 'linear',
                kernel_initializer = w_init)(out)
    out = activation()(out)
    out = Dense(1, activation = 'sigmoid', kernel_initializer = w_init)(out)
    
    model = Model(inputs = [input_orig, input_dirty], outputs = out)
    return model


# use the perceptual model saved on disk; the commented line below builds an
# untrained one from pesq_net_structure instead
pesq_model = load_model('pesq_model.h5')
#pesq_model = pesq_net_structure()
pesq_model.name = 'pesq'
# NOTE(review): index 2 is presumably the shared siamese sub-model (after the two
# input layers); discriminator_structure relies on the same index -- confirm if the
# saved architecture ever changes
pesq_model.layers[2].name = 'siamese'
pesq_model.compile(loss = rmse, optimizer = Adam(lr = 0.001))

In [17]:
# ---------------------------------------------------------------------------
# discriminator: tries to differentiate between original and reconstructed samples
#     inspired by PatchGAN in [that one paper]
# ---------------------------------------------------------------------------
def discriminator_structure(siamese_model, inp):
    """Attach a real-vs-reconstructed classification head to `inp`.

    The tensor is first passed through ``siamese_model.layers[2]`` (reused,
    not copied), then through a dense layer, the shared activation, and a
    final sigmoid unit.

    Args:
        siamese_model: model whose layer at index 2 is used as feature extractor.
        inp: input tensor to classify.

    Returns:
        The output tensor of the sigmoid unit. Every call creates new Dense
        layers; only the feature extractor is shared between calls.
    """
    DENSE_SIZE = 16
    
    siamese_transform = siamese_model.layers[2]
    
    out = siamese_transform(inp)
    
    # `kernel_initializer` is the Keras 2 name of the deprecated `init` keyword
    out = Dense(DENSE_SIZE, activation = 'linear',
                kernel_initializer = w_init)(out)
    out = activation()(out)
    
    out = Dense(1, activation = 'sigmoid', kernel_initializer = w_init)(out)
    
    return out

In [18]:
# construct autoencoder to be used in adversarial training
# autoencoder path: window -> quantized code -> reconstructed window
ac_input = Input(shape = input_dim)
ac_enc, ac_dec = autoencoder_structure(input_dim)
ac_embedding = ac_enc(ac_input)
ac_reconstructed = ac_dec(ac_embedding)

# construct discriminator: regular
# (dsc_input_dim is only referenced by the commented-out lines below)
dsc_input_dim = (WINDOW_SIZE, 1)
dsc_input = Input(shape = input_dim)
#dsc_struct = discriminator_structure(dsc_input_dim)
#dsc_label = dsc_struct(dsc_input)
# NOTE(review): each discriminator_structure call creates its own Dense layers, so
# dsc_label and ac_dsc_label share only the siamese feature extractor, not the
# classification head -- confirm this is intended for the adversarial loss
dsc_label = discriminator_structure(pesq_model, dsc_input)
ac_dsc_label = discriminator_structure(pesq_model, ac_reconstructed)

# construct perceptual model: scores the reconstruction against the original window
ac_pesq = pesq_model([ac_input, ac_reconstructed])

In [19]:
# ------------------------------------------------------------------
# PARZEN ENTROPY ESTIMATION
# ------------------------------------------------------------------
# the Parzen kernel is a zero-centered gaussian with bin-width standard deviation
# kernel width: 1 / (NBINS - 1)
std = (1.0 / (NBINS - 1))
# Gaussian normalization constant, using math.pi instead of a truncated literal
norm = 1.0 / math.sqrt(2.0 * math.pi * std * std)
den = (2.0 * std * std)

def parzen_kernel(x):
    """Evaluate the zero-centered Gaussian kernel (standard deviation `std`) at `x`."""
    num = K.square(x)
    return norm * K.exp(-num / den)

# we use 10,000 samples to create our entropy estimate
N = 10000
log_2 = math.log(2.0)
# bin centers spread evenly over [-1, 1], repeated N times along a second axis so
# they can be compared against up to N code samples at once
bins = K.variable(np.linspace(-1.0, 1.0, NBINS))
r_bins = K.repeat_elements(bins.reshape((NBINS, 1)), N, 1)

# we increase the weight of the entropy loss over time while
# training
entropy_weight = K.variable(0.0, name = 'entropy_weight')
max_entropy_weight = 1.0
entropy_weight_rate = 0.1

def entropy_estimate(placeholder, code):
    """Loss: soft-histogram entropy (in bits) of the code values in a batch.

    `placeholder` takes the position of the target in the Keras loss
    signature and is not used. At most N code values are taken from the
    flattened batch; each contributes a Parzen kernel response to every bin,
    the per-bin sums are normalized to a distribution, and its entropy is
    returned.
    """
    flat_code = K.flatten(code)
    # use however many samples the batch has, capped at N
    n_used = K.minimum(flat_code.shape[0], N)
    
    samples = flat_code[:n_used]
    tiled_samples = K.repeat_elements(samples.reshape((1, n_used)), NBINS, 0)

    # kernel response of every sample at every bin, summed per bin
    bin_mass = parzen_kernel(tiled_samples - r_bins[:, :n_used])
    bin_mass = K.sum(bin_mass, axis = 1)
    bin_mass = bin_mass / K.sum(bin_mass)

    # dividing by log(2) converts natural-log entropy to bits
    return -K.sum(bin_mass * K.log(bin_mass + K.epsilon()) / log_2)

In [20]:
# compile model
# one (weight, loss) pair per output of the combined model, in output order:
# reconstruction, perceptual score, discriminator label, code entropy
loss_weights = [500.0, 4.0, 1.0, 2.0]
loss_functions = [rmse, rmse, 'binary_crossentropy', entropy_estimate]
n_recons = 1
n_pesq = 1
n_discrim = 1
n_code = 1
assert(n_recons + n_pesq + n_discrim + n_code == len(loss_weights))
assert(len(loss_weights) == len(loss_functions))

def opti():
    """Return a fresh optimizer instance, so no two models share optimizer state."""
    return Adam(lr = 0.001)
    
# Keras 2 keyword names (the Keras 1 `input`/`output` spellings are deprecated)
autoencoder = Model(inputs = [ac_input], outputs = [ac_reconstructed])
autoencoder.compile(loss = rmse, optimizer = opti())

discriminator = Model(inputs = [dsc_input], outputs = [dsc_label])
discriminator.compile(loss = ['binary_crossentropy'], optimizer = opti())
discriminator.summary()

autoencoder.summary()

pesq_model.summary()

# combined model: a single input window, one output per loss term above
model = Model(inputs = [ac_input], outputs = [ac_reconstructed] * n_recons + \
                                             [ac_pesq] * n_pesq + \
                                             [ac_dsc_label] * n_discrim + \
                                             [ac_embedding] * n_code)
model.compile(loss = loss_functions,
              loss_weights = loss_weights,
              optimizer = opti())
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 512, 1)            0         
_________________________________________________________________
siamese (Model)              (None, 16)                86928     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
leaky_re_lu_13 (LeakyReLU)   (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 87,217.0
Trainable params: 87,217.0
Non-trainable params: 0.0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   


In [21]:
# interleave numpy arrays of the same size along the first axis
def interleave(arr):    
    """Interleave equally shaped arrays along the first axis.

    For inputs ``a, b, ...`` the result rows are ``a[0], b[0], ..., a[1],
    b[1], ...``. The result is a newly allocated array with NumPy's default
    dtype.
    """
    count = len(arr)
    first_shape = arr[0].shape
    
    # room for `count` times as many rows as a single input has
    merged = np.empty((first_shape[0] * count,) + tuple(first_shape[1:]))
    
    # input number k fills every count-th row, starting at row k
    for offset, chunk in enumerate(arr):
        merged[offset::count] = chunk
    
    return merged

In [22]:
# interface to PESQ evaluation, taking in two filenames as input
def run_pesq_filenames(clean, to_eval):
    """Run the external ./PESQ tool on two .wav files and return the MOS-LQO score.

    Args:
        clean: path of the reference file.
        to_eval: path of the degraded file.

    Returns:
        The score parsed from the tool's standard output as a float, or 0.0
        when no score can be found (including when the tool cannot be started).
    """
    # local import: the notebook's import cell does not pull in subprocess
    import subprocess
    
    pesq_regex = re.compile(r"\(MOS-LQO\):  = ([0-9]+\.[0-9]+)")
    
    # pass an argument list instead of a shell string so paths containing spaces
    # or shell metacharacters reach the tool unchanged
    try:
        proc = subprocess.Popen(["./PESQ", "+16000", "+wb", clean, to_eval],
                                stdout = subprocess.PIPE,
                                universal_newlines = True)
        pesq_out = proc.communicate()[0]
    except OSError:
        # tool missing or not executable: same outcome as unparseable output
        return 0.0
    
    regex_result = pesq_regex.search(pesq_out)
    
    if (regex_result is None):
        return 0.0
    else:
        return float(regex_result.group(1))
    
# interface to PESQ evaluation, taking in two waveforms as input
def run_pesq_waveforms(clean_wav, dirty_wav):
    """Score two waveforms with PESQ by writing them to temporary .wav files.

    Both waveforms are cast to int16 and written at SAMPLE_RATE to fixed file
    names in the working directory; the files are removed again afterwards,
    even if writing or scoring raises.

    Returns:
        The value returned by run_pesq_filenames for the two files.
    """
    clean_path = "./clean.wav"
    dirty_path = "./dirty.wav"
    
    try:
        # compute PESQ between original and corrupted waveforms
        # (sciwav is expected to come from one of the helper-module star imports)
        sciwav.write(clean_path, SAMPLE_RATE, clean_wav.astype(np.int16))
        sciwav.write(dirty_path, SAMPLE_RATE, dirty_wav.astype(np.int16))
        pesq = run_pesq_filenames(clean_path, dirty_path)
    finally:
        # best-effort cleanup without spawning a shell
        for path in (clean_path, dirty_path):
            try:
                os.remove(path)
            except OSError:
                # file was never written or is already gone
                pass
    
    return pesq
    
# interface to PESQ evaluation, taking in two sets of windows as input
def run_pesq_windows(clean_wnd, dirty_wnd, wparam1, wparam2):
    """Score two window sets with PESQ after rebuilding their waveforms.

    Each window set is reshaped to (-1, WINDOW_SIZE), stitched into a
    waveform, rescaled with its own preprocessing parameters (`wparam1` for
    the clean set, `wparam2` for the dirty set) and clipped to +/-32767.
    """
    def to_waveform(wnd, wparam):
        # windows -> waveform at original volume, limited to +/-32767
        flat = np.reshape(wnd, (-1, WINDOW_SIZE))
        wav = reconstructFromWindows(flat, OVERLAP_SIZE, OVERLAP_FUNC)
        wav = unpreprocess_waveform(wav, wparam)
        return np.clip(wav, -32767, 32767)

    clean_wav = to_waveform(clean_wnd, wparam1)
    dirty_wav = to_waveform(dirty_wnd, wparam2)
    
    return run_pesq_waveforms(clean_wav, dirty_wav)

In [23]:
# create pairs for the discriminator, given the autoencoder and a batch
def create_discrim_pairs(autoencoder, batch):
    """Build a labelled real/reconstructed batch for the discriminator.

    Returns ``(X, y)`` where X alternates original windows with their
    autoencoder reconstructions and y alternates the labels 1 (original) and
    0 (reconstruction).
    """
    n_real = batch.shape[0]
    reconstructed = autoencoder.predict(batch)
    
    samples = interleave([batch, reconstructed])
    labels = interleave([np.ones(n_real), np.zeros(n_real)])
    
    return samples, labels

In [24]:
# test discriminator, given the autoencoder and a set of samples (speech windows, not
# necessarily in any order)
def test_discriminator(discriminator, autoencoder, orig_samples, verbose = True):
    X, y = create_discrim_pairs(autoencoder, orig_samples)
    
    # verify discriminator was trained properly
    y_hat = discriminator.predict(X)

    y_hat[y_hat >= 0.5] = 1
    y_hat[y_hat < 0.5] = 0
    
    n_total = y.shape[0]
    n_correct = np.sum(np.ravel(y_hat) == y)

    acc = n_correct * 100.0 / n_total
    
    if (verbose):
        print "Discriminator evaluation: %0.02f"%(acc)
    return acc

In [25]:
# test model on a set of speech windows (which should originally have been extracted in
# order from some speech waveform)
def test_model_on_windows(orig_windows, wparams, autoencoder):
    """Compare the autoencoder's reconstruction of one utterance with the original.

    Args:
        orig_windows: windows of one waveform.
        wparams: preprocessing parameters of that waveform.
        autoencoder: model whose ``predict`` maps windows to reconstructed windows.

    Returns:
        ``(metrics, desired, recons)`` where metrics is
        ``[mse, avgErr, pesq]`` and the other two are the reference and
        reconstructed waveforms at original volume.
    """
    def to_waveform(windows):
        # windows -> waveform at original volume, limited to +/-32767
        wav = reconstructFromWindows(windows, OVERLAP_SIZE, OVERLAP_FUNC)
        wav = unpreprocess_waveform(wav, wparams)
        return np.clip(wav, -32767, 32767)

    # reference: the original windows stitched back together
    desired = to_waveform(orig_windows)
    
    # model output: run the windows through the network, then stitch
    net_input = np.reshape(orig_windows, (orig_windows.shape[0], WINDOW_SIZE, 1))
    net_output = autoencoder.predict(net_input, batch_size = 128, verbose = 0)
    net_output = np.reshape(net_output, (net_output.shape[0], WINDOW_SIZE))
    recons = to_waveform(net_output)
    
    # perceptual score between reference and reconstruction
    pesq = run_pesq_waveforms(desired, recons)
    
    scores = [mse(recons, desired), avgErr(recons, desired), pesq]
    
    return scores, desired, recons

In [26]:
# test model given the filename for a .wav file
def test_model_on_wav(wave_filename, prefix, autoencoder,
                      lead = "", save_recons = True, verbose = True):
    [rate, data] = sciwav.read(wave_filename)
    data = data.astype(np.float32)
    processed_wave, wparams = preprocess_waveform(data)
    windows = extractWindows(processed_wave, STEP_SIZE, OVERLAP_SIZE)
    
    metrics, desired, recons = test_model_on_windows(windows, wparams, autoencoder)
    
    if (save_recons):
        outFilename = prefix + "_output.wav"
        sciwav.write(outFilename, SAMPLE_RATE, recons.astype(np.int16))
    
    if (verbose):
        print lead + "MSE:        ", metrics[0]
        print lead + "Avg err:    ", metrics[1]
        print lead + "PESQ:       ", metrics[2]
        
    return metrics

In [27]:
def test_amr_on_wav(wave_filename):
    """Return the PESQ score of an AMR-WB encode/decode round trip of a .wav file.

    ffmpeg encodes the file to ``<wave_filename>.amr`` and decodes it to
    ``<wave_filename>.out.wav``; PESQ then compares the decoded file with the
    original. Both intermediate files are removed afterwards, even if a step
    raises.
    """
    # local import: the notebook's import cell does not pull in subprocess
    import subprocess
    
    amr_fname = wave_filename + '.amr'
    out_fname = wave_filename + '.out.wav'
    
    # argument lists instead of shell strings, so paths with spaces or shell
    # metacharacters are passed through unchanged
    command_encode = ['ffmpeg', '-i', wave_filename, '-ar', '16000',
                      '-ab', '15.85k', '-acodec', 'libvo_amrwbenc', amr_fname]
    command_decode = ['ffmpeg', '-i', amr_fname, out_fname]
    
    def run_best_effort(command):
        # like os.system, a failing or missing tool does not raise here
        try:
            subprocess.call(command)
        except OSError:
            pass
    
    try:
        run_best_effort(command_encode)
        run_best_effort(command_decode)
        
        pesq = run_pesq_filenames(wave_filename, out_fname)
    finally:
        for path in (amr_fname, out_fname):
            try:
                os.remove(path)
            except OSError:
                # file was never created or is already gone
                pass
    
    return pesq

In [28]:
def evaluate_training(autoencoder, lead = ""):
    train_eval_idxs = random.sample(range(0, len(train_windows) - 1), TRAIN_EVALUATE)
    val_eval_idxs = random.sample(range(0, len(val_windows) - 1), VAL_EVALUATE)
    
    train_metrics = []
    for idx in train_eval_idxs:
        windows = train_windows[idx]
        wparams = train_wparams[idx]
        metrics, _, _ = test_model_on_windows(windows, wparams, autoencoder)
        
        train_metrics.append(metrics)
        
    val_metrics = []
    for idx in val_eval_idxs:
        windows = val_windows[idx]
        wparams = val_wparams[idx]
        metrics, _, _ = test_model_on_windows(windows, wparams, autoencoder)
        
        val_metrics.append(metrics)
    
    train_metrics = np.array(train_metrics)
    val_metrics = np.array(val_metrics)
    
    print lead + "Format: [MSE, avg err, PESQ]"
    print lead + "    Train: (mean)", np.mean(train_metrics, axis = 0)
    print lead + "    Train: (max) ", np.max(train_metrics, axis = 0)
    print lead + "    Train: (min) ", np.min(train_metrics, axis = 0)
    print lead + "    Val:   (mean)", np.mean(val_metrics, axis = 0)
    print lead + "    Val:   (max) ", np.max(val_metrics, axis = 0)
    print lead + "    Val:   (min) ", np.min(val_metrics, axis = 0)
    
    # returns mean PESQ on validation
    return np.mean(val_metrics, axis = 0)[2]

In [29]:
# sanity check: score the autoencoder on a single file before any training has
# happened in this notebook
test_model_on_wav("./SA1.wav", "./train_output/SA1_res_uninit_", autoencoder)

MSE:         2155.64
Avg err:     16.9344
PESQ:        2.74


[2155.6431, 16.934383, 2.74]

In [30]:
RUN_AMR_BASELINE = False

# AMR baseline (training)
# Mean: 4.00786
# Max:  4.431
# Min:  3.062
#
# AMR baseline (validation)
# Mean: 4.16782
# Max:  4.436
# Min:  3.722

if (RUN_AMR_BASELINE):
    print "AMR baseline (training)"
    
    eval_paths = random.sample(train_paths, 200)
    base_scores = []
    for path in eval_paths:
        pesq = test_amr_on_wav(path)
        base_scores.append(pesq)
    
    print "    Mean:", np.mean(base_scores)
    print "    Max: ", np.max(base_scores)
    print "    Min: ", np.min(base_scores)
    print ""
    
    print "AMR baseline (validation)"
    
    eval_paths = random.sample(val_paths, 100)
    base_scores = []
    for path in eval_paths:
        pesq = test_amr_on_wav(path)
        base_scores.append(pesq)
    
    print "    Mean:", np.mean(base_scores)
    print "    Max: ", np.max(base_scores)
    print "    Min: ", np.min(base_scores)
    print ""

In [31]:
def save_model():
    """Save the combined model, the autoencoder and the discriminator to disk.

    Existing files are removed first. After saving, the 'optimizer_weights'
    entry is deleted from the combined model's file if it is present.
    """
    targets = (('./best_model.h5', model),
               ('./best_auto.h5', autoencoder),
               ('./best_discrim.h5', discriminator))
    
    # remove stale files without spawning a shell
    for path, _ in targets:
        if os.path.exists(path):
            os.remove(path)
    
    for path, net in targets:
        net.save(path)

    # NOTE(review): presumably stripped so the combined model can be reloaded
    # without restoring optimizer state -- confirm. The context manager closes
    # the file even if the deletion fails.
    with h5py.File('./best_model.h5', 'r+') as f:
        if 'optimizer_weights' in f:
            del f['optimizer_weights']

In [32]:
# different types of noise
def identity(window, param):
    """No-op corruption: return `window` itself; `param` is ignored."""
    return window

def additive_noise(window, param):
    """Return a copy of `window` plus uniform noise in [-param, param), clipped to [-1, 1]."""
    noisy = np.copy(window)
    jitter = np.random.uniform(-param, param, noisy.shape)
    # in-place add keeps the dtype of the copy
    noisy += jitter
    return np.clip(noisy, -1.0, 1.0)

def mult_noise(window, param):
    """Return a copy of `window` scaled by Gaussian factors (mean 1, std `param`), clipped to [-1, 1]."""
    noisy = np.copy(window)
    gains = np.random.normal(1.0, param, noisy.shape)
    # in-place multiply keeps the dtype of the copy
    noisy *= gains
    return np.clip(noisy, -1.0, 1.0)
    
def _band_additive_noise(window, param, upper_half):
    """Add uniform noise to one half of the DCT coefficients of every window.

    `window` is indexed as (window, sample, ...); the transform runs along
    axis 1. The same noise values are applied to every window. The input is
    not modified; the result is clipped to [-1, 1].
    """
    corrupted = np.copy(window)
    split = corrupted.shape[1] // 2
    
    # transform along the sample axis (axis 1); the default last axis has length 1
    # for (n, samples, 1) input, which would make the transform a no-op
    spectrum = dct(corrupted, norm = 'ortho', axis = 1)
    
    # basic slicing yields a view, so the in-place add updates `spectrum`
    band = spectrum[:, split:] if upper_half else spectrum[:, :split]
    band += np.random.uniform(-param, param, band.shape[1:])
    
    corrupted = idct(spectrum, norm = 'ortho', axis = 1)
    corrupted = np.clip(corrupted, -1.0, 1.0)
    return corrupted

def high_freq_additive_noise(window, param):
    """Add uniform noise in [-param, param) to the upper half of each window's DCT spectrum."""
    return _band_additive_noise(window, param, True)

def low_freq_additive_noise(window, param):
    """Add uniform noise in [-param, param) to the lower half of each window's DCT spectrum."""
    return _band_additive_noise(window, param, False)

# candidate corruptions as (noise function, [strength parameters]) pairs;
# generate_pesq_traindata picks one function and one of its strengths at random.
# `identity` is defined above but not listed here.
noise_types = [
               (additive_noise,
                   [1.0 / 1024, 1.0 / 256, 1.0 / 64]),
               (mult_noise,
                   [1.0 / 64, 1.0 / 16, 1.0 / 8]),
               (high_freq_additive_noise,
                   [1.0 / 512, 1.0 / 128, 1.0 / 32]),
               (low_freq_additive_noise,
                   [1.0 / 512, 1.0 / 128, 1.0 / 32])
              ]

In [33]:
def update_train_structure(X, y, wnd, cor, prm):
    """Append one (clean, corrupted) utterance to the perceptual training lists.

    PESQ is computed once for the whole utterance, mapped to [0, 1] via
    ``(pesq - 1.0) / 3.5`` with clipping, and used as the target of every
    window pair. X (a pair of lists) and y (a list) are extended in place and
    also returned.
    """
    utterance_pesq = run_pesq_windows(wnd, cor, prm, prm)
    target = np.clip((utterance_pesq - 1.0) / 3.5, 0.0, 1.0)
    
    n_windows = wnd.shape[0]
    X[0].extend(wnd[i] for i in range(n_windows))
    X[1].extend(cor[i] for i in range(n_windows))
    y.extend([target] * n_windows)
        
    return X, y

def generate_pesq_traindata():
    """Build one training set for the perceptual model from a single random training waveform.

    Three groups of (original window, degraded window) pairs are produced, in order:
      1. original vs. the autoencoder's reconstruction, target from update_train_structure
      2. original vs. one randomly chosen corruption from `noise_types`, same targeting
      3. original vs. itself, target 1.0

    Returns (X, y): X is [originals, degraded] (two parallel lists of windows) and y
    holds one target per pair.
    """
    inputs = [[], []]
    targets = []

    # get random waveform from training set
    pick = random.randint(0, TRAIN_SIZE - 1)
    clean = train_windows[pick]
    wave_prm = train_wparams[pick]

    # group 1: autoencoder prediction
    clean = np.reshape(clean, (-1, WINDOW_SIZE, 1))
    decoded = autoencoder.predict(clean, verbose = 0)
    inputs, targets = update_train_structure(inputs, targets, clean, decoded, wave_prm)

    # group 2: random type of noise, at random amount
    noise_func, noise_levels = random.choice(noise_types)
    noisy = noise_func(clean, random.choice(noise_levels))
    inputs, targets = update_train_structure(inputs, targets, clean, noisy, wave_prm)

    # group 3: identical pairs always get the best possible target
    num_windows = clean.shape[0]
    inputs[0].extend(clean[k] for k in range(num_windows))
    inputs[1].extend(clean[k] for k in range(num_windows))
    targets.extend([1.0] * num_windows)

    return inputs, targets

# initial training set for the perceptual model; the training loop below replaces
# it with a freshly generated one every 10 batches
pesq_train_X, pesq_train_y = generate_pesq_traindata()

In [None]:
# work on a private copy of the preprocessed training windows
X_train = np.array(train_processed, copy = True)
# number of training windows (length of the first axis)
ntrain = len(X_train)

In [None]:
# print floats in fixed-point notation when numpy arrays are shown
# NOTE(review): '{:4f}' means minimum width 4 with the default 6 decimals;
# '{:.4f}' (4 decimals) may have been intended — confirm
np.set_printoptions(formatter={'float_kind':'{:4f}'.format})

# windows per batch, and number of passes over the training windows
BATCH_SIZE = 128
# not referenced by the training loop below
NUM_BATCHES = ntrain / BATCH_SIZE
NUM_EPOCHS = 200

# when DSC_CLIP_WEIGHTS is set, every discriminator weight is clipped to
# [-DSC_CLAMP_RANGE, DSC_CLAMP_RANGE] before the discriminator is trained on a batch
DSC_CLIP_WEIGHTS = False
DSC_CLAMP_RANGE = 0.01
# number of discriminator updates per batch
DSC_TIMES_TRAIN = 1

# number of perceptual-model updates per batch
PESQ_TIMES_TRAIN = 1

# indentation prefix for log lines, and initial values for the loss/accuracy trackers
lead = "    "
d_loss = 0.0
a_losses = []
d_acc = 0.0

# best validation score seen so far; the models are saved whenever it improves
best_val_pesq = 0.0

for epoch in range(NUM_EPOCHS):
    print "Epoch " + str(epoch + 1) + ":"

    # present batches randomly each epoch
    lis = range(0, ntrain, BATCH_SIZE)
    random.shuffle(lis)
    
    # keep track of start time and current batch #
    i = 0
    startTime = time.time()
    for idx in lis:
        batch = X_train[idx:idx+BATCH_SIZE, :,  :]
        nbatch = batch.shape[0]
        
        a_losses = ["no auto"]
        d_loss = "no discrim"
        p_loss = "no pesq"
        
        make_trainable(autoencoder, False)
        
        # train perceptual model
        if (n_pesq > 0):
            make_trainable(discriminator, False)
            make_trainable(pesq_model, True)
        
            for k in xrange(0, PESQ_TIMES_TRAIN):
                p_loss = pesq_model.train_on_batch([np.array(pesq_train_X[0]), np.array(pesq_train_X[1])],
                                                   np.array(pesq_train_y))
        
        # train discriminator
        if (n_discrim > 0):
            make_trainable(pesq_model, False)
            make_trainable(discriminator, True)
            
             # clip discriminator weights, if necessary
            if (DSC_CLIP_WEIGHTS):
                for l in discriminator.layers:
                    weights = l.get_weights()
                    weights = [np.clip(w, -DSC_CLAMP_RANGE, DSC_CLAMP_RANGE) for w in weights]
                    l.set_weights(weights)
            
            discrim_batch_X, discrim_batch_y =  create_discrim_pairs(autoencoder, batch)
            for k in xrange(0, DSC_TIMES_TRAIN):
                d_loss = discriminator.train_on_batch(discrim_batch_X, discrim_batch_y)  
        
        # train autoencoder ("generator")
        make_trainable(autoencoder, True)
        make_trainable(pesq_model, False)
        make_trainable(discriminator, False)
        
        a_y = [batch] * n_recons + \
              [np.ones(nbatch)] * n_pesq + \
              [np.ones(nbatch)] * n_discrim + \
              [np.zeros(nbatch)] * n_code
        a_losses = model.train_on_batch(batch, a_y)        
        
        # print statistics every 10 batches so we know stuff is still going down
        if (i % 10 == 0):
            printStr = "        \r" + lead + str(i * BATCH_SIZE) + ": " + \
                                             str(d_loss) + " " + \
                                             str(p_loss) + " "
            print printStr,
            
            loss_arr = np.asarray(a_losses)
            print loss_arr,
            
            if (len(loss_weights) > 1 and len(loss_arr) > 1):
                for w in xrange(0, len(loss_weights)):
                    loss_arr[w + 1] *= loss_weights[w]
                print loss_arr,
                
            pesq_train_X, pesq_train_y = generate_pesq_traindata()
            
        i += 1
    print ""
    
    # print elapsed time for epoch
    elapsed = time.time() - startTime
    print lead + "Total time for epoch: " + str(elapsed) + "s"   
    
    # ---------------------------------------------------------
    # evaluate discriminator on random samples every epoch
    # ---------------------------------------------------------
    startTime = time.time()
    print lead + "----------------"
    if (n_discrim > 0):
        NUM = 200
        rows = np.random.randint(X_train.shape[0], size = NUM)
        d_acc = test_discriminator(discriminator, autoencoder,
                                   X_train[rows, :], verbose = False)

        print lead + "Evaluated the discriminator: " + str(d_acc)
        elapsed = time.time() - startTime
        print lead + "Total time for evaluation: " + str(elapsed) + "s"
    else:
        print lead + "No discriminator"
    
    
    # ---------------------------------------------------------
    # generate code histogram from said random samples
    # ---------------------------------------------------------
    NUM = 200
    rows = np.random.randint(X_train.shape[0], size = NUM)
    code = ac_enc.predict(X_train[rows, :], verbose = 0)
    
    print lead + "----------------"
    print lead + "Code histogram:"
    scalars = code.flatten()
    
    b = np.linspace(-1.0, 1.0, NBINS + 1)
    hist = np.histogram(scalars, bins = b)
    sample_hist_probs = hist[0].astype('float32')
    sample_hist_bins = hist[1].astype('float32')
    sample_hist_probs /= np.sum(sample_hist_probs)

    entropy = 0
    for i in sample_hist_probs:
        if (i < 1e-4): continue
        entropy += i * math.log(i, 2)
    entropy = -entropy
    
    zero_prob = sample_hist_probs[NBINS / 2]
    zero_prob = np.clip(zero_prob, 0.001, 0.999)
    mask_entropy = -(zero_prob * math.log(zero_prob, 2) + (1.0 - zero_prob) * math.log(1.0 - zero_prob, 2))
    
    print "       Entropy:", entropy
    print "       Zero prob:", sample_hist_probs[NBINS / 2]
    print "       Mask entropy:", mask_entropy
    print "       Pct. in last bins:", sample_hist_probs[0] + sample_hist_probs[-1]
    
    nnz = 0.0
    for i in xrange(0, code.shape[0]):
        r = np.round(code[i] * 1000.0) / 1000.0
        nnz += np.count_nonzero(r)
    nnz /= code.shape[0]
    print "       Avg # nonzero elts:", nnz
    
    # ---------------------------------------------------------
    # evaluate autoencoder on training/validation data evey epoch
    # ---------------------------------------------------------
    startTime = time.time()
    print lead + "----------------"
    
    print lead + "Evaluating autoencoder..."
    val_pesq = evaluate_training(autoencoder, lead)
    if (val_pesq > best_val_pesq):
        print lead + "NEW best model! Validation mean-PESQ", val_pesq
        print lead + "Saving model..."
        save_model()
        best_val_pesq = val_pesq
    else:
        print lead + "Best validation mean-PESQ seen:", best_val_pesq
    
    metrics = test_model_on_wav("./SA1.wav", "./train_output/SA1_res_reg_train_epoch" + str(epoch+1),
                              autoencoder, lead = lead, verbose = False)
    print lead + "SA1:        ", metrics
    metrics_tst = test_model_on_wav("./SX383.wav", "./train_output/SX383_res_reg_train_epoch" + str(epoch+1),
                                  autoencoder, lead = lead, verbose = False)
    print lead + "SX383:      ", metrics_tst
    
    elapsed = time.time() - startTime
    print lead + "Total time for evaluation: " + str(elapsed) + "s"
    
    # ---------------------------------------------------------
    # decrease the learning rate every so many epochs
    # ---------------------------------------------------------
    if ((epoch + 1) % 50 == 0):
        old_rate = model.optimizer.lr.get_value()
        new_rate = old_rate / 4.0
        new_rate = new_rate.astype('float32')
        model.optimizer.lr.set_value(new_rate)
        
        print lead + "Changed learning rate from", old_rate, "to", new_rate
    
    # ---------------------------------------------------------
    # update entropy loss weight every epoch
    # ---------------------------------------------------------
    if (n_code > 0 and (epoch + 1) >= 5):
        v = K.get_value(entropy_weight)
        
        if (v < max_entropy_weight):
            v += entropy_weight_rate
            print lead + "Updated entropy constraint weight:", v
        else:
            v = max_entropy_weight
            print lead + "Didn't update entropy constraint weight:", v
        
        K.set_value(entropy_weight, v)

Epoch 1:
    11520: 0.732887029648 0.381567984819  [11.705811 0.004893 0.002024 0.000009 4.625633] [11.705811 2.446440 0.008096 0.000009 9.251266]

In [None]:
# Disabled snippet, kept as a bare string so it does not execute: it would reload
# model, autoencoder and discriminator from the 'best_*.h5' files, passing the
# custom objects listed in `objs` to load_model.
'''objs = {'PhaseShiftUp1D' : PhaseShiftUp1D,
        'CodeRound' : CodeRound,
        'NBINS' : NBINS,
        'entropy_estimate' : entropy_estimate,
        'rmse' : rmse}

model = load_model('best_model.h5', objs)
autoencoder = load_model('best_auto.h5', objs)
discriminator = load_model('best_discrim.h5', objs)
'''

In [None]:
# layer lists of the two sub-models nested at positions 1 and 2 of `model`
# NOTE(review): presumably position 1 is the encoder and position 2 the decoder —
# confirm against the cell that builds `model`
enc = model.layers[1].layers
dec = model.layers[2].layers

In [None]:
dsc_layers = discriminator.layers[1].layers

print "--- Discriminator layers ---"

i = 0
for l in dsc_layers:
    if type(l) is Convolution1D or type(l) is Dense:
        i += 1
        if type(l) is Convolution1D:
            print "Conv layer", i
        else:
            print "Dense layer", i
        w = l.weights[0].eval()
        print "    Avg weight norm:", np.mean(np.abs(w))
        print "    Max weight norm:", np.max(np.abs(w))
        
        if (len(l.weights) == 1): continue
        b = l.weights[1].eval()
        print "    Avg bias norm:", np.mean(np.abs(b))
        print "    Max bias norm:", np.max(np.abs(b))

In [None]:
# evaluate the discriminator on NUM training windows drawn at random (with replacement)
NUM = 400
rows = np.random.randint(0, X_train.shape[0], NUM)

d_acc = test_discriminator(discriminator, autoencoder, X_train[rows], verbose = True)

In [None]:
# run the trained autoencoder on three wav files with test_model_on_wav's default
# lead/verbose settings; the value of the last call is what the cell displays
# NOTE(review): the second argument looks like a prefix for the output files — confirm
test_model_on_wav("./SA1.wav", "SA1_final_", autoencoder)
test_model_on_wav("./SX383.wav", "SX383_final_", autoencoder)
test_model_on_wav("./fiveYears.wav", "fy_final_", autoencoder)

In [None]:
# encode the first 10000 training windows; the cells below analyse these codes
all_embed = ac_enc.predict(X_train[:10000], batch_size = BATCH_SIZE, verbose = 1)

In [None]:
# all code values as one flat vector
scalars = all_embed.flatten()
# log of the values after shifting/scaling with (x + 1) / 2 (this maps [-1, 1] onto [0, 1]);
# not referenced by the cells below
log_scalars = np.log((scalars + 1.0) / 2.0)

In [None]:
# mean and variance of the flattened code values
print np.mean(scalars)
print np.var(scalars)

In [None]:
hist = np.histogram(scalars, bins = np.linspace(-1.0, 1.0, NBINS + 1))
sample_hist_probs = hist[0].astype('float32')
sample_hist_bins = hist[1].astype('float32')
sample_hist_probs /= np.sum(sample_hist_probs)

sample_hist_width = 1 * (sample_hist_bins[1] - sample_hist_bins[0])
sample_hist_centers = (sample_hist_bins[:-1] + sample_hist_bins[1:]) / 2
plt.bar(sample_hist_centers, sample_hist_probs, align='center', width=sample_hist_width)
plt.show()

entropy = 0
for i in sample_hist_probs:
    if (i < 1e-4): continue
    entropy += i * math.log(i, 2)
entropy = -entropy
print "Entropy of distribution:", entropy

In [None]:
# load one wav file, preprocess it and cut it into overlapping windows
# NOTE(review): `sciwav` is not imported in the notebook's import cell; presumably it
# comes in through one of the `from ... import *` lines — confirm
[rate, data] = sciwav.read("./SA1.wav")
data = data.astype(np.float32)
processedWave, wparams = preprocess_waveform(data)
windows = extractWindows(processedWave, STEP_SIZE, OVERLAP_SIZE)

# add a trailing channel axis, (num_windows, WINDOW_SIZE, 1), then encode every window
transformed = np.reshape(windows, (windows.shape[0], WINDOW_SIZE, 1))
embed = ac_enc.predict(transformed, batch_size = BATCH_SIZE, verbose = 1)

In [None]:
# decode the codes computed in the previous cell
recons = ac_dec.predict(embed, batch_size = BATCH_SIZE, verbose = 1)

In [None]:
idx = 44
print np.count_nonzero(embed[idx]), "nonzero"
print embed[idx]

print len(scalars)
print np.count_nonzero((abs(scalars) > 1).astype('int'))

In [None]:
# compare one window with its reconstruction
idx = 56

orig = windows[idx].flatten()
recn = recons[idx].flatten()

# original window; remember its y-range before the figure is shown
plt.plot(orig)
ylim = plt.gca().get_ylim()
plt.show()

# reconstruction, drawn with the same y-range so the two plots are comparable
plt.plot(recn)
plt.ylim(ylim)
plt.show()

# absolute error per sample
plt.plot(abs(orig - recn))
plt.show()