In [1]:
import numpy as np
import tensorflow as tf

import tensorflow.contrib.slim as slim
from tensorflow.python import pywrap_tensorflow

In [4]:
def get_variables_from_checkpoint_file(file_name):
    variables = []
    reader = pywrap_tensorflow.NewCheckpointReader(file_name)

    var_to_shape_map = reader.get_variable_to_shape_map()
    for key in sorted(var_to_shape_map):
        variables.append((key, var_to_shape_map[key]))

    return variables

In [5]:
from os import listdir
from os.path import isfile, join
import ntpath

data_dir = '/home/scpark/ai/datasets/waveglow-datasets/jeon'
data_files = [join(data_dir, f) for f in listdir(data_dir) if isfile(join(data_dir, f)) and '.npz' in f]
print(len(data_files))

10506


In [6]:
BATCH_AXIS = 0
TIME_AXIS = 1
CHANNEL_AXIS = 2


In [7]:
def conv1d(inputs, filters, kernel_size=1, dilation_rate=1, zero_init=False, name='conv1d', reuse=False):
    
    with tf.variable_scope(name, reuse=reuse):
        
        x = inputs[:, None, :, :]
        
        V_initializer = tf.constant_initializer(0.) if zero_init else tf.random_normal_initializer(0, 0.05)
        V = tf.get_variable('V', [1, kernel_size, int(inputs.get_shape()[-1]), filters], tf.float32, 
                            initializer=V_initializer, trainable=True)
        g = tf.get_variable('g', [filters], dtype=tf.float32, initializer=tf.constant_initializer(1.), trainable=True)
        b = tf.get_variable('b', [filters], dtype=tf.float32, initializer=tf.constant_initializer(0.), trainable=True)
        
        # use weight normalization (Salimans & Kingma, 2016)
        W = tf.reshape(g, [1, 1, 1, filters]) * tf.nn.l2_normalize(V, [0, 1, 2])

        # calculate convolutional layer output
        x = tf.nn.bias_add(tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='VALID', dilations=[1, 1, dilation_rate, 1]), b)
        x = x[:, 0, :, :]
        
        return x

In [16]:
from tensorflow.contrib.framework.python.ops import add_arg_scope

# Invertible 1x1 conv
@add_arg_scope
def invertible_1x1_conv(name, z, c, reverse=False):

    if True:  # Set to "False" to use the LU-decomposed version

        with tf.variable_scope(name):

            shape = tf.shape(z)
            w_shape = [c, c]

            # Sample a random orthogonal matrix:
            w_init = np.linalg.qr(np.random.randn(
                *w_shape))[0].astype('float32')
            
            if np.linalg.det(w_init) < 0:
                w_init[:, 0] = -1. * w_init[:, 0]

            w = tf.get_variable("W", dtype=tf.float32, initializer=w_init)

            # dlogdet = tf.linalg.LinearOperator(w).log_abs_determinant() * shape[1]*shape[2]
            dlogdet = tf.cast(tf.log(abs(tf.matrix_determinant(
                tf.cast(w, 'float64')))), 'float32') * tf.cast(shape[0] * shape[1] * shape[2], tf.float32)

            if not reverse:

                _w = tf.reshape(w, [1, 1] + w_shape)
                z = tf.nn.conv2d(z, _w, [1, 1, 1, 1],
                                 'SAME', data_format='NHWC')

                return z, dlogdet
            else:

                _w = tf.matrix_inverse(w)
                _w = tf.reshape(_w, [1, 1]+w_shape)
                z = tf.nn.conv2d(z, _w, [1, 1, 1, 1],
                                 'SAME', data_format='NHWC')

                return z, dlogdet

            
def Invertible1x1Conv(z, c, reverse=False, name='inv1x1conv', reuse=False):
    
    with tf.variable_scope(name, reuse=reuse):
        z = z[:, None, :, :]
        z, logdet = invertible_1x1_conv(name, z, c, reverse=reverse)
        z = z[:, 0, :, :]
        
        if reverse:
            return z
        else:
            return z, logdet    

In [17]:
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    in_act = input_a + input_b
    t_act = tf.tanh(in_act[:, :, :n_channels])
    s_act = tf.sigmoid(in_act[:, :, n_channels:])
    acts = t_act * s_act
    
    return acts


In [18]:
def WN(audio, spect, n_channels, n_layers, kernel_size, name='wavenet', reuse=False):
    
    with tf.variable_scope(name, reuse=reuse):
        n_in_channels = int(audio.get_shape()[-1])
        
        audio = conv1d(audio, n_channels, name='start')
    
        for i in range(n_layers):
            
            with tf.variable_scope(str(i), reuse=tf.AUTO_REUSE):
                
                dilation = 2 ** i
                padding = int((kernel_size * dilation - dilation) / 2)
                audio_padded = tf.pad(audio, [[0, 0], [padding, padding], [0, 0]])
                
                in_acts = conv1d(audio_padded, 2 * n_channels, kernel_size, dilation, name='in_layer')
                cond_acts = conv1d(spect, 2 * n_channels, name='cond_layer')
                acts = fused_add_tanh_sigmoid_multiply(in_acts, cond_acts, n_channels)
                
                res_skip_channels = 2 * n_channels if i < n_layers - 1 else n_channels
                res_skip_acts = conv1d(acts, res_skip_channels, name='res_skip_layer')
                
                if i < n_layers - 1:
                    audio = res_skip_acts[:, :, :n_channels] + audio
                    skip_acts = res_skip_acts[:, :, n_channels:]
                else:
                    skip_acts = res_skip_acts
                    
                if i == 0:
                    output = skip_acts
                else:
                    output = skip_acts + output

        # Zero Initialization
        # Glow : Generative Flow with Invertible 1x1 Convolutions
        output = tf.layers.conv1d(output, filters=2 * n_in_channels,
                                kernel_size=1, 
                                kernel_initializer=tf.zeros_initializer(), 
                                bias_initializer=tf.zeros_initializer(), reuse=reuse)
        output_a, output_b = tf.split(output, num_or_size_splits=2, axis=CHANNEL_AXIS)
        
        return output_a, output_b
                
        

In [19]:
def upsample(spect, n_mel_channels, kernel_size, stride, name='upsample', reuse=False):
    
    with tf.variable_scope(name, reuse=reuse):
        # spect : [Batch, Time, n_mel_channels]
        
        # expand height
        upsampled_spect = spect[:, None, :, :]
        upsampled_spect = tf.layers.conv2d_transpose(inputs=upsampled_spect, 
                                                     filters=n_mel_channels, 
                                                     kernel_size=(1, kernel_size), 
                                                     strides=(1, stride),
                                                     padding='SAME')
        upsampled_spect = upsampled_spect[:, 0, :, :]
    
    return upsampled_spect



In [20]:
def waveGlow(audio, spect, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, 
             WN_n_channels, WN_n_layers, WN_kernel_size,
             name='WaveGlow', reuse=False):
    
    with tf.variable_scope(name, reuse=reuse):
    
        # audio : [Batch, Time]
        # spect : [Batch, Time / 256, n_mel_channels]

        Batch = tf.shape(audio)[0]
        Time = tf.shape(audio)[1]

        # [Batch, Time, n_mel_channels]
        spect = upsample(spect, n_mel_channels, 1024, 256)
        #spect = spect[:, :Time]
        
        # [Batch, Time / n_group, n_mel_channels * n_group]
        spect = tf.reshape(spect, [Batch, -1, n_mel_channels * n_group])
        audio = tf.reshape(audio, [Batch, -1, n_group])

        output_audio = []
        log_s_list = []
        log_det_W_list = []

        n_remaining_channels = n_group

        for k in range(n_flows):
            
            with tf.variable_scope('flow_' + str(k), reuse=reuse):
                
                print('flow ', k)
                
                if k % n_early_every == 0 and k > 0:
                    output_audio.append(audio[:, :, :n_early_size])
                    audio = audio[:, :, n_early_size:]
                    n_remaining_channels = n_remaining_channels - n_early_size
                    
                audio, log_det_W = Invertible1x1Conv(audio, n_remaining_channels)
                log_det_W_list.append(log_det_W)

                audio_0, audio_1 = tf.split(audio, 2, axis=CHANNEL_AXIS)

                log_s, b = WN(audio_0, spect, 
                            n_channels=WN_n_channels, n_layers=WN_n_layers, kernel_size=WN_kernel_size)

                audio_1 = tf.exp(log_s) * audio_1 + b
                log_s_list.append(log_s)

                audio = tf.concat([audio_0, audio_1], axis=CHANNEL_AXIS)

        output_audio.append(audio)

        return tf.concat(output_audio, axis=CHANNEL_AXIS), log_s_list, log_det_W_list, n_remaining_channels
    
def waveGlowInverse(spect, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, n_remaining_channels,
                    WN_n_channels, WN_n_layers, WN_kernel_size,
                    sigma=1.0, name='WaveGlow', reuse=True):
    
    with tf.variable_scope(name, reuse=reuse):
        
        # spect : [Batch, Time / 256, n_mel_channels]
        
        time_cutoff = 1024 - 256
        spect = upsample(spect, n_mel_channels, 1024, 256, reuse=reuse)
        #spect = spect[:, :-time_cutoff]
        Batch = tf.shape(spect)[0]
        
        spect = tf.reshape(spect, [Batch, -1, n_mel_channels * n_group])
        Time = tf.shape(spect)[1]

        audio = tf.random_normal(shape=[Batch, Time, n_remaining_channels], stddev=sigma, dtype=tf.float32)
        
        for k in reversed(range(n_flows)):
            
            with tf.variable_scope('flow_' + str(k), reuse=reuse):
                
                print('inverse flow ', k)
                
                audio_0, audio_1 = tf.split(audio, 2, axis=CHANNEL_AXIS)
                log_s, b = WN(audio_0, spect, 
                            n_channels=WN_n_channels, n_layers=WN_n_layers, kernel_size=WN_kernel_size)
                audio_1 = (audio_1 - b) / tf.exp(log_s)
                audio = tf.concat([audio_0, audio_1], axis=CHANNEL_AXIS)
                
                audio = Invertible1x1Conv(audio, n_remaining_channels, reverse=True)
                
                if k % n_early_every == 0 and k > 0:
                    z = tf.random_normal(shape=[Batch, Time, n_early_size], dtype=tf.float32)
                    audio = tf.concat([sigma * z, audio], axis=CHANNEL_AXIS)
                    n_remaining_channels = n_remaining_channels + n_early_size
        
        audio = tf.reshape(audio, [Batch, -1])
        
        return audio
    
def waveGlowLoss(z, log_s_list, log_det_W_list, sigma=1.0):
    
    log_s_total = 0
    log_det_W_total = 0
    
    for i, log_s in enumerate(log_s_list):
        log_s_total += tf.reduce_sum(log_s)
        log_det_W_total += log_det_W_list[i]
        
    nll_loss = tf.reduce_sum(z * z) / (2. * sigma * sigma) 
    det_loss = -log_s_total - log_det_W_total
    norm_factor = tf.cast(tf.shape(z)[0] * tf.shape(z)[1] * tf.shape(z)[2], tf.float32)
    loss = (nll_loss + det_loss) / norm_factor
    
    return nll_loss, det_loss, loss
            

In [21]:
tf.reset_default_graph()
Batch = None
Time = None
n_mel_channels = 80
n_flows = 12
n_group = 8
n_early_every = 4
n_early_size = 2

WN_n_layers = 8
WN_n_channels = 512
WN_kernel_size = 3

audio = tf.placeholder(dtype=tf.float32, shape=[Batch, None])
spect = tf.placeholder(dtype=tf.float32, shape=[Batch, None, n_mel_channels])

z, log_s_list, log_det_W_list, n_remaining_channels = \
                    waveGlow(audio, spect, n_mel_channels, n_flows, n_group, n_early_every, n_early_size,
                            WN_n_channels, WN_n_layers, WN_kernel_size)

print(z)
    
audio_sample = waveGlowInverse(spect, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, n_remaining_channels,
                                 WN_n_channels, WN_n_layers, WN_kernel_size, reuse=True)
print(audio_sample)

nll_loss, det_loss, loss = waveGlowLoss(z, log_s_list, log_det_W_list, sigma=1.0)
print(loss)

optim = tf.train.AdamOptimizer(1e-4).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

print('done')


flow  0
flow  1
flow  2
flow  3
flow  4
flow  5
flow  6
flow  7
flow  8
flow  9
flow  10
flow  11
Tensor("WaveGlow/concat:0", shape=(?, ?, 8), dtype=float32)
inverse flow  11
inverse flow  10
inverse flow  9
inverse flow  8
inverse flow  7
inverse flow  6
inverse flow  5
inverse flow  4
inverse flow  3
inverse flow  2
inverse flow  1
inverse flow  0
Tensor("WaveGlow_1/Reshape_1:0", shape=(?, ?), dtype=float32)
Tensor("truediv_1:0", shape=(), dtype=float32)
done


In [22]:
def get_test_data(time_length, index):
    waves = []
    specs = []
    
    data_file = data_files[index]
    data = np.load(data_file)
    wave, spec = data['wave'], data['mel']

    start = 0
    end = time_length
    waves = wave[start * 256 : end * 256]
    specs = spec[start:end]
    
    return waves, specs
    

def get_data(time_length):
    waves = []
    specs = []
    
    while len(specs) < 3:
        read_index = np.random.randint(0, len(data_files) - 1, 1)[0]
        data_file = data_files[read_index]
        data = np.load(data_file)
        wave, spec = data['wave'], data['mel']

        if len(spec) == 0 or len(wave) == 0:
            continue

        waves.append(wave)
        specs.append(spec)

    waves = np.concatenate(waves, axis=0)
    specs = np.concatenate(specs, axis=0)

    length = np.maximum(len(specs), time_length)
    if length > time_length:
        start = np.random.randint(length - time_length, size=1)[0]
    else:
        start = 0

    end = start + time_length
    waves = waves[start * 256 : end * 256]
    specs = specs[start:end]
    
    return waves, specs


In [25]:
from IPython.display import clear_output
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

time_length = 50

while(True):
    for i in range(100):
        waves, specs = get_data(time_length)
        print(waves.shape, specs.shape)
        
        fetches = [optim, loss, nll_loss, det_loss]
        feed_dicts = {audio: np.expand_dims(waves, axis=0),
                      spect: np.expand_dims(specs, axis=0)}
        _, _loss, _nll_loss, _det_loss = sess.run(fetches, feed_dict=feed_dicts)
        
        print(_loss, _nll_loss, _det_loss)
        
    feed_dicts = {audio: np.expand_dims(waves, axis=0),
                  spect: np.expand_dims(specs, axis=0)}
    _audio_sample = sess.run(audio_sample, feed_dict=feed_dicts)
    _audio_sample = np.clip(_audio_sample, -1., 1.)
    clear_output()
    
    plt.figure(figsize=[18, 3])
    plt.plot(waves, alpha=0.3)
    plt.plot(_audio_sample[0], alpha=0.3)
    plt.show()
    
    

(12800,) (50, 80)


InvalidArgumentError: Incompatible shapes: [1,1602,1024] vs. [1,1600,1024]
	 [[Node: WaveGlow/flow_0/wavenet/1/add = Add[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](WaveGlow/flow_0/wavenet/1/in_layer/strided_slice_1, WaveGlow/flow_0/wavenet/1/cond_layer/strided_slice_1)]]
	 [[Node: truediv/_37 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_62210_truediv", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'WaveGlow/flow_0/wavenet/1/add', defined at:
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/asyncio/base_events.py", line 427, in run_forever
    self._run_once()
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/asyncio/base_events.py", line 1440, in _run_once
    handle._run()
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2817, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2843, in _run_cell
    return runner(coro)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3018, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3183, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-21-97a74966d376>", line 19, in <module>
    WN_n_channels, WN_n_layers, WN_kernel_size)
  File "<ipython-input-20-ee9d4b668a49>", line 44, in waveGlow
    n_channels=WN_n_channels, n_layers=WN_n_layers, kernel_size=WN_kernel_size)
  File "<ipython-input-18-53b755ad7e0e>", line 18, in WN
    acts = fused_add_tanh_sigmoid_multiply(in_acts, cond_acts, n_channels)
  File "<ipython-input-17-2af00d23eb9c>", line 2, in fused_add_tanh_sigmoid_multiply
    in_act = input_a + input_b
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 847, in binary_op_wrapper
    return func(x, y, name=name)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 297, in add
    "Add", x=x, y=y, name=name)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3414, in create_op
    op_def=op_def)
  File "/home/scpark/anaconda3/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1740, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Incompatible shapes: [1,1602,1024] vs. [1,1600,1024]
	 [[Node: WaveGlow/flow_0/wavenet/1/add = Add[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](WaveGlow/flow_0/wavenet/1/in_layer/strided_slice_1, WaveGlow/flow_0/wavenet/1/cond_layer/strided_slice_1)]]
	 [[Node: truediv/_37 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_62210_truediv", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [None]:
import IPython.display as ipd
ipd.Audio(_audio_sample[0], rate=22050)

In [None]:
print('done')

In [None]:
tf.__version__