By default, trying to load an mp3 file in a Jupyter notebook may fail with an "IOPub data rate exceeded" error.

To avoid it, restart the notebook with the following command: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [None]:
import librosa
from IPython.display import Audio
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
import numpy as np
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

## Loading the music track

In [None]:
# Decode the mp3 into a single-channel waveform together with its sample rate
MUSIC_TRACK, sample_rate = librosa.load(
    "../../data/music/kalpol_introl.mp3",
    mono=True,
)
# Last expression of the cell: renders an audio player for the track (autoplay disabled)
Audio(MUSIC_TRACK, rate=sample_rate, autoplay=False)

In [None]:
# Waveform overview: amplitude of every sample against its index in the track
plt.plot(np.arange(len(MUSIC_TRACK)), MUSIC_TRACK)
plt.show()

## Trying out dilated convolutions

In [None]:
# Toy sequence 0..9 laid out as [batch=1, time=10, channels=1]
sample = tf.constant(np.arange(10, dtype=np.float32), shape=[1, 10, 1], dtype=tf.float32)

# Left-pad the time axis by 4 (the dilation rate used below) so the "valid" convolution
# returns as many time steps as the unpadded input has
paddings = [[0, 0], [4, 0], [0, 0]]
padded_sample = tf.pad(sample, paddings, mode="constant")

# Dilated convolution: 3 filters, 2 kernel taps spaced 4 steps apart, no bias
conv_1d = tf.layers.conv1d(
    padded_sample,
    filters=3,
    kernel_size=2,
    dilation_rate=4,
    padding="valid",
    use_bias=False,
)

# 1x1 convolution squeezing the 3 filter outputs into a single channel
apply_1x1_conv = tf.layers.conv1d(
    conv_1d,
    filters=1,
    kernel_size=1,
    strides=1,
    use_bias=False,
)

In [None]:
# Initialise the freshly created kernels, then fetch both layer outputs and the
# most recently created trainable variable in a single run call
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out, out_1x1, fil = sess.run(
        [conv_1d, apply_1x1_conv, tf.trainable_variables()[-1]]
    )

In [None]:
# Show the value fetched from the last entry of tf.trainable_variables() in the session cell
fil

In [None]:
# Show the evaluated output of the dilated convolution (conv_1d)
out

In [None]:
# Show the evaluated output of the 1x1 convolution (apply_1x1_conv)
out_1x1

## Testing a block of dilated convolutions

In [None]:
# Start from an empty default graph before building the stack
tf.reset_default_graph()

# Ten dilation rates, doubling from 1 up to 512
dilation_steps = [2 ** power for power in range(10)]
# The whole track as a single example: [batch=1, time, channels=1]
input_track = tf.constant(MUSIC_TRACK, shape=[1, len(MUSIC_TRACK), 1], dtype=tf.float32)

# Chain the layers: each one consumes the output of the previous one
last_layer = input_track
for d in dilation_steps:
    # Left-pad the time axis by d so the "valid" convolution keeps the sequence length
    paddings = [[0, 0], [d, 0], [0, 0]]
    padded_layer = tf.pad(last_layer, paddings, mode="constant")
    next_layer = tf.layers.conv1d(
        padded_layer,
        filters=1,
        kernel_size=2,
        dilation_rate=d,
        padding="valid",
        use_bias=False,
        name="block1_{}".format(d),
    )
    last_layer = next_layer

In [None]:
# Next-sample objective: the network output at step t is compared with the input at step t + 1
shifted_output = last_layer[0, :-1, 0]
shifted_input = input_track[0, 1:, 0]
mse_loss = tf.losses.mean_squared_error(shifted_output, shifted_input)

# Training op: Adam with its default settings minimising the MSE
adam = tf.train.AdamOptimizer().minimize(mse_loss)

In [None]:
# Look the variable up once instead of on every iteration: the last trainable variable
# in creation order, i.e. the kernel of the final dilated layer
last_kernel = tf.trainable_variables()[-1]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # 50 optimisation steps; report the loss and the two taps of the last kernel after each one
    for step in range(50):
        current_mse_loss, _, last_filter = sess.run([mse_loss, adam, last_kernel])
        print(step, current_mse_loss, last_filter[:, 0, 0])

## Expressing WaveNet via TF Estimator

Defining model function for WaveNet Estimator

So far, the differences between this implementation and the one given in the paper are: a hardcoded single residual channel going through the network, and a simplified loss function (currently just the MSE loss between the actual signal and the predictions).

In [None]:
# Number of dilated layers per block: model_fn uses dilation rates 2**0 .. 2**(MAX_DILATION_SIZE_BASE_2 - 1)
MAX_DILATION_SIZE_BASE_2 = 2
# Number of times the stack of dilated layers is repeated in model_fn
N_DILATION_BLOCKS = 1
# Number of filters in each gate / filter dilated convolution of model_fn
N_DILATION_FILTERS = 1

In [None]:
def model_fn(features, targets, mode, params):
    """Build the simplified WaveNet graph for a ``tf.contrib.learn.Estimator``.

    The network is a stack of left-padded dilated convolutions with gated
    activation units. The 1x1-convolved output of every layer is added both to
    the residual flow (the input of the next layer) and to the skip-connection
    accumulator, which is used directly as the prediction.

    Args:
        features: input signal tensor; the paddings below require rank 3
            (assumed [batch, time, 1] as produced by the input function).
        targets: tensor holding the same signal as ``features``; may be
            ``None`` when the estimator is only asked for predictions.
        mode: mode key, forwarded unchanged to ``ModelFnOps``.
        params: hyperparameter dict; currently not read.

    Returns:
        ``ModelFnOps`` with the predictions under ``"predicted_sample"`` and,
        whenever ``targets`` is given, the loss and the training op.
    """

    # Setting the flow of residuals - will be additively accumulated through the network
    # For the future: this to be replaced with additional 1x1 convolution if I go for N_RESIDUAL_CHANNELS > 1
    residual_flow = features

    # Setting skip_connection_layer - it is going to accumulate all outputs of intermediary layers
    # For the future: this has to point to the current state of residual_flow if I go for N_RESIDUAL_CHANNELS > 1
    skip_connections_layer = features

    # Going through stacks of dilated convolution layers
    dilation_steps = [2 ** i for i in range(MAX_DILATION_SIZE_BASE_2)]
    for block_num in range(1, N_DILATION_BLOCKS + 1):
        for d in dilation_steps:
            # Common prefix of the variable scopes of this layer (names are kept unchanged)
            layer_name = "dilated_block" + str(block_num) + "_" + str(d)

            # Padding the intermediary sequences / layers from the left so that the
            # "valid" convolution keeps the sequence length
            paddings = [[0, 0], [d, 0], [0, 0]]
            conv_padded = tf.pad(residual_flow, paddings, "constant")

            # Gate of the gated activation unit
            conv_gate = tf.sigmoid(
                tf.layers.conv1d(conv_padded, filters=N_DILATION_FILTERS, kernel_size=2, padding="valid",
                                 name=layer_name + "_gate", use_bias=False, dilation_rate=d)
            )

            # Filter of the gated activation unit
            conv_filter = tf.tanh(
                tf.layers.conv1d(conv_padded, filters=N_DILATION_FILTERS, kernel_size=2, padding="valid",
                                 name=layer_name + "_filter", use_bias=False, dilation_rate=d)
            )

            # Calculating layer output to send via skip-connections
            # 1x1 convolutions are applied to squeeze several dilation filters (if present)
            # For the future: filters parameter is to be substituted with N_RESIDUAL_CHANNELS (if I introduce this parameter)
            conv_residual = tf.layers.conv1d(conv_filter * conv_gate, filters=1, kernel_size=1, strides=1,
                                             use_bias=False, name=layer_name + "_residual")

            # Necessary bookkeeping: updating residual_flow and connecting current output with final layers
            skip_connections_layer = skip_connections_layer + conv_residual
            residual_flow = residual_flow + conv_residual

    # The original paper takes accumulated skip_connection_layer and adds extra layers on top of it
    # For simplicity current implementation stops here and takes mse between this layer and targets as a loss
    predictions = skip_connections_layer
    predictions_dict = {"predicted_sample": predictions}

    loss = None
    train_op = None
    # Without targets (prediction only) there is nothing to compare against, so no loss / train op is built
    if targets is not None:
        # Calculate loss using mse between actual and predicted sample
        # Note: as WaveNet is generative model - it tries to predict next signal so two tensors are shifted wrt each other
        # flatten() keeps the batch axis, so the shift has to be applied along axis 1 (time);
        # slicing axis 0 would cut the batch dimension instead
        flat_targets = tf.contrib.layers.flatten(targets)
        flat_predictions = tf.contrib.layers.flatten(predictions)
        loss = tf.losses.mean_squared_error(flat_targets[:, 1:], flat_predictions[:, :-1])

        # Defining optimization step: picking adam as main optimizer
        # For simplicity hardcoding parameters and not addressing params input
        # NOTE(review): optimize_loss receives an already constructed optimizer, so the
        # learning_rate passed below is presumably not applied to it - confirm which rate is intended
        adam = tf.train.AdamOptimizer()
        train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                                   global_step=tf.contrib.framework.get_global_step(),
                                                   learning_rate=0.01,
                                                   optimizer=adam
                                                  )

    return model_fn_lib.ModelFnOps(mode=mode,
                                   predictions=predictions_dict,
                                   loss=loss,
                                   train_op=train_op
                                  )

In [None]:
# Wrap model_fn in a contrib.learn Estimator; params is an empty dict because model_fn does not read it
wavenet = tf.contrib.learn.Estimator(model_fn=model_fn, params={})

In [None]:
# In the future this function should be replaced with generators
# As WaveNet is a generative model, both returned tensors hold the same piece of music:
# the relative shift between input and target is applied inside the loss definition of model_fn
def get_inputs():
    """Return the (features, targets) pair built from MUSIC_TRACK.

    Both tensors are float32 constants of shape [1, len(MUSIC_TRACK), 1].
    """
    track_shape = [1, len(MUSIC_TRACK), 1]
    features = tf.constant(MUSIC_TRACK, shape=track_shape, dtype=tf.float32)
    targets = tf.constant(MUSIC_TRACK, shape=track_shape, dtype=tf.float32)
    return features, targets

# Testing the fit method of WaveNet: a two-step smoke run, not a real training session
wavenet.fit(input_fn=get_inputs, steps=2)

In [None]:
# Testing the evaluate method of WaveNet: one evaluation step fed by the same input function used for fitting
ev = wavenet.evaluate(input_fn=get_inputs, steps=1)
# Print whatever evaluate returned
print(ev)

In [None]:
# Testing filter extraction: read the value of one of the estimator's variables by name
# NOTE(review): the -3 index relies on the ordering returned by get_variable_names() -
# selecting the variable by its explicit name would be more robust
test_filter = wavenet.get_variable_names()[-3]
# Last expression of the cell, so the notebook displays the variable's value
wavenet.get_variable_value(test_filter)