# Soundnet to Keras

Unfortunately the TensorFlow implementation (https://github.com/eborboihuc/SoundNet-tensorflow) of SoundNet is a bit hard to work with in Keras.

Let's see if we can interrogate it to create our own Keras model.


In [1]:
import keras
from keras.layers.core import Dense, Flatten
from keras.layers import Input, BatchNormalization, Dropout, Conv2D, MaxPooling2D, ZeroPadding2D, Activation
from keras.activations import relu 
from keras.models import Model as KModel

import model as SoundNet
import tensorflow as tf
import numpy as np
import keras.backend as K

Using TensorFlow backend.


# Step 1: Understand the original model by looking at code

Download the pretrained weights from here https://drive.google.com/uc?export=download&id=0B9wE6h4m--wjR015M1RLZW45OEU

In [2]:
# Load the pre-trained SoundNet weights. The .npy file holds a single pickled
# dict (layer name -> dict of parameter arrays), hence the trailing .item().
G_name = './models/sound8.npy'
# allow_pickle=True is required on NumPy >= 1.16.3, where loading pickled
# object arrays is disabled by default.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a source you trust.
# encoding='latin1' is the setting NumPy documents for reading pickles that
# were written under Python 2.
param_G = np.load(G_name, allow_pickle=True, encoding='latin1').item()


Let's look at the model weights.

In [3]:
param_G.keys()

dict_keys(['conv1', 'conv3', 'conv2', 'conv8_2', 'conv5', 'conv7', 'conv8', 'conv4', 'conv6'])

The keys seem to line up with the names for each model layer:
![from soundnet](https://camo.githubusercontent.com/0b88af5c13ba987a17dcf90cd58816cf8ef04554/687474703a2f2f70726f6a656374732e637361696c2e6d69742e6564752f736f756e646e65742f736f756e646e65742e6a7067)

In [4]:
param_G['conv1']

{'beta': array([ 0.22135185,  0.0341051 ,  0.18717989,  0.00313667,  0.09816235,
        -0.01616378,  0.48686007,  0.13595471,  0.27616981,  0.03256222,
         0.58677894,  0.2408203 ,  0.0112493 ,  0.30446413,  0.23332365,
         0.34096041], dtype=float32),
 'biases': array([  5.22947684e-02,  -3.40161085e-01,   1.38327324e+00,
          3.64338547e-01,  -3.26040685e-01,   1.79568276e-01,
          6.42116740e-02,  -1.03648205e-03,   1.81802064e-01,
          1.18660912e-01,  -3.31185043e-01,  -5.57779595e-02,
          5.85727729e-02,   7.62460768e-01,   8.31446290e-01,
          4.92918462e-01], dtype=float32),
 'gamma': array([ 0.67092448,  0.70485812,  1.07762814,  1.07327878,  0.99859923,
         1.43394423,  0.98132694,  0.82170153,  1.76054716,  1.30240929,
         0.61374092,  0.69179982,  1.39582741,  0.89855939,  0.78170842,
         0.88884467], dtype=float32),
 'mean': array([-0.25211978,  3.3069694 ,  1.29265058,  0.3468363 , -7.58555794,
        -0.4582119 ,  0.3

Each name seems to contain the weights for the individual layer. 

Below is the code from the tensorflow `model.py` implementation.

In [5]:
# Don't run this cell! It's just so we can see the code
def add_generator(self, name_scope='SoundNet'):
    """Build the SoundNet graph, storing every op output in ``self.layers``.

    ``self.layers`` is a dict keyed by consecutive integers 1..26, one entry
    per op in the order the ops are applied. The network reads its input from
    ``self.sound_input_placeholder`` and the batch-norm epsilon from
    ``self.config['eps']``.

    Structure, as visible below:
      * conv1..conv7 are each conv2d -> batch_norm -> relu; conv1, conv2 and
        conv5 are additionally followed by a maxpool.
      * conv8 and conv8_2 are two parallel heads that both read
        ``self.layers[24]`` (the conv7 output). They are plain convolutions:
        no batch norm, no relu, and no explicit padding (p_h is left at its
        default).

    All kernels, strides and paddings are given only along the height axis
    (``k_h``, ``d_h``, ``p_h``); the width arguments keep their defaults.

    Args:
        name_scope: Name of the outer ``tf.variable_scope`` that wraps the
            whole network.
    """

    with tf.variable_scope(name_scope) as scope:

        # Integer index -> output tensor of that op.
        self.layers = {}
        # Stream one: conv1 ~ conv7

        # conv1: 1 -> 16 channels, kernel 64, stride 2, pad 32, then pool by 8.
        self.layers[1] = conv2d(self.sound_input_placeholder, 1, 16, k_h=64, d_h=2, p_h=32, name_scope='conv1')

        self.layers[2] = batch_norm(self.layers[1], 16, self.config['eps'], name_scope='conv1')

        self.layers[3] = relu(self.layers[2], name_scope='conv1')

        self.layers[4] = maxpool(self.layers[3], k_h=8, d_h=8, name_scope='conv1')



        # conv2: 16 -> 32 channels, kernel 32, stride 2, pad 16, then pool by 8.
        self.layers[5] = conv2d(self.layers[4], 16, 32, k_h=32, d_h=2, p_h=16, name_scope='conv2')

        self.layers[6] = batch_norm(self.layers[5], 32, self.config['eps'], name_scope='conv2')

        self.layers[7] = relu(self.layers[6], name_scope='conv2')

        self.layers[8] = maxpool(self.layers[7], k_h=8, d_h=8, name_scope='conv2')



        # conv3: 32 -> 64 channels, kernel 16, stride 2, pad 8 (no pooling).
        self.layers[9] = conv2d(self.layers[8], 32, 64, k_h=16, d_h=2, p_h=8, name_scope='conv3')

        self.layers[10] = batch_norm(self.layers[9], 64, self.config['eps'], name_scope='conv3')

        self.layers[11] = relu(self.layers[10], name_scope='conv3')



        # conv4: 64 -> 128 channels, kernel 8, stride 2, pad 4 (no pooling).
        self.layers[12] = conv2d(self.layers[11], 64, 128, k_h=8, d_h=2, p_h=4, name_scope='conv4')

        self.layers[13] = batch_norm(self.layers[12], 128, self.config['eps'], name_scope='conv4')

        self.layers[14] = relu(self.layers[13], name_scope='conv4')



        # conv5: 128 -> 256 channels, kernel 4, stride 2, pad 2, then pool by 4.
        self.layers[15] = conv2d(self.layers[14], 128, 256, k_h=4, d_h=2, p_h=2, name_scope='conv5')

        self.layers[16] = batch_norm(self.layers[15], 256, self.config['eps'], name_scope='conv5')

        self.layers[17] = relu(self.layers[16], name_scope='conv5')

        self.layers[18] = maxpool(self.layers[17], k_h=4, d_h=4, name_scope='conv5')



        # conv6: 256 -> 512 channels, kernel 4, stride 2, pad 2 (no pooling).
        self.layers[19] = conv2d(self.layers[18], 256, 512, k_h=4, d_h=2, p_h=2, name_scope='conv6')

        self.layers[20] = batch_norm(self.layers[19], 512, self.config['eps'], name_scope='conv6')

        self.layers[21] = relu(self.layers[20], name_scope='conv6')



        # conv7: 512 -> 1024 channels, kernel 4, stride 2, pad 2 (no pooling).
        self.layers[22] = conv2d(self.layers[21], 512, 1024, k_h=4, d_h=2, p_h=2, name_scope='conv7')

        self.layers[23] = batch_norm(self.layers[22], 1024, self.config['eps'], name_scope='conv7')

        self.layers[24] = relu(self.layers[23], name_scope='conv7')



        # Split one: conv8, conv8_2
        # Both heads branch off layers[24]; neither gets batch norm or relu.

        self.layers[25] = conv2d(self.layers[24], 1024, 1000, k_h=8, d_h=2, name_scope='conv8')

        self.layers[26] = conv2d(self.layers[24], 1024, 401, k_h=8, d_h=2, name_scope='conv8_2')



Seems like a pretty standard conv net architecture, but these helper functions need to be inspected. 
Let's check-out `ops.py` 

In [6]:
# TensorFlow version of NIPS2016 soundnet
import tensorflow as tf

def conv2d(prev_layer, in_ch, out_ch, k_h=1, k_w=1, d_h=1, d_w=1, p_h=0, p_w=0, pad='VALID', name_scope='conv'):
    """Convolve ``prev_layer`` with a learned kernel and add a learned bias.

    Variables ``weights`` (shape ``[k_h, k_w, in_ch, out_ch]``) and ``biases``
    (shape ``[out_ch]``) are created or fetched inside ``name_scope``.

    When ``pad`` is ``'VALID'`` the input is first zero-padded by ``p_h`` on
    both sides of axis 1 and ``p_w`` on both sides of axis 2; for any other
    ``pad`` value the input is used unchanged. ``pad`` is then also passed to
    ``tf.nn.conv2d`` as its padding mode, and ``d_h`` / ``d_w`` are the
    strides along axes 1 and 2.
    """
    with tf.variable_scope(name_scope):
        # Kernel layout: height x width x input channels x output channels.
        kernel = tf.get_variable(
            'weights',
            [k_h, k_w, in_ch, out_ch],
            initializer=tf.truncated_normal_initializer(0.0, stddev=0.01))
        bias = tf.get_variable(
            'biases',
            [out_ch],
            initializer=tf.constant_initializer(0.0))

        if pad == 'VALID':
            # Explicit symmetric zero padding on the two spatial axes only.
            paddings = [[0, 0], [p_h, p_h], [p_w, p_w], [0, 0]]
            conv_input = tf.pad(prev_layer, paddings, "CONSTANT")
        else:
            conv_input = prev_layer

        strides = [1, d_h, d_w, 1]
        convolved = tf.nn.conv2d(conv_input, kernel, strides, padding=pad, name='z')
        return convolved + bias

There are a bunch of strange things going on here... 

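However, it also looks pretty standard: we have the conv weights and biases that we'll need to get from the params, and we have a padding step. The `... if pad == 'VALID' else ...` construct is a Python conditional expression: the input is explicitly zero-padded by `p_h`/`p_w` only when `pad` is `'VALID'`; otherwise it is passed through unchanged.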

It looks like the trickiest thing will be making sure we don't get the height and width mixed up. 

In [7]:
def batch_norm(prev_layer, out_ch, eps, name_scope='conv'):
    """Normalise ``prev_layer`` per channel using stored statistics.

    Four variables of shape ``[out_ch]`` are created or fetched inside
    ``name_scope``: ``mean`` and ``var`` (initialised to 0 and 1) and
    ``gamma`` and ``beta`` (initialised to 1 and 0). The mean and variance
    come from these variables rather than from moments of the current batch.
    ``eps`` is the variance epsilon passed to ``tf.nn.batch_normalization``.
    """
    with tf.variable_scope(name_scope):
        channel_shape = [out_ch]
        stored_mean = tf.get_variable(
            'mean', channel_shape, initializer=tf.constant_initializer(0))
        stored_var = tf.get_variable(
            'var', channel_shape, initializer=tf.constant_initializer(1))
        scale = tf.get_variable(
            'gamma', channel_shape, initializer=tf.constant_initializer(1))
        offset = tf.get_variable(
            'beta', channel_shape, initializer=tf.constant_initializer(0))

        # Positional order expected by TF: x, mean, variance, offset, scale, eps.
        normalised = tf.nn.batch_normalization(
            prev_layer, stored_mean, stored_var, offset, scale, eps,
            name='batch_norm')
    return normalised

Batch norm looks normal. Note that we will need to get a bunch of variables here!

In [8]:
def relu(prev_layer, name_scope='conv'):
    """Apply ``tf.nn.relu`` to ``prev_layer`` inside ``name_scope`` (op name 'a')."""
    with tf.variable_scope(name_scope):
        activated = tf.nn.relu(prev_layer, name='a')
    return activated
def maxpool(prev_layer, k_h=1, k_w=1, d_h=1, d_w=1, name_scope='conv'):
    """Max-pool ``prev_layer`` with a ``k_h`` x ``k_w`` window and 'VALID' padding.

    ``d_h`` and ``d_w`` are the strides along axes 1 and 2. The op is created
    inside ``name_scope`` under the name 'maxpool'.
    """
    with tf.variable_scope(name_scope):
        window = [1, k_h, k_w, 1]
        strides = [1, d_h, d_w, 1]
        pooled = tf.nn.max_pool(prev_layer, window, strides, padding='VALID', name='maxpool')
    return pooled

The cool thing about these functions is that they don't use any weights, so we should be able to directly use Keras analogues.

# Step 2: Understand how Keras does things 
## Conv2D

Let's check out the Keras Conv2D object. It seems to have a `get_weights` method.

In [9]:
# Same filter count, kernel and stride as SoundNet's conv1 (16 filters, 64x1, stride 2x1).
c = Conv2D(16, (64,1), strides=(2,1))
# The layer has not been called on any input yet, so this returns an empty list.
c.get_weights()

[]

Ah, I think we need to pass an Input through it to get it to work. 

In [10]:
# Variable-length input with a single column and a single channel.
inp = Input(shape=(None,1,1))
# Calling the layer on a tensor builds it, which creates its weight variables.
c(inp)
weights = c.get_weights()
# First entry: the kernel, shaped (k_h, k_w, in_ch, out_ch) = (64, 1, 1, 16).
print(weights[0].shape)
# Second entry: the per-filter bias, shaped (16,).
print(weights[1].shape)
# First kernel row, to see the randomly initialised values before overwriting.
print(weights[0][0])

(64, 1, 1, 16)
(16,)
[[[-0.00015322  0.03328478  0.0431439   0.05665177  0.02174677  0.07000305
    0.03909661 -0.02369325 -0.07219765  0.00808635 -0.04132511  0.01037094
   -0.06979632 -0.02060135 -0.05271819  0.01664291]]]


Ok, but this is a list, rather than a dictionary, so I have to guess which ones they are. However, by the looks of it, the first one is the weights, and the second is the biases.

The Conv2D object also has a `set_weights` method; let's see what happens if we pass in the conv1 weights.

In [11]:
# set_weights takes the list in the same order get_weights returns it: kernel, then bias.
c.set_weights([param_G['conv1']['weights'],param_G['conv1']['biases']])
# Read the first kernel row back to confirm the pretrained values replaced the random ones.
c.get_weights()[0][0]

array([[[-0.41206488,  0.33841759, -0.05745466,  0.27979866,  0.19388007,
          0.14923079, -0.74014872, -0.32045165,  0.59633714,  0.03676248,
         -0.50728828, -0.1975117 , -0.22688788, -0.12416738, -0.1477133 ,
         -0.14983156]]], dtype=float32)

Cool, that looks like we have set the weights correctly. Let's move onto Batch Norm
## Batch Normalization

To explore batch norm, let's do the same as before and see what the weights look like.

In [12]:
bn = BatchNormalization()
# Call the layer on the conv output so it is built and its variables exist.
bn(c(inp))
weights = bn.get_weights()
# Four arrays, all of shape (16,), so shape alone cannot tell them apart.
[w.shape for w in weights]

[(16,), (16,), (16,), (16,)]

Unfortunately all the weights for batch normalization have the same shape! Let's consult the Keras source code to figure out what order they should be in:

In [13]:
BatchNormalization??

Looking at the `build` function, it seems the order is 
`gamma`,`beta`,`moving_mean`,`moving_variance`
Let's compare this to the keys in the weight dictionary.


In [14]:
param_G['conv1'].keys()

dict_keys(['gamma', 'weights', 'biases', 'beta', 'var', 'mean'])

Cool, the mapping seems pretty straight forward. Let's try it out.

In [15]:
# Keras order is gamma, beta, moving_mean, moving_variance; the SoundNet dict
# calls the last two 'mean' and 'var'.
bn.set_weights([param_G['conv1'][name] for name in ['gamma','beta','mean','var']])
# Read the values back to confirm the assignment.
bn.get_weights()

[array([ 0.67092448,  0.70485812,  1.07762814,  1.07327878,  0.99859923,
         1.43394423,  0.98132694,  0.82170153,  1.76054716,  1.30240929,
         0.61374092,  0.69179982,  1.39582741,  0.89855939,  0.78170842,
         0.88884467], dtype=float32),
 array([ 0.22135185,  0.0341051 ,  0.18717989,  0.00313667,  0.09816235,
        -0.01616378,  0.48686007,  0.13595471,  0.27616981,  0.03256222,
         0.58677894,  0.2408203 ,  0.0112493 ,  0.30446413,  0.23332365,
         0.34096041], dtype=float32),
 array([-0.25211978,  3.3069694 ,  1.29265058,  0.3468363 , -7.58555794,
        -0.4582119 ,  0.36731982,  0.54561132,  0.08063642,  0.20060934,
        -0.39500237,  0.4565326 , -0.47915456,  0.28680989,  1.13165963,
         0.64741123], dtype=float32),
 array([ 25892.19140625,  28977.63671875,   9814.6171875 ,  27940.69335938,
         52304.66796875,  34629.9140625 ,  24162.2578125 ,  37237.19921875,
         38663.5234375 ,   5816.80810547,    764.2232666 ,  17119.06835938,
 

In [16]:
# Alternative to set_weights: start from a fresh layer and assign each
# variable by attribute through the Keras backend.
bn = BatchNormalization()
bn(c(inp))
# Freshly built values: gamma=1, beta=0, moving_mean=0, moving_variance=1.
print(bn.get_weights())
K.set_value(bn.gamma, param_G['conv1']['gamma'])
K.set_value(bn.beta, param_G['conv1']['beta'])
K.set_value(bn.moving_mean, param_G['conv1']['mean'])
K.set_value(bn.moving_variance, param_G['conv1']['var'])
# Should now match the result of the set_weights approach.
bn.get_weights()

[array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.], dtype=float32), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.], dtype=float32), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.], dtype=float32), array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.], dtype=float32)]


[array([ 0.67092448,  0.70485812,  1.07762814,  1.07327878,  0.99859923,
         1.43394423,  0.98132694,  0.82170153,  1.76054716,  1.30240929,
         0.61374092,  0.69179982,  1.39582741,  0.89855939,  0.78170842,
         0.88884467], dtype=float32),
 array([ 0.22135185,  0.0341051 ,  0.18717989,  0.00313667,  0.09816235,
        -0.01616378,  0.48686007,  0.13595471,  0.27616981,  0.03256222,
         0.58677894,  0.2408203 ,  0.0112493 ,  0.30446413,  0.23332365,
         0.34096041], dtype=float32),
 array([-0.25211978,  3.3069694 ,  1.29265058,  0.3468363 , -7.58555794,
        -0.4582119 ,  0.36731982,  0.54561132,  0.08063642,  0.20060934,
        -0.39500237,  0.4565326 , -0.47915456,  0.28680989,  1.13165963,
         0.64741123], dtype=float32),
 array([ 25892.19140625,  28977.63671875,   9814.6171875 ,  27940.69335938,
         52304.66796875,  34629.9140625 ,  24162.2578125 ,  37237.19921875,
         38663.5234375 ,   5816.80810547,    764.2232666 ,  17119.06835938,
 

In [17]:
BatchNormalization??

Cool, that worked!

# Step 3: Implement Keras Model

Now that we know how to set weights, the rest is just a simple search to line up all the variables.
I've recreated the SoundNet `ops.py` functions here so I can just copy the `model.py` model description.
However, every conv (apart from the two output heads) has a BatchNorm and a ReLU, so I'm going to roll all of that, as well as the padding, into one function. See below.

In [18]:
Conv2D??

In [19]:
def keras_conv_2d(prev_layer, in_ch, out_ch, k_h=1,
                 k_w=1, d_h=1, d_w=1,p_h=0, p_w=0, pad='valid',
                 name_scope='conv1', weight_dict=None, eps=1e-5, bn_act=True):
    """Keras port of one SoundNet conv block, loaded with pretrained weights.

    Applies (optional zero padding) -> Conv2D and, when ``bn_act`` is true,
    -> BatchNormalization -> ReLU. The pretrained parameters are copied from
    ``weight_dict[name_scope]`` into the new layers.

    Args:
        prev_layer: Keras tensor to apply the block to.
        in_ch: Number of input channels. Unused here (Conv2D infers it from
            ``prev_layer``); kept so calls mirror the TensorFlow ``conv2d``.
        out_ch: Number of convolution filters.
        k_h, k_w: Kernel size along the first / second spatial axis.
        d_h, d_w: Stride along the first / second spatial axis.
        p_h, p_w: Symmetric zero padding along the first / second spatial
            axis, applied only when ``pad`` is 'valid'.
        pad: Padding mode, case-insensitive so the TensorFlow spelling
            ('VALID') also works. For 'valid' the explicit ``p_h``/``p_w``
            padding is applied first; any other mode skips it. The mode is
            also forwarded to Conv2D.
        name_scope: Key into ``weight_dict`` selecting this layer's weights.
        weight_dict: Mapping of layer name -> dict with 'weights' and
            'biases' (plus 'gamma', 'beta', 'mean', 'var' when ``bn_act``).
        eps: Epsilon for BatchNormalization.
        bn_act: If False, return the raw convolution output (used for the
            final conv8 / conv8_2 heads, which have no batch norm or ReLU).

    Returns:
        The output Keras tensor of the block.

    Raises:
        ValueError: If ``weight_dict`` is not provided.
    """
    if weight_dict is None:
        # The default exists only for signature convenience; without weights
        # there is nothing to load, so fail with a clear message.
        raise ValueError(
            "weight_dict is required: pass the pretrained parameter "
            "dictionary, e.g. weight_dict=param_G")

    # Accept both the Keras ('valid') and TensorFlow ('VALID') spellings.
    padding_mode = pad.lower()
    if padding_mode == 'valid':
        padded_input = ZeroPadding2D((p_h, p_w))(prev_layer)
    else:
        padded_input = prev_layer

    weights = weight_dict[name_scope]

    # Forward the padding mode: previously it was dropped, so a non-'valid'
    # request silently ran an unpadded 'valid' convolution.
    conv = Conv2D(out_ch, (k_h, k_w),
                  strides=(d_h, d_w),
                  padding=padding_mode)
    # The layer must be called once before set_weights so that it is built
    # and knows its weight shapes.
    convOut = conv(padded_input)

    conv.set_weights([weights['weights'], weights['biases']])

    # Output heads: no BatchNorm or activation.
    if not bn_act:
        return convOut

    bn = BatchNormalization(epsilon=eps)
    bnOut = bn(convOut)

    # Keras order: gamma, beta, moving_mean, moving_variance.
    bn.set_weights([weights[k] for k in ['gamma', 'beta', 'mean', 'var']])

    return Activation('relu')(bnOut)

I had to add a few things to make this work with the entire model. This is because the final layers `conv8` and `conv8_2` are special: they don't have a batch norm or ReLU on their output, hence the `bn_act` flag.

The maxpooling code is a straightforward port. 

In [20]:
def keras_maxpool(prev, k_h=1, k_w=1, d_h=1, d_w=1):
    """Keras counterpart of the TensorFlow ``maxpool`` helper.

    Pools ``prev`` with a ``k_h`` x ``k_w`` window and ``d_h`` x ``d_w``
    strides, and returns the pooled tensor.
    """
    pooling_layer = MaxPooling2D(pool_size=(k_h, k_w), strides=(d_h, d_w))
    pooled = pooling_layer(prev)
    return pooled

# Step 4: Putting it together

Now let's go and run the code. Keras should complain if we get the dimensions wrong. (This happened to me in the first iteration, but I was able to fix it.)

In [21]:
# Per-sample shape (None, 1, 1): variable length, one column, one channel.
# Keras adds the batch axis, giving the (?, ?, 1, 1) tensor shown below.
inp = Input(shape=(None,1,1))
inp

<tf.Tensor 'input_2:0' shape=(?, ?, 1, 1) dtype=float32>

In [22]:
# Rebuild SoundNet layer by layer, mirroring the arguments of add_generator.
# (A stray `weights` expression was removed from the top of this cell: it did
# nothing except make the cell depend on an earlier exploratory cell.)
inp = Input(shape=(None, 1, 1))

# Stream one: conv1 ~ conv7 (each is padding -> conv -> batch norm -> ReLU)
x1 = keras_conv_2d(inp, 1, 16, k_h=64, d_h=2, p_h=32, name_scope='conv1', weight_dict=param_G)

x2 = keras_maxpool(x1, k_h=8, d_h=8)

x3 = keras_conv_2d(x2, 16, 32, k_h=32, d_h=2, p_h=16, name_scope='conv2', weight_dict=param_G)

x4 = keras_maxpool(x3, k_h=8, d_h=8)

x5 = keras_conv_2d(x4, 32, 64, k_h=16, d_h=2, p_h=8, name_scope='conv3', weight_dict=param_G)

x6 = keras_conv_2d(x5, 64, 128, k_h=8, d_h=2, p_h=4, name_scope='conv4', weight_dict=param_G)

x7 = keras_conv_2d(x6, 128, 256, k_h=4, d_h=2, p_h=2, name_scope='conv5', weight_dict=param_G)

x8 = keras_maxpool(x7, k_h=4, d_h=4)

x9 = keras_conv_2d(x8, 256, 512, k_h=4, d_h=2, p_h=2, name_scope='conv6', weight_dict=param_G)

# `x` is the conv7 output (layers[24] in the TensorFlow model); both heads
# branch off it.
x = keras_conv_2d(x9, 512, 1024, k_h=4, d_h=2, p_h=2, name_scope='conv7', weight_dict=param_G)

# Split one: conv8, conv8_2 -- plain convolutions, no batch norm or ReLU.
imageNet = keras_conv_2d(x, 1024, 1000, k_h=8, d_h=2,
                         name_scope='conv8', weight_dict=param_G,
                         bn_act=False)
places = keras_conv_2d(x, 1024, 401, k_h=8, d_h=2,
                       name_scope='conv8_2', weight_dict=param_G,
                       bn_act=False)

We can construct 3 models from this. One model is the "features" we could use for transfer learning, while the other 2 models are predictions of sound related to the places and imagenet competitions classes. Cool!

In [23]:
# All models share the same input and layers; they differ only in which
# tensor they expose as output.
# conv8 head (1000 output channels).
imagnetModel = KModel(inputs=inp, outputs=imageNet)
# conv8_2 head (401 output channels).
placesModel = KModel(inputs=inp, outputs=places)
# conv7 output: the 1024-channel features shared by both heads.
features = KModel(inputs=inp, outputs=x)
# Output of the first conv block only.
baseFeatures = KModel(inputs=inp, outputs=x1)

# Step 5: Test it's the same!

To test that we have built the same thing as the TensorFlow implementation, we need to be running this notebook in a directory which has the contents of https://github.com/eborboihuc/SoundNet-tensorflow in it.

Let's also load in some data that I have handy at the moment

In [24]:
from keras.utils import np_utils as np_utils
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer its replacement and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Audio clips, reshaped to (samples, 176400, 1, 1) to match the model input.
data = np.load('linerData.npy')
data = data.reshape((-1, 176400,1, 1))
# One-hot encode the labels; the final reshape forces exactly two columns.
labels = np_utils.to_categorical(np.load('linerLabels.npy').reshape((-1, 1))).reshape((-1,2))

# First split off a held-out portion, then halve it into validation and test.
x_train, x_test1, y_train, y_test1 = train_test_split(data, labels)

x_valid, x_test, y_valid, y_test = train_test_split(x_test1, y_test1, test_size=0.5)



Let's compile our features model (the call is left commented out below, since we only run `predict`).

In [25]:

#baseFeatures.compile('adam','categorical_crossentropy')


In [26]:
# Let NumPy infer the sample count instead of hard-coding 170, so this cell
# keeps working when the dataset (and therefore the split size) changes.
x_valid = x_valid.reshape(-1, 176400, 1, 1)

# First two validation clips; the slice already has shape (2, 176400, 1, 1),
# so no further reshape is needed.
test_set = x_valid[0:2]


In [43]:
# Run the Keras port on the two test clips to get the conv7 features.
featureResult = features.predict(test_set, batch_size=2)

In [28]:
np.sqrt(np.sum(np.square(featureResult)))

440.16888

Let's load in the Tensorflow model.
Note: Make sure you run `model.load` after `session.run(init)`. Otherwise you will re-initialise all the variables, destroying their pretrained weights!

In [29]:
import model as SoundNet


# Init. Session
sess_config = tf.ConfigProto()
sess_config.allow_soft_placement=True
sess_config.gpu_options.allow_growth = True

session = tf.Session(config=sess_config)

# Load pre-trained model
G_name = './models/sound8.npy'
# allow_pickle=True is required on NumPy >= 1.16.3, where loading pickled
# object arrays is disabled by default.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a source you trust.
param_G = np.load(G_name, allow_pickle=True, encoding='latin1').item()

# batch_size matches the two-clip test batch fed to the Keras model; eps
# matches the default epsilon used by keras_conv_2d.
local_config = {
            'batch_size': 2,
            'eps': 1e-5,
            'sample_rate': 22050,
            'load_size': 22050*20,
            'name_scope': 'SoundNet',
}

model = SoundNet.Model(session, config=local_config, param_G=param_G)

init = tf.global_variables_initializer()

# Order matters: initialise first, then load. Running the initialiser after
# model.load() would overwrite the pretrained weights.
session.run(init)

model.load()

batch_size : 2
name_scope : SoundNet
sample_rate : 22050
eps : 1e-05
load_size : 441000
Assign pretrain model gamma to conv1
assigned diff = 0.0
Assign pretrain model weights to conv1
assigned diff = 0.0
Assign pretrain model biases to conv1
assigned diff = 0.0
Assign pretrain model beta to conv1
assigned diff = 0.0
Assign pretrain model var to conv1
assigned diff = 0.0
Assign pretrain model mean to conv1
assigned diff = 0.0
Assign pretrain model gamma to conv3
assigned diff = 0.0
Assign pretrain model weights to conv3
assigned diff = 0.0
Assign pretrain model biases to conv3
assigned diff = 0.0
Assign pretrain model beta to conv3
assigned diff = 0.0
Assign pretrain model var to conv3
assigned diff = 0.0
Assign pretrain model mean to conv3
assigned diff = 0.0
Assign pretrain model gamma to conv2
assigned diff = 0.0
Assign pretrain model weights to conv2
assigned diff = 0.0
Assign pretrain model biases to conv2
assigned diff = 0.0
Assign pretrain model beta to conv2
assigned diff = 0.0


True

Let's create the `edist` function to calculate the Euclidean norm of an array. Applied to the difference of two results, it gives the Euclidean distance between them, so a value of 0 means the two approaches are equivalent.

In [38]:
def edist(x):
    """Return the Euclidean (L2) norm of ``x``, taken over all of its elements.

    Pass the difference of two arrays to get the distance between them.
    """
    squared = np.square(x)
    total = np.sum(squared)
    return np.sqrt(total)

In [44]:
# layers[24] is the ReLU output of conv7 in the TensorFlow model, i.e. the
# same tensor the Keras `features` model outputs.
SoundNetResult = session.run(model.layers[24], 
                      feed_dict={model.sound_input_placeholder: test_set})

# The two shapes must match for the element-wise comparison to be meaningful.
print(SoundNetResult.shape)
print(featureResult.shape)

# Norm of the difference: 0.0 means the outputs are identical.
edist(SoundNetResult-featureResult)

(2, 6, 1, 1024)
(2, 6, 1, 1024)


0.0

Awesome! The results are the same!

We can also check the other layers, or try on more data. 