# Implementation of Accurate Binary Convolution Layer
[Original Paper](https://arxiv.org/abs/1711.11294)

In [1]:
from __future__ import division, print_function
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


The inspiration for this network is the use of Deep Neural Networks for real-time object recognition. Currently available **Convolution Layers** require large amount of computation power at runtime and that hinders the use of very deep networks in embedded systems or ASICs. Xiaofan Lin, Cong Zhao, and Wei Pan presented a way to convert Convolution Layers to **Binary Convolution Layers** for faster realtime computation.

### Approximating Convolution weights using binary weights
Here the hope is to approximate $\mathbf{W}\in\mathbb{R}^{w*h*c_{in}*c_{out}}$ using $\alpha_1\mathbf{B_1}+\alpha_2\mathbf{B_2}+...+\alpha_m\mathbf{B_m}$ where $\mathbf{B_1}, \mathbf{B_2}, ..., \mathbf{B_m}\in\mathbb{R}^{w*h*c_{in}*c_{out}}$ and $\alpha_1, \alpha_2, ..., \alpha_m\in\mathbb{R}^1$

#### Conversion from convolution filter to binary filter
Let's implement the conversion of convolution filter to binary convolution filters first.
To approximate $\mathbf{W}$ with $\alpha_1\mathbf{B_1}+\alpha_2\mathbf{B_2}+...+\alpha_m\mathbf{B_m}$ we'll use the equation from the paper $\mathbf{B_i}=\operatorname{sign}(\bar{\mathbf{W}} + \mu_i\operatorname{std}(\mathbf{W}))$

We'll need mean and standard deviation of the complete convolution filters

In [2]:
def get_mean_stddev(input_tensor):
    with tf.name_scope('mean_stddev_cal'):
        mean, variance = tf.nn.moments(input_tensor, axes=range(len(input_tensor.get_shape())))
        stddev = tf.sqrt(variance, name="standard_deviation")
        return mean, stddev

We need to spread the standard deviation by the number of filters being used as in the original paper
$\mu_i= -1 + (i - 1)\frac{2}{\mathbf{M} - 1}$

In [3]:
# TODO: Allow shift parameters to be learnable
def get_shifted_stddev(stddev, no_filters):
    with tf.name_scope('shifted_stddev'):
        spreaded_deviation = -1. + (2./(no_filters - 1)) * tf.convert_to_tensor(range(no_filters),
                                                                                dtype=tf.float32)
        return spreaded_deviation * stddev

Now, we can get the values of $\mathbf{B_{i}s}$

In [4]:
def get_binary_filters(convolution_filters, no_filters, name=None):
    with tf.name_scope(name, default_name="get_binary_filters"):
        mean, stddev = get_mean_stddev(convolution_filters)
        shifted_stddev = get_shifted_stddev(stddev, no_filters)
        
        # Normalize the filters by subtracting mean from them
        mean_adjusted_filters = convolution_filters - mean
        
        # Tiling filters to match the number of filters
        expanded_filters = tf.expand_dims(mean_adjusted_filters, axis=0, name="expanded_filters")
        tiled_filters = tf.tile(expanded_filters, [no_filters] + [1] * len(convolution_filters.get_shape()),
                                name="tiled_filters")
        
        # Similarly tiling spreaded stddev to match the shape of tiled_filters
        expanded_stddev = tf.reshape(shifted_stddev, [no_filters] + [1] * len(convolution_filters.get_shape()),
                                     name="expanded_stddev")
        
        binarized_filters = tf.sign(tiled_filters + expanded_stddev, name="binarized_filters")
        return binarized_filters

#### Calculating alphas
Now, we can calculate alphas using the *binary filters* and *convolution filters* by minimizing the *squared difference*
$\|\mathbf{W}-\mathbf{B}\alpha\|^2$

In [5]:
def get_alphas(convolution_filters, binary_filters, no_filters, name=None):
    with tf.name_scope(name, "get_alphas"):
        # Reshaping convolution filters to be one dimensional and binary filters to be of [no_filters, -1] dimension
        reshaped_convolution_filters = tf.reshape(convolution_filters, [-1], name="reshaped_convolution_filters")
        reshaped_binary_filters = tf.reshape(binary_filters, [no_filters, -1],
                                             name="reshaped_binary_filters")
        
        # Creating variable for alphas
        alphas = tf.Variable(tf.constant(1./no_filters, shape=(no_filters, 1)), name="alphas")
        
        # Calculating W*alpha
        weighted_sum_filters = tf.reduce_sum(tf.multiply(alphas, reshaped_binary_filters),
                                             axis=0, name="weighted_sum_filters")
        
        # Defining loss
        error = tf.square(reshaped_convolution_filters - weighted_sum_filters, name="alphas_error")
        loss = tf.reduce_mean(error, axis=0, name="alphas_loss")
        
        # Defining optimizer
        training_op = tf.train.AdamOptimizer().minimize(loss, var_list=[alphas],
                                                        name="alphas_training_op")
        
        return alphas, training_op, loss

### Creating ApproxConv using the binary filters
$\mathbf{O}=\sum\limits_{m=1}^M\alpha_m\operatorname{Conv}(\mathbf{B}_m, \mathbf{A})$

As in mentioned in the paper, it is better to train the network first with simple Convolution networks and then convert the filters into the binary filters, allowing original filters to be trained.

In [6]:
def ApproxConv(no_filters, convolution_filters, convolution_biases=None,
               strides=(1, 1), padding="VALID", name=None):
    with tf.name_scope(name, "ApproxConv"):
        # Creating variables from input convolution filters and convolution biases
        filters = tf.Variable(convolution_filters, dtype=tf.float32, name="filters")
        if convolution_biases is None:
            biases = 0.
        else:
            biases = tf.Variable(convolution_biases, dtype=tf.float32, name="biases")
        
        # Creating binary filters
        binary_filters = get_binary_filters(filters, no_filters)
        
        # Getting alphas
        alphas, alphas_training_op, alphas_loss = get_alphas(filters, binary_filters,
                                                             no_filters)
        
        # Defining function for closure to accept multiple inputs with same filters
        def ApproxConvLayer(input_tensor, name=None):
            with tf.name_scope(name, "ApproxConv_Layer"):
                # Reshaping alphas to match the input tensor
                reshaped_alphas = tf.reshape(alphas,
                                             shape=[no_filters] + [1] * len(input_tensor.get_shape()),
                                             name="reshaped_alphas")
                
                # Calculating convolution for each binary filter
                approxConv_outputs = []
                for index in range(no_filters):
                    # Binary convolution
                    this_conv = tf.nn.conv2d(input_tensor, binary_filters[index],
                                             strides=(1,) + strides + (1,),
                                             padding=padding)
                    approxConv_outputs.append(this_conv + biases)
                conv_outputs = tf.convert_to_tensor(approxConv_outputs, dtype=tf.float32,
                                                    name="conv_outputs")
                
                # Summing up each of the binary convolution
                ApproxConv_output = tf.reduce_sum(tf.multiply(conv_outputs, reshaped_alphas), axis=0)
                
                return ApproxConv_output
        
        return alphas_training_op, ApproxConvLayer, alphas_loss

### Multiple binary activations and bitwise convolution
Now, convolution can be achieved using just the summation operations by using the ApproxConv layers. But the paper suggests something even better. We can even bypass the summation through bitwise operations only, if the input to the convolution layer is also binarized.
For that the authors suggests that an input can be binarized (creating multiple inputs) by shifting the inputs and binarizing them.

First, the input is clipped between 0. and 1. using multiple shift parameters $\nu$, learnable by the network  
$\operatorname{h_{\nu}}(x)=\operatorname{clip}(x + \nu, 0, 1)$  
  
Then using the following function it is binarized  
$\operatorname{H_{\nu}}(\mathbf{R})=2\mathbb{I}_{\operatorname{h_{\nu}}(\mathbf{R})\geq0.5}-1$

The above function can be implemented as  
$\operatorname{H_{\nu}}(\mathbf{R})=\operatorname{sign}(\mathbf{R} - 0.5)$

Now, after calculating the **ApproxConv** over each separated input, their weighted summation can be taken using trainable paramters $\beta s$

In [7]:
def ABC(convolution_filters, convolution_biases=None, no_binary_filters=5, no_ApproxConvLayers=5,
        strides=(1, 1), padding="VALID", name=None):
    with tf.name_scope(name, "ABC"):
        # Creating variables shift parameters and weighted sum parameters (betas)
        shift_parameters = tf.Variable(tf.constant(0., shape=(no_ApproxConvLayers, 1)), dtype=tf.float32,
                                       name="shift_parameters")
        betas = tf.Variable(tf.constant(1., shape=(no_ApproxConvLayers, 1)), dtype=tf.float32,
                            name="betas")
        
        # Instantiating the ApproxConv Layer
        alphas_training_op, ApproxConvLayer, alphas_loss = ApproxConv(no_binary_filters,
                                                                      convolution_filters, convolution_biases,
                                                                      strides, padding)
        
        def ABCLayer(input_tensor, name=None):
            with tf.name_scope(name, "ABCLayer"):
                # Reshaping betas to match the input tensor
                reshaped_betas = tf.reshape(betas,
                                            shape=[no_ApproxConvLayers] + [1] * len(input_tensor.get_shape()),
                                            name="reshaped_betas")
                
                # Calculating ApproxConv for each shifted input
                ApproxConv_layers = []
                for index in range(no_ApproxConvLayers):
                    # Shifting and binarizing input
                    shifted_input = tf.clip_by_value(input_tensor + shift_parameters[index], 0., 1.,
                                                     name="shifted_input_" + str(index))
                    binarized_activation = tf.sign(shifted_input - 0.5)
                    
                    # Passing through the ApproxConv layer
                    ApproxConv_layers.append(ApproxConvLayer(binarized_activation))
                ApproxConv_output = tf.convert_to_tensor(ApproxConv_layers, dtype=tf.float32,
                                                         name="ApproxConv_output")
                
                # Taking the weighted sum using the betas
                ABC_output = tf.reduce_sum(tf.multiply(ApproxConv_output, reshaped_betas), axis=0)
                return ABC_output
        
        return alphas_training_op, ABCLayer, alphas_loss

## Testing
Let's just test our network using MNIST

In [8]:
# Importing data
from tensorflow.examples.tutorials.mnist import input_data
!mkdir -p /tmp/data
mnist = input_data.read_data_sets("/tmp/data/")

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [9]:
# Defining utils function
def weight_variable(shape, name="weight"):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)

def bias_variable(shape, name="bias"):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial, name=name)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], padding='SAME')

In [10]:
# Creating the graph
without_ABC_graph = tf.Graph()
with without_ABC_graph.as_default():
    # Defining inputs
    x = tf.placeholder(dtype=tf.float32)
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    
     # Convolution Layer 1
    W_conv1 = weight_variable(shape=([5, 5, 1, 32]), name="W_conv1")
    b_conv1 = bias_variable(shape=[32], name="b_conv1")
    conv1 = (conv2d(x_image, W_conv1) + b_conv1)
    pool1 = max_pool_2x2(conv1)
    bn_conv1 = tf.layers.batch_normalization(pool1, axis=-1, name="batchNorm1")
    h_conv1 = tf.nn.relu(bn_conv1)

    # Convolution Layer 2
    W_conv2 = weight_variable(shape=([5, 5, 32, 64]), name="W_conv2")
    b_conv2 = bias_variable(shape=[64], name="b_conv2")
    conv2 = (conv2d(h_conv1, W_conv2) + b_conv2)
    pool2 = max_pool_2x2(conv2)
    bn_conv2 = tf.layers.batch_normalization(pool2, axis=-1, name="batchNorm2")
    h_conv2 = tf.nn.relu(bn_conv2)

    # Flat the conv2 output
    h_conv2_flat = tf.reshape(h_conv2, shape=(-1, 7*7*64))

    # Dense layer1
    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    b_fc1 = bias_variable([1024])
    h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, W_fc1) + b_fc1)

    # Dropout
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # Output layer
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])

    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
    
    # Labels
    y = tf.placeholder(tf.int32, [None])
    y_ = tf.one_hot(y, 10)
    
    # Defining optimizer and loss
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=y_conv))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    # Initializer
    graph_init = tf.global_variables_initializer()

Let's just define a dictionary to hold the numpy values of the calculated parameters of the network, so that we can feed it directly to our custom network

In [11]:
# Defining variables to save. These will be fed to our custom layer
variables_to_save = {"W_conv1": W_conv1,
                     "b_conv1": b_conv1,
                     "W_conv2": W_conv2,
                     "b_conv2": b_conv2,
                     "W_fc1": W_fc1,
                     "b_fc1": b_fc1,
                     "W_fc2": W_fc2,
                     "b_fc2": b_fc2}
values = {}

In [12]:
n_epochs = 5
batch_size = 32
        
with tf.Session(graph=without_ABC_graph) as sess:
    sess.run(graph_init)
    for epoch in range(n_epochs):
        for iteration in range(1, 200 + 1):
            batch = mnist.train.next_batch(50)
            
            # Run operation and calculate loss
            _, loss_train = sess.run([train_step, cross_entropy],
                                     feed_dict={x: batch[0], y: batch[1], keep_prob: 0.5})
            print("\rIteration: {}/{} ({:.1f}%)  Loss: {:.5f}".format(
                      iteration, 200,
                      iteration * 100 / 200,
                      loss_train),
                  end="")

        # At the end of each epoch,
        # measure the validation loss and accuracy:
        loss_vals = []
        acc_vals = []
        for iteration in range(1, 200 + 1):
            X_batch, y_batch = mnist.validation.next_batch(batch_size)
            acc_val, loss_val = sess.run([accuracy, cross_entropy],
                                     feed_dict={x: batch[0], y: batch[1], keep_prob: 1.0})
            loss_vals.append(loss_val)
            acc_vals.append(acc_val)
            print("\rEvaluating the model: {}/{} ({:.1f}%)".format(iteration, 200,
                iteration * 100 / 200),
                  end=" " * 10)
        loss_val = np.mean(loss_vals)
        acc_val = np.mean(acc_vals)
        print("\rEpoch: {}  Val accuracy: {:.4f}%  Loss: {:.6f}".format(
            epoch + 1, acc_val * 100, loss_val))
        
    # On completion of training, save the variables to be fed to custom model
    for var_name in variables_to_save:
        values[var_name] = sess.run(variables_to_save[var_name])

Epoch: 1  Val accuracy: 82.0000%  Loss: 0.463989
Epoch: 2  Val accuracy: 98.0000%  Loss: 0.066990
Epoch: 3  Val accuracy: 92.0000%  Loss: 0.354678
Epoch: 4  Val accuracy: 98.0000%  Loss: 0.111749
Epoch: 5  Val accuracy: 98.0000%  Loss: 0.073754


### Let's build our model now

In [13]:
custom_graph = tf.Graph()
with custom_graph.as_default():
    alphas_training_operations = []
    
    # Inputs
    x = tf.placeholder(dtype=tf.float32)
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    
    # Convolution Layer 1
    W_conv1 = tf.Variable(values["W_conv1"], name="W_conv1")
    b_conv1 = tf.Variable(values["b_conv1"], name="b_conv1")
    alphas_training_op1, ABCLayer1, alphas_loss1 = ABC(W_conv1, b_conv1,
                                                       no_binary_filters=5,
                                                       no_ApproxConvLayers=5,
                                                       padding="SAME")
    alphas_training_operations.append(alphas_training_op1)
    conv1 = ABCLayer1(x_image)
    pool1 = max_pool_2x2(conv1)
    bn_conv1 = tf.layers.batch_normalization(pool1, axis=-1)
    h_conv1 = tf.nn.relu(bn_conv1)

    # Convolution Layer 2
    W_conv2 = tf.Variable(values["W_conv2"], name="W_conv2")
    b_conv2 = tf.Variable(values["b_conv2"], name="b_conv2")
    alphas_training_op2, ABCLayer2, alphas_loss2 = ABC(W_conv2, b_conv2,
                                                       no_binary_filters=5,
                                                       no_ApproxConvLayers=5,
                                                       padding="SAME")
    alphas_training_operations.append(alphas_training_op2)
    conv2 = ABCLayer2(h_conv1)
    pool2 = max_pool_2x2(conv2)
    bn_conv2 = tf.layers.batch_normalization(pool2, axis=-1)
    h_conv2 = tf.nn.relu(bn_conv2)

    # Flat the conv2 output
    h_conv2_flat = tf.reshape(h_conv2, shape=(-1, 7*7*64))

    # Dense layer1
    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    b_fc1 = bias_variable([1024])
    h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, W_fc1) + b_fc1)

    # Dropout
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # Output layer
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])
    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
    
    # Labels
    y = tf.placeholder(tf.int32, [None])
    y_ = tf.one_hot(y, 10)
    
    # Defining optimizer and loss
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    graph_init = tf.global_variables_initializer()

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [14]:
n_epochs = 5
batch_size = 32
alpha_training_epochs = 200
        
with tf.Session(graph=custom_graph) as sess:
    sess.run(graph_init)
    for epoch in range(n_epochs):
        for iteration in range(1, 200 + 1):
            # Training alphas
            for alpha_training_op in alphas_training_operations:
                for alpha_epoch in range(alpha_training_epochs):
                    sess.run(alpha_training_op)
            
            batch = mnist.train.next_batch(50)
            
            # Run operation and calculate loss
            _, loss_train = sess.run([train_step, cross_entropy],
                                     feed_dict={x: batch[0], y: batch[1], keep_prob: 0.5})
            print("\rIteration: {}/{} ({:.1f}%)  Loss: {:.5f}".format(
                      iteration, 200,
                      iteration * 100 / 200,
                      loss_train),
                  end="")

        # At the end of each epoch,
        # measure the validation loss and accuracy:
        
        # Training alphas
        for alpha_training_op in alphas_training_operations:
            for alpha_epoch in range(alpha_training_epochs):
                sess.run(alpha_training_op)
                    
        loss_vals = []
        acc_vals = []
        for iteration in range(1, 200 + 1):            
            X_batch, y_batch = mnist.validation.next_batch(batch_size)
            acc_val, loss_val = sess.run([accuracy, cross_entropy],
                                     feed_dict={x: batch[0], y: batch[1], keep_prob: 1.0})
            loss_vals.append(loss_val)
            acc_vals.append(acc_val)
            print("\rEvaluating the model: {}/{} ({:.1f}%)".format(iteration, 200,
                iteration * 100 / 200),
                  end=" " * 10)
        loss_val = np.mean(loss_vals)
        acc_val = np.mean(acc_vals)
        print("\rEpoch: {}  Val accuracy: {:.4f}%  Loss: {:.6f}".format(
            epoch + 1, acc_val * 100, loss_val))

Epoch: 1  Val accuracy: 82.0000%  Loss: 2.189009
Epoch: 2  Val accuracy: 88.0000%  Loss: 2.503564
Epoch: 3  Val accuracy: 86.0000%  Loss: 1.941223
Epoch: 4  Val accuracy: 90.0000%  Loss: 1.004598
Epoch: 5  Val accuracy: 82.0000%  Loss: 3.454274
