# Implementation of Accurate Binary Convolution Layer
[Original Paper](https://arxiv.org/abs/1711.11294)

In [2]:
from __future__ import division, print_function
import tensorflow as tf
import numpy as np


  from ._conv import register_converters as _register_converters


### Approximating Convolution weights using binary weights
Here the hope is to approximate $\mathbf{W}\in\mathbb{R}^{w\times h\times c_{in}\times c_{out}}$ using $\alpha_1\mathbf{B_1}+\alpha_2\mathbf{B_2}+...+\alpha_m\mathbf{B_m}$, where $\mathbf{B_1}, \mathbf{B_2}, ..., \mathbf{B_m}\in\{-1,+1\}^{w\times h\times c_{in}\times c_{out}}$ are binary filters and $\alpha_1, \alpha_2, ..., \alpha_m\in\mathbb{R}$ are scalar weights.

#### Conversion from convolution filter to binary filter
Let's implement the conversion of convolution filter to binary convolution filters first.
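To approximate $\mathbf{W}$ with $\alpha_1\mathbf{B_1}+\alpha_2\mathbf{B_2}+...+\alpha_m\mathbf{B_m}$ we'll use the equation from the paper $\mathbf{B_i}=\operatorname{sign}(\bar{\mathbf{W}} + \mu_i\operatorname{std}(\mathbf{W}))$, where $\bar{\mathbf{W}}=\mathbf{W}-\operatorname{mean}(\mathbf{W})$ is the mean-centred filter and $\mu_i$ is the shift applied for the $i$-th binary filter.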

The inspiration for this network is the use of Deep Neural Networks for real-time object recognition. Currently available **Convolution Layers** require a large amount of computation power at runtime, which hinders the use of very deep networks in embedded systems or ASICs. Xiaofan Lin, Cong Zhao, and Wei Pan presented a way to convert Convolution Layers to **Binary Convolution Layers** for faster real-time computation.

We'll need mean and standard deviation of the complete convolution filters

In [3]:
def get_mean_stddev(input_tensor):
    """Return the mean and standard deviation taken over every axis of a tensor.

    Args:
        input_tensor: tensor whose static shape has a known rank (``len()`` of
            the static shape is used to enumerate the axes to reduce over).

    Returns:
        A ``(mean, stddev)`` pair of tensors reduced over all axes of
        ``input_tensor``.
    """
    with tf.name_scope('mean_stddev_cal'):
        # Materialise the axes as a real list: on Python 3 ``range`` is a lazy
        # object, not a list, so passing it straight through would hand
        # TensorFlow a different type than it receives on Python 2.
        all_axes = list(range(len(input_tensor.get_shape())))
        mean, variance = tf.nn.moments(input_tensor, axes=all_axes)
        stddev = tf.sqrt(variance, name="standard_deviation")
        return mean, stddev

We need to spread the standard deviation by the number of filters being used as in the original paper
$\mu_i= -1 + (i - 1)\frac{2}{\mathbf{M} - 1}$

In [4]:
# TODO: Allow shift parameters to be learnable
def get_shifted_stddev(stddev, no_filters):
    """Scale the standard deviation by ``no_filters`` evenly spaced shifts in [-1, 1].

    Implements ``mu_i = -1 + (i - 1) * 2 / (M - 1)`` for ``i = 1..M`` and
    returns ``mu_i * stddev`` for every ``i``.

    Args:
        stddev: standard deviation tensor of the convolution filters.
        no_filters: number of binary filters ``M`` (a Python int).

    Returns:
        A float32 tensor with ``no_filters`` entries, one shifted standard
        deviation per binary filter.
    """
    with tf.name_scope('shifted_stddev'):
        if no_filters == 1:
            # The spacing 2 / (M - 1) is undefined for a single filter (it used
            # to raise ZeroDivisionError); use the centre of the [-1, 1] range,
            # i.e. no shift at all.
            spreaded_deviation = tf.convert_to_tensor([0.], dtype=tf.float32)
        else:
            step = 2. / (no_filters - 1)
            # list(...) keeps the argument a plain list on Python 3 as well.
            indices = tf.convert_to_tensor(list(range(no_filters)), dtype=tf.float32)
            spreaded_deviation = -1. + step * indices
        return spreaded_deviation * stddev

Now, we can get the values of the $\mathbf{B_i}$s

In [5]:
def get_binary_filters(convolution_filters, no_filters, name=None):
    """Build ``no_filters`` binarized copies of a convolution filter bank.

    Each copy is ``sign((W - mean(W)) + mu_i * std(W))`` for one of the shifts
    produced by ``get_shifted_stddev``.

    Args:
        convolution_filters: filter tensor ``W`` to binarize.
        no_filters: number of binary filters to produce.
        name: optional name scope; defaults to ``"get_binary_filters"``.

    Returns:
        A tensor with a new leading axis of size ``no_filters`` followed by the
        axes of ``convolution_filters``, holding the sign of each shifted copy.
    """
    with tf.name_scope(name, default_name="get_binary_filters"):
        filter_rank = len(convolution_filters.get_shape())
        # One leading entry per binary filter, singleton on every filter axis.
        stacked_shape = [no_filters] + [1] * filter_rank

        mean, stddev = get_mean_stddev(convolution_filters)
        shifted_stddev = get_shifted_stddev(stddev, no_filters)

        # Centre the filters on zero before shifting them.
        centred_filters = convolution_filters - mean

        # Repeat the centred filters once per binary filter along a new axis.
        with_leading_axis = tf.expand_dims(centred_filters, axis=0, name="expanded_filters")
        repeated_filters = tf.tile(with_leading_axis, stacked_shape, name="tiled_filters")

        # Give every shift its own slot along the leading axis so it broadcasts
        # against the matching copy of the filters.
        per_filter_shift = tf.reshape(shifted_stddev, stacked_shape, name="expanded_stddev")

        return tf.sign(repeated_filters + per_filter_shift, name="binarized_filters")

#### Calculating alphas
Now, we can calculate alphas using the *binary filters* and *convolution filters* by minimizing the *squared difference*
$\|\mathbf{W}-\mathbf{B}\alpha\|^2$

In [6]:
def get_alphas(convolution_filters, binary_filters, no_filters, name=None):
    """Create the alpha weights and an op that fits them to the real filters.

    The alphas are trained so that ``sum_i alpha_i * B_i`` matches
    ``convolution_filters`` in the mean-squared-error sense.

    Args:
        convolution_filters: real-valued filter tensor ``W``.
        binary_filters: stacked binary filters, ``no_filters`` along axis 0.
        no_filters: number of binary filters.
        name: optional name scope; defaults to ``"get_alphas"``.

    Returns:
        ``(alphas, training_op, loss)``: the ``(no_filters, 1)`` variable, the
        Adam step that updates only that variable, and the scalar loss tensor.
    """
    with tf.name_scope(name, "get_alphas"):
        # Flatten W to a vector and B to one row per binary filter.
        flat_filters = tf.reshape(convolution_filters, [-1],
                                  name="reshaped_convolution_filters")
        flat_binary = tf.reshape(binary_filters, [no_filters, -1],
                                 name="reshaped_binary_filters")

        # Every alpha starts at 1 / no_filters.
        initial_alphas = tf.constant(1./no_filters, shape=(no_filters, 1))
        alphas = tf.Variable(initial_alphas, name="alphas")

        # B * alpha: weight each binary row and add the rows together.
        weighted_rows = tf.multiply(alphas, flat_binary)
        approximation = tf.reduce_sum(weighted_rows, axis=0, name="weighted_sum_filters")

        # Mean squared error between W and its binary approximation.
        squared_error = tf.square(flat_filters - approximation, name="alphas_error")
        loss = tf.reduce_mean(squared_error, axis=0, name="alphas_loss")

        # Only the alphas are updated by this op.
        optimizer = tf.train.AdamOptimizer()
        training_op = optimizer.minimize(loss, var_list=[alphas],
                                         name="alphas_training_op")

        return alphas, training_op, loss

### Creating ApproxConv using the binary filters
$\mathbf{O}=\sum\limits_{m=1}^M\alpha_m\operatorname{Conv}(\mathbf{B}_m, \mathbf{A})$

As mentioned in the paper, it is better to train the network first with simple Convolution networks and then convert the filters into the binary filters, allowing original filters to be trained.

In [7]:
def ApproxConv(no_filters, convolution_filters, convolution_biases=None,
               strides=(1, 1), padding="VALID", name=None):
    """Build an approximate convolution ``sum_m alpha_m * Conv(B_m, A) + b``.

    Args:
        no_filters: number of binary filters used for the approximation.
        convolution_filters: initial value of the real-valued filters.
        convolution_biases: optional initial value of the biases; when ``None``
            no bias is added.
        strides: spatial strides as a 2-element tuple or list.
        padding: padding mode forwarded to ``tf.nn.conv2d``.
        name: optional name scope; defaults to ``"ApproxConv"``.

    Returns:
        ``(alphas_training_op, ApproxConvLayer, alphas_loss)`` where
        ``ApproxConvLayer(input_tensor, name=None)`` applies the layer and can
        be called on several inputs that share the same filters.
    """
    with tf.name_scope(name, "ApproxConv"):
        # Creating variables from input convolution filters and convolution biases
        filters = tf.Variable(convolution_filters, dtype=tf.float32, name="filters")
        if convolution_biases is None:
            biases = 0.
        else:
            biases = tf.Variable(convolution_biases, dtype=tf.float32, name="biases")

        # Creating binary filters
        binary_filters = get_binary_filters(filters, no_filters)

        # Getting alphas
        alphas, alphas_training_op, alphas_loss = get_alphas(filters, binary_filters,
                                                             no_filters)

        # tuple(...) lets callers pass the strides as a list as well as a tuple.
        conv_strides = (1,) + tuple(strides) + (1,)

        # Defining function for closure to accept multiple inputs with same filters
        def ApproxConvLayer(input_tensor, name=None):
            """Apply the alpha-weighted binary convolutions to ``input_tensor``."""
            with tf.name_scope(name, "ApproxConv_Layer"):
                # Reshaping alphas so they broadcast over the stacked conv outputs
                reshaped_alphas = tf.reshape(alphas,
                                             shape=[no_filters] + [1] * len(input_tensor.get_shape()),
                                             name="reshaped_alphas")

                # Calculating convolution for each binary filter
                approxConv_outputs = [
                    tf.nn.conv2d(input_tensor, binary_filters[index],
                                 strides=conv_strides,
                                 padding=padding)
                    for index in range(no_filters)
                ]
                conv_outputs = tf.convert_to_tensor(approxConv_outputs, dtype=tf.float32,
                                                    name="conv_outputs")

                # Summing up each of the binary convolutions
                weighted_sum = tf.reduce_sum(tf.multiply(conv_outputs, reshaped_alphas), axis=0)

                # The bias is added once, after the weighted sum. Adding it to
                # every binary convolution (as before) scaled it by sum(alphas),
                # so the layer no longer approximated Conv(W, A) + b.
                ApproxConv_output = weighted_sum + biases

                return ApproxConv_output

        return alphas_training_op, ApproxConvLayer, alphas_loss

### Multiple binary activations and bitwise convolution
Now, convolution can be achieved using just summation operations by using the ApproxConv layers. But the paper suggests something even better: we can even bypass the summation and use only bitwise operations, if the input to the convolution layer is also binarized.
For that, the authors suggest that an input can be binarized (creating multiple inputs) by shifting the inputs and binarizing them.

First, the input is clipped between 0. and 1. using multiple shift parameters $\nu$, learnable by the network  
$\operatorname{h_{\nu}}(x)=\operatorname{clip}(x + \nu, 0, 1)$  
  
Then using the following function it is binarized  
$\operatorname{H_{\nu}}(\mathbf{R})=2\mathbb{I}_{\operatorname{h_{\nu}}(\mathbf{R})\geq0.5}-1$

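The above function can be implemented as (up to the tie case $\operatorname{h_{\nu}}(\mathbf{R})=0.5$, where $\operatorname{sign}$ returns 0)  
$\operatorname{H_{\nu}}(\mathbf{R})=\operatorname{sign}(\operatorname{h_{\nu}}(\mathbf{R}) - 0.5)$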

Now, after calculating the **ApproxConv** over each separated input, their weighted summation can be taken using trainable parameters, the $\beta$s

In [8]:
def ABC(convolution_filters, convolution_biases=None, no_binary_filters=5, no_ApproxConvLayers=5,
        strides=(1, 1), padding="VALID", name=None):
    """Build an ABC layer: several binarized views of the input, each convolved
    with the shared ApproxConv filters and combined with trainable betas.

    Args:
        convolution_filters: initial value of the real-valued filters.
        convolution_biases: optional initial value of the biases.
        no_binary_filters: number of binary filters inside the ApproxConv layer.
        no_ApproxConvLayers: number of shifted/binarized input branches.
        strides: spatial strides forwarded to ``ApproxConv``.
        padding: padding mode forwarded to ``ApproxConv``.
        name: optional name scope; defaults to ``"ABC"``.

    Returns:
        ``(alphas_training_op, ABCLayer, alphas_loss)`` where
        ``ABCLayer(input_tensor, name=None)`` applies the layer.
    """
    with tf.name_scope(name, "ABC"):
        # One shift and one beta per input branch.
        per_branch_shape = (no_ApproxConvLayers, 1)
        shift_parameters = tf.Variable(tf.constant(0., shape=per_branch_shape), dtype=tf.float32,
                                       name="shift_parameters")
        betas = tf.Variable(tf.constant(1., shape=per_branch_shape), dtype=tf.float32,
                            name="betas")

        # A single ApproxConv instance is shared by every branch.
        alphas_training_op, ApproxConvLayer, alphas_loss = ApproxConv(
            no_binary_filters, convolution_filters, convolution_biases, strides, padding)

        def ABCLayer(input_tensor, name=None):
            """Apply the beta-weighted sum of ApproxConv over binarized inputs."""
            with tf.name_scope(name, "ABCLayer"):
                # Betas get one leading slot per branch so they broadcast over
                # the stacked branch outputs.
                beta_shape = [no_ApproxConvLayers] + [1] * len(input_tensor.get_shape())
                reshaped_betas = tf.reshape(betas, shape=beta_shape, name="reshaped_betas")

                def branch_output(index):
                    # Shift, clip to [0, 1], then binarize around 0.5.
                    clipped = tf.clip_by_value(input_tensor + shift_parameters[index], 0., 1.,
                                               name="shifted_input_" + str(index))
                    binarized = tf.sign(clipped - 0.5)
                    # Whatever ApproxConvLayer adds (bias included) is part of
                    # each branch and is therefore weighted by that branch's beta.
                    return ApproxConvLayer(binarized)

                branch_outputs = [branch_output(index) for index in range(no_ApproxConvLayers)]
                ApproxConv_output = tf.convert_to_tensor(branch_outputs, dtype=tf.float32,
                                                         name="ApproxConv_output")

                # Weighted sum of the branches using the betas.
                return tf.reduce_sum(tf.multiply(ApproxConv_output, reshaped_betas), axis=0)

        return alphas_training_op, ABCLayer, alphas_loss

## Testing
Let's just test our network using CIFAR-10

In [9]:
import cPickle as pickle
import numpy as np
import tensorflow as tf
import tarfile
import ntpath
import cPickle, gzip
import os
from six.moves import urllib
import sys
import time
import math

# Directory the CIFAR-10 archive is downloaded to and read back from.
data_dir = "."
# Source of the python-pickled CIFAR-10 archive; its last path component is
# used as the local file name.
DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'

def maybe_download():
    """Fetch the CIFAR-10 tarball into ``data_dir`` unless it is already there.

    Creates ``data_dir`` when missing, downloads ``DATA_URL`` with a progress
    line on stdout if the file does not exist yet, then prints the file size.
    The archive is not extracted here.
    """
    target_dir = data_dir
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(target_dir, filename)

    if not os.path.exists(filepath):
        def _progress(count, block_size, total_size):
            # Rewrite the same console line with the percentage fetched so far.
            percent = float(count * block_size) / float(total_size) * 100.0
            sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, percent))
            sys.stdout.flush()

        filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, reporthook=_progress)

    # Reported whether the file was just downloaded or was already present.
    print()
    size_in_bytes = os.stat(filepath).st_size
    print('Successfully downloaded', filename, size_in_bytes, 'bytes.')

def load_cifar_10_dataset():
    """Unpickle every batch file and the label metadata from the CIFAR-10 archive.

    Returns:
        A dict mapping each batch file's base name (any member whose path
        contains ``"_batch"``) to its unpickled content, plus the key
        ``"meta"`` for the member ending in ``"batches.meta"``.
    """
    archive_path = data_dir + "/cifar-10-python.tar.gz"
    dataset = {}
    with tarfile.open(archive_path, "r:gz") as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue

            if "_batch" in member.name:
                key = ntpath.basename(member.name)
            elif member.name.endswith("batches.meta"):
                key = "meta"
            else:
                continue

            # NOTE(review): unpickling can execute arbitrary code and this
            # archive comes from a plain-HTTP download; only use with a
            # trusted copy of the dataset.
            dataset[key] = cPickle.load(tar.extractfile(member))
    return dataset

def merge_datasets(dataset_one, dataset_two):
    """Concatenate two batch dicts into a new dict with ``"data"`` and ``"labels"``.

    ``"data"`` entries are joined with ``np.concatenate`` (along the first
    axis); ``"labels"`` entries are joined with ``+``.
    """
    merged_data = np.concatenate((dataset_one["data"], dataset_two["data"]))
    merged_labels = dataset_one["labels"] + dataset_two["labels"]
    return {"data": merged_data, "labels": merged_labels}

def get_merged_training_datasets(dataset_batches_dict):
    """Split the loaded CIFAR-10 batches into train, validation and test arrays.

    Batches 1-4 are merged into the training set, batch 5 is the validation
    set and ``test_batch`` is the test set.

    Args:
        dataset_batches_dict: dict keyed by batch file name, each value a dict
            with ``"data"`` and ``"labels"`` entries.

    Returns:
        A 6-tuple of numpy arrays: ``(train_data, train_labels, valid_data,
        valid_labels, test_data, test_labels)``.
    """
    # ``reduce`` is a builtin only on Python 2; functools provides it on both
    # Python 2 and 3, so import it locally to keep this function portable.
    from functools import reduce

    training_dataset_names = ["data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4"]
    training_datasets = [dataset_batches_dict[name] for name in training_dataset_names]
    training_dataset_and_labels = reduce(merge_datasets, training_datasets)
    validation_dataset_and_labels = dataset_batches_dict["data_batch_5"]
    test_dataset_and_labels = dataset_batches_dict["test_batch"]
    return (
        np.asarray(training_dataset_and_labels["data"]), np.asarray(training_dataset_and_labels["labels"]),
        np.asarray(validation_dataset_and_labels["data"]), np.asarray(validation_dataset_and_labels["labels"]),
        np.asarray(test_dataset_and_labels["data"]), np.asarray(test_dataset_and_labels["labels"])
    )

# Fetch the archive if it is not on disk, unpickle every batch, then split the
# batches into training / validation / test arrays.
maybe_download()
dataset_batches_dict = load_cifar_10_dataset()
# Human-readable class names stored in the archive's metadata member.
label_names = dataset_batches_dict["meta"]["label_names"]
train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels = get_merged_training_datasets(dataset_batches_dict)

#print 'Training set', train_dataset.shape, train_labels.shape
#print 'Validation set', valid_dataset.shape, valid_labels.shape
#print 'Test set', test_dataset.shape, test_labels.shape


Successfully downloaded cifar-10-python.tar.gz 170498071 bytes.


In [10]:
image_size = 32
num_labels = 10
num_channels = 3 # RGB


def reformat(dataset, labels):
    #dataset = dataset.reshape(
    #  (-1, image_size, image_size, num_channels)).astype(np.float32)
    
    # the dataset is of a shape (*, num_channels * image_size * image_size) 
    # with the red values first, followed by the green, then blue.
    dataset = dataset
    x = dataset.reshape((-1, num_channels, image_size * image_size)) # break the channels into their own axes.
    y = x.transpose([0, 2, 1]) # This transpose the matrix by swapping the second and third axes, but not the first. This puts matching RGB values together
    reformated_dataset = y.reshape((-1, image_size, image_size, num_channels)).astype(np.float32) # Turn the dataset into a 4D tensor of a collection of images, with axes of width, height and colour channels.
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return reformated_dataset, labels
# Convert every split in place: flat pixel rows become image tensors and
# integer labels become one-hot rows.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
#print 'Training set', train_dataset.shape, train_labels.shape
#print 'Validation set', valid_dataset.shape, valid_labels.shape
#print 'Test set', test_dataset.shape, test_labels.shape

In [11]:
# Defining utils function
def weight_variable(shape, name="weight"):
    """Create a variable of ``shape`` initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)

def bias_variable(shape, name="bias"):
    """Create a variable of ``shape`` with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape), name=name)

def conv2d(x, W):
    """Convolve ``x`` with filters ``W`` using unit strides and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool ``x`` with a 2x2 window and stride 2, using 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

In [25]:
# Creating the baseline graph: an ordinary (non-binarized) CNN whose trained
# parameters are read back out of the session after training.
num_classes = 10
without_ABC_graph = tf.Graph()
with without_ABC_graph.as_default():
    # Defining inputs: a batch of images and the matching one-hot labels
    x = tf.placeholder(tf.float32, shape=[None, image_size, image_size, num_channels], name='x')
    y = tf.placeholder(tf.float32, shape=[None, num_classes], name='y')
    # y_ is only an alias of the label placeholder; the loss and accuracy below use it
    y_ = y
    # NOTE(review): the literal 32/32/3 duplicates image_size / num_channels
    x_image = tf.reshape(x, [-1, 32, 32, 3])
    
    # Convolution Layer 1: 5x5 conv (3 -> 32 channels), 2x2 max-pool, batch norm, ReLU
    W_conv1 = weight_variable(shape=([5, 5, 3, 32]), name="W_conv1")
    b_conv1 = bias_variable(shape=[32], name="b_conv1")
    conv1 = (conv2d(x_image, W_conv1) + b_conv1)
    pool1 = max_pool_2x2(conv1)
    # NOTE(review): no `training` argument is passed, so the layer presumably
    # runs with its default (inference-mode) statistics even while training —
    # confirm against the tf.layers.batch_normalization documentation.
    bn_conv1 = tf.layers.batch_normalization(pool1, axis=-1, name="batchNorm1")
    h_conv1 = tf.nn.relu(bn_conv1)

    # Convolution Layer 2: 5x5 conv (32 -> 64 channels), 2x2 max-pool, batch norm, ReLU
    W_conv2 = weight_variable(shape=([5, 5, 32, 64]), name="W_conv2")
    b_conv2 = bias_variable(shape=[64], name="b_conv2")
    conv2 = (conv2d(h_conv1, W_conv2) + b_conv2)
    pool2 = max_pool_2x2(conv2)
    bn_conv2 = tf.layers.batch_normalization(pool2, axis=-1, name="batchNorm2")
    h_conv2 = tf.nn.relu(bn_conv2)

    # Flatten the conv2 output: two stride-2 pools take 32x32 down to 8x8, with 64 channels
    h_conv2_flat = tf.reshape(h_conv2, shape=(-1, 8*8*64))

    # Dense layer 1
    W_fc1 = weight_variable([8 * 8 * 64, 1024])
    b_fc1 = bias_variable([1024])
    h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, W_fc1) + b_fc1)

    # Dropout; keep_prob is fed per run (0.5 while training, 1.0 while evaluating)
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # Output layer producing one logit per class
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])

    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
    
    # Defining loss, optimizer and accuracy on the one-hot labels
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=y_conv))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    # A prediction is correct when the arg-max logit matches the arg-max label
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    # Initializer for every variable created above
    graph_init = tf.global_variables_initializer()

Let's just define a dictionary to hold the numpy values of the calculated parameters of the network, so that we can feed it directly to our custom network

In [26]:
# Trained parameters of the baseline network, keyed by name. Their numpy
# values are read out of the session after training and later fed to the
# custom (binarized) network.
variables_to_save = dict(
    W_conv1=W_conv1,
    b_conv1=b_conv1,
    W_conv2=W_conv2,
    b_conv2=b_conv2,
    W_fc1=W_fc1,
    b_fc1=b_fc1,
    W_fc2=W_fc2,
    b_fc2=b_fc2,
)
# Filled with {name: numpy value} once training has finished.
values = {}

In [27]:
n_epochs = 5
batch_size = 32
# Mini-batches run per training pass and per validation pass (previously the
# literal 200 repeated throughout the loop).
n_iterations_per_epoch = 200
n_iterations_validation = 200

with tf.Session(graph=without_ABC_graph) as sess:
    sess.run(graph_init)
    for epoch in range(n_epochs):
        # NOTE(review): the batch offset depends only on `iteration`, so every
        # epoch revisits the same n_iterations_per_epoch batches.
        for iteration in range(1, n_iterations_per_epoch + 1):
            # Slide a batch_size window over the training set, wrapping around.
            offset = (iteration * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
            batch_labels = train_labels[offset:(offset + batch_size), :]

            # Run one optimisation step and fetch the batch loss
            _, loss_train = sess.run([train_step, cross_entropy],
                                     feed_dict={x: batch_data, y: batch_labels, keep_prob: 0.5})
            # The leading "\r" only rewrites the progress line in place when no
            # newline is emitted, hence end="".
            print("\rIteration: {}/{} ({:.1f}%)  Loss: {:.5f}".format(
                      iteration, n_iterations_per_epoch,
                      iteration * 100 / n_iterations_per_epoch,
                      loss_train),
                  end="")
            sys.stdout.flush()

        # At the end of each epoch,
        # measure the validation loss and accuracy:
        loss_vals = []
        acc_vals = []
        for iteration in range(1, n_iterations_validation + 1):
            offset = (iteration * batch_size) % (valid_labels.shape[0] - batch_size)
            batch_data = valid_dataset[offset:(offset + batch_size), :, :, :]
            batch_labels = valid_labels[offset:(offset + batch_size), :]

            # Dropout is disabled (keep_prob=1.0) while evaluating
            acc_val, loss_val = sess.run([accuracy, cross_entropy],
                                         feed_dict={x: batch_data, y: batch_labels, keep_prob: 1.0})
            loss_vals.append(loss_val)
            acc_vals.append(acc_val)
            # `end` belongs to print(); it used to be passed to str.format(),
            # which silently ignored it. The padding blanks out leftovers of
            # the longer training progress line.
            print("\rEvaluating the model: {}/{} ({:.1f}%)".format(
                      iteration, n_iterations_validation,
                      iteration * 100 / n_iterations_validation),
                  end=" " * 10)
            sys.stdout.flush()
        loss_val = np.mean(loss_vals)
        acc_val = np.mean(acc_vals)
        print("\rEpoch: {}  Val accuracy: {:.4f}%  Loss: {:.6f}".format(
            epoch + 1, acc_val * 100, loss_val))

    # On completion of training, save the variables to be fed to custom model
    for var_name in variables_to_save:
        values[var_name] = sess.run(variables_to_save[var_name])

Iteration: 1/200 (0.5%)  Loss: 3195.11572
Iteration: 2/200 (1.0%)  Loss: 3092.08838
Iteration: 3/200 (1.5%)  Loss: 2460.34546
Iteration: 4/200 (2.0%)  Loss: 2598.86182
Iteration: 5/200 (2.5%)  Loss: 2108.45483
Iteration: 6/200 (3.0%)  Loss: 2720.36450
Iteration: 7/200 (3.5%)  Loss: 2503.95117
Iteration: 8/200 (4.0%)  Loss: 1742.82410
Iteration: 9/200 (4.5%)  Loss: 1580.77417
Iteration: 10/200 (5.0%)  Loss: 1781.08301
Iteration: 11/200 (5.5%)  Loss: 1665.51245
Iteration: 12/200 (6.0%)  Loss: 1881.06702
Iteration: 13/200 (6.5%)  Loss: 2292.36621
Iteration: 14/200 (7.0%)  Loss: 1798.37183
Iteration: 15/200 (7.5%)  Loss: 1858.07495
Iteration: 16/200 (8.0%)  Loss: 1937.64490
Iteration: 17/200 (8.5%)  Loss: 1610.55371
Iteration: 18/200 (9.0%)  Loss: 1874.48364
Iteration: 19/200 (9.5%)  Loss: 1791.45459
Iteration: 20/200 (10.0%)  Loss: 1456.11792
Iteration: 21/200 (10.5%)  Loss: 1600.89600
Iteration: 22/200 (11.0%)  Loss: 1153.12500
Iteration: 23/200 (11.5%)  Loss: 1344.40991
Iteration: 24/20

Iteration: 191/200 (95.5%)  Loss: 246.67917
Iteration: 192/200 (96.0%)  Loss: 227.98451
Iteration: 193/200 (96.5%)  Loss: 191.86090
Iteration: 194/200 (97.0%)  Loss: 175.97824
Iteration: 195/200 (97.5%)  Loss: 284.84805
Iteration: 196/200 (98.0%)  Loss: 219.68420
Iteration: 197/200 (98.5%)  Loss: 263.77612
Iteration: 198/200 (99.0%)  Loss: 190.22037
Iteration: 199/200 (99.5%)  Loss: 298.72827
Iteration: 200/200 (100.0%)  Loss: 307.82227
Evaluating the model: 1/200 (0.5%)
Evaluating the model: 2/200 (1.0%)
Evaluating the model: 3/200 (1.5%)
Evaluating the model: 4/200 (2.0%)
Evaluating the model: 5/200 (2.5%)
Evaluating the model: 6/200 (3.0%)
Evaluating the model: 7/200 (3.5%)
Evaluating the model: 8/200 (4.0%)
Evaluating the model: 9/200 (4.5%)
Evaluating the model: 10/200 (5.0%)
Evaluating the model: 11/200 (5.5%)
Evaluating the model: 12/200 (6.0%)
Evaluating the model: 13/200 (6.5%)
Evaluating the model: 14/200 (7.0%)
Evaluating the model: 15/200 (7.5%)
Evaluating the model: 16/200

Iteration: 8/200 (4.0%)  Loss: 245.52910
Iteration: 9/200 (4.5%)  Loss: 228.08081
Iteration: 10/200 (5.0%)  Loss: 242.55099
Iteration: 11/200 (5.5%)  Loss: 247.93379
Iteration: 12/200 (6.0%)  Loss: 185.16980
Iteration: 13/200 (6.5%)  Loss: 216.06656
Iteration: 14/200 (7.0%)  Loss: 187.52992
Iteration: 15/200 (7.5%)  Loss: 186.09706
Iteration: 16/200 (8.0%)  Loss: 175.18237
Iteration: 17/200 (8.5%)  Loss: 187.92615
Iteration: 18/200 (9.0%)  Loss: 266.71463
Iteration: 19/200 (9.5%)  Loss: 244.65741
Iteration: 20/200 (10.0%)  Loss: 253.55063
Iteration: 21/200 (10.5%)  Loss: 195.82706
Iteration: 22/200 (11.0%)  Loss: 242.52026
Iteration: 23/200 (11.5%)  Loss: 214.43872
Iteration: 24/200 (12.0%)  Loss: 265.84436
Iteration: 25/200 (12.5%)  Loss: 271.67758
Iteration: 26/200 (13.0%)  Loss: 272.21274
Iteration: 27/200 (13.5%)  Loss: 301.96002
Iteration: 28/200 (14.0%)  Loss: 202.46161
Iteration: 29/200 (14.5%)  Loss: 315.12341
Iteration: 30/200 (15.0%)  Loss: 169.40851
Iteration: 31/200 (15.5%)

Iteration: 199/200 (99.5%)  Loss: 66.17442
Iteration: 200/200 (100.0%)  Loss: 143.05627
Evaluating the model: 1/200 (0.5%)
Evaluating the model: 2/200 (1.0%)
Evaluating the model: 3/200 (1.5%)
Evaluating the model: 4/200 (2.0%)
Evaluating the model: 5/200 (2.5%)
Evaluating the model: 6/200 (3.0%)
Evaluating the model: 7/200 (3.5%)
Evaluating the model: 8/200 (4.0%)
Evaluating the model: 9/200 (4.5%)
Evaluating the model: 10/200 (5.0%)
Evaluating the model: 11/200 (5.5%)
Evaluating the model: 12/200 (6.0%)
Evaluating the model: 13/200 (6.5%)
Evaluating the model: 14/200 (7.0%)
Evaluating the model: 15/200 (7.5%)
Evaluating the model: 16/200 (8.0%)
Evaluating the model: 17/200 (8.5%)
Evaluating the model: 18/200 (9.0%)
Evaluating the model: 19/200 (9.5%)
Evaluating the model: 20/200 (10.0%)
Evaluating the model: 21/200 (10.5%)
Evaluating the model: 22/200 (11.0%)
Evaluating the model: 23/200 (11.5%)
Evaluating the model: 24/200 (12.0%)
Evaluating the model: 25/200 (12.5%)
Evaluating the 

Iteration: 16/200 (8.0%)  Loss: 71.78947
Iteration: 17/200 (8.5%)  Loss: 50.53530
Iteration: 18/200 (9.0%)  Loss: 93.15199
Iteration: 19/200 (9.5%)  Loss: 78.34195
Iteration: 20/200 (10.0%)  Loss: 68.67317
Iteration: 21/200 (10.5%)  Loss: 96.32579
Iteration: 22/200 (11.0%)  Loss: 72.38687
Iteration: 23/200 (11.5%)  Loss: 54.69341
Iteration: 24/200 (12.0%)  Loss: 89.30476
Iteration: 25/200 (12.5%)  Loss: 126.30247
Iteration: 26/200 (13.0%)  Loss: 127.07924
Iteration: 27/200 (13.5%)  Loss: 106.49461
Iteration: 28/200 (14.0%)  Loss: 87.80541
Iteration: 29/200 (14.5%)  Loss: 79.10455
Iteration: 30/200 (15.0%)  Loss: 70.28092
Iteration: 31/200 (15.5%)  Loss: 98.19199
Iteration: 32/200 (16.0%)  Loss: 95.95340
Iteration: 33/200 (16.5%)  Loss: 53.06898
Iteration: 34/200 (17.0%)  Loss: 107.64432
Iteration: 35/200 (17.5%)  Loss: 63.28591
Iteration: 36/200 (18.0%)  Loss: 71.73843
Iteration: 37/200 (18.5%)  Loss: 114.64564
Iteration: 38/200 (19.0%)  Loss: 64.79031
Iteration: 39/200 (19.5%)  Loss: 

Evaluating the model: 13/200 (6.5%)
Evaluating the model: 14/200 (7.0%)
Evaluating the model: 15/200 (7.5%)
Evaluating the model: 16/200 (8.0%)
Evaluating the model: 17/200 (8.5%)
Evaluating the model: 18/200 (9.0%)
Evaluating the model: 19/200 (9.5%)
Evaluating the model: 20/200 (10.0%)
Evaluating the model: 21/200 (10.5%)
Evaluating the model: 22/200 (11.0%)
Evaluating the model: 23/200 (11.5%)
Evaluating the model: 24/200 (12.0%)
Evaluating the model: 25/200 (12.5%)
Evaluating the model: 26/200 (13.0%)
Evaluating the model: 27/200 (13.5%)
Evaluating the model: 28/200 (14.0%)
Evaluating the model: 29/200 (14.5%)
Evaluating the model: 30/200 (15.0%)
Evaluating the model: 31/200 (15.5%)
Evaluating the model: 32/200 (16.0%)
Evaluating the model: 33/200 (16.5%)
Evaluating the model: 34/200 (17.0%)
Evaluating the model: 35/200 (17.5%)
Evaluating the model: 36/200 (18.0%)
Evaluating the model: 37/200 (18.5%)
Evaluating the model: 38/200 (19.0%)
Evaluating the model: 39/200 (19.5%)
Evaluati

Iteration: 29/200 (14.5%)  Loss: 28.84146
Iteration: 30/200 (15.0%)  Loss: 18.09091
Iteration: 31/200 (15.5%)  Loss: 29.34911
Iteration: 32/200 (16.0%)  Loss: 27.00686
Iteration: 33/200 (16.5%)  Loss: 20.77533
Iteration: 34/200 (17.0%)  Loss: 33.88227
Iteration: 35/200 (17.5%)  Loss: 25.19869
Iteration: 36/200 (18.0%)  Loss: 20.87011
Iteration: 37/200 (18.5%)  Loss: 50.13494
Iteration: 38/200 (19.0%)  Loss: 26.92312
Iteration: 39/200 (19.5%)  Loss: 18.53452
Iteration: 40/200 (20.0%)  Loss: 33.74566
Iteration: 41/200 (20.5%)  Loss: 27.13927
Iteration: 42/200 (21.0%)  Loss: 30.66907
Iteration: 43/200 (21.5%)  Loss: 20.55213
Iteration: 44/200 (22.0%)  Loss: 38.40760
Iteration: 45/200 (22.5%)  Loss: 19.08531
Iteration: 46/200 (23.0%)  Loss: 25.53761
Iteration: 47/200 (23.5%)  Loss: 24.15660
Iteration: 48/200 (24.0%)  Loss: 26.64197
Iteration: 49/200 (24.5%)  Loss: 23.45428
Iteration: 50/200 (25.0%)  Loss: 15.45543
Iteration: 51/200 (25.5%)  Loss: 22.71687
Iteration: 52/200 (26.0%)  Loss: 1

Evaluating the model: 27/200 (13.5%)
Evaluating the model: 28/200 (14.0%)
Evaluating the model: 29/200 (14.5%)
Evaluating the model: 30/200 (15.0%)
Evaluating the model: 31/200 (15.5%)
Evaluating the model: 32/200 (16.0%)
Evaluating the model: 33/200 (16.5%)
Evaluating the model: 34/200 (17.0%)
Evaluating the model: 35/200 (17.5%)
Evaluating the model: 36/200 (18.0%)
Evaluating the model: 37/200 (18.5%)
Evaluating the model: 38/200 (19.0%)
Evaluating the model: 39/200 (19.5%)
Evaluating the model: 40/200 (20.0%)
Evaluating the model: 41/200 (20.5%)
Evaluating the model: 42/200 (21.0%)
Evaluating the model: 43/200 (21.5%)
Evaluating the model: 44/200 (22.0%)
Evaluating the model: 45/200 (22.5%)
Evaluating the model: 46/200 (23.0%)
Evaluating the model: 47/200 (23.5%)
Evaluating the model: 48/200 (24.0%)
Evaluating the model: 49/200 (24.5%)
Evaluating the model: 50/200 (25.0%)
Evaluating the model: 51/200 (25.5%)
Evaluating the model: 52/200 (26.0%)
Evaluating the model: 53/200 (26.5%)
E

Iteration: 41/200 (20.5%)  Loss: 5.61516
Iteration: 42/200 (21.0%)  Loss: 12.07119
Iteration: 43/200 (21.5%)  Loss: 6.87112
Iteration: 44/200 (22.0%)  Loss: 10.19197
Iteration: 45/200 (22.5%)  Loss: 5.65424
Iteration: 46/200 (23.0%)  Loss: 6.81945
Iteration: 47/200 (23.5%)  Loss: 5.10301
Iteration: 48/200 (24.0%)  Loss: 5.35861
Iteration: 49/200 (24.5%)  Loss: 8.75535
Iteration: 50/200 (25.0%)  Loss: 10.21100
Iteration: 51/200 (25.5%)  Loss: 7.65736
Iteration: 52/200 (26.0%)  Loss: 5.80542
Iteration: 53/200 (26.5%)  Loss: 6.33906
Iteration: 54/200 (27.0%)  Loss: 7.07915
Iteration: 55/200 (27.5%)  Loss: 12.54118
Iteration: 56/200 (28.0%)  Loss: 14.16590
Iteration: 57/200 (28.5%)  Loss: 13.73232
Iteration: 58/200 (29.0%)  Loss: 11.56656
Iteration: 59/200 (29.5%)  Loss: 6.45049
Iteration: 60/200 (30.0%)  Loss: 12.90474
Iteration: 61/200 (30.5%)  Loss: 9.07092
Iteration: 62/200 (31.0%)  Loss: 3.79937
Iteration: 63/200 (31.5%)  Loss: 8.25820
Iteration: 64/200 (32.0%)  Loss: 8.51189
Iteratio

Evaluating the model: 47/200 (23.5%)
Evaluating the model: 48/200 (24.0%)
Evaluating the model: 49/200 (24.5%)
Evaluating the model: 50/200 (25.0%)
Evaluating the model: 51/200 (25.5%)
Evaluating the model: 52/200 (26.0%)
Evaluating the model: 53/200 (26.5%)
Evaluating the model: 54/200 (27.0%)
Evaluating the model: 55/200 (27.5%)
Evaluating the model: 56/200 (28.0%)
Evaluating the model: 57/200 (28.5%)
Evaluating the model: 58/200 (29.0%)
Evaluating the model: 59/200 (29.5%)
Evaluating the model: 60/200 (30.0%)
Evaluating the model: 61/200 (30.5%)
Evaluating the model: 62/200 (31.0%)
Evaluating the model: 63/200 (31.5%)
Evaluating the model: 64/200 (32.0%)
Evaluating the model: 65/200 (32.5%)
Evaluating the model: 66/200 (33.0%)
Evaluating the model: 67/200 (33.5%)
Evaluating the model: 68/200 (34.0%)
Evaluating the model: 69/200 (34.5%)
Evaluating the model: 70/200 (35.0%)
Evaluating the model: 71/200 (35.5%)
Evaluating the model: 72/200 (36.0%)
Evaluating the model: 73/200 (36.5%)
E

### Let's build our model now

In [30]:
# Build the ABC-based CNN inside a dedicated graph so its ops and variables
# stay separate from the default graph.
custom_graph = tf.Graph()
with custom_graph.as_default():
    # One alpha-fitting op per ABC layer. The training loop runs these
    # separately from the main optimizer step.
    alphas_training_operations = []

    # Inputs: a batch of images and a batch of label rows of width num_classes.
    x = tf.placeholder(tf.float32, shape=[None, image_size, image_size, num_channels], name='x')
    y = tf.placeholder(tf.float32, shape=[None, num_classes], name='y')
    y_ = y
    # Use the same dimensions as the placeholder instead of repeating the
    # literal 32x32x3, so the two cannot drift apart.
    x_image = tf.reshape(x, [-1, image_size, image_size, num_channels])

    # Convolution Layer 1: weights/bias are initialised from `values` and the
    # convolution is approximated by the ABC layer.
    W_conv1 = tf.Variable(values["W_conv1"], name="W_conv1")
    b_conv1 = tf.Variable(values["b_conv1"], name="b_conv1")
    alphas_training_op1, ABCLayer1, alphas_loss1 = ABC(W_conv1, b_conv1,
                                                       no_binary_filters=5,
                                                       no_ApproxConvLayers=5,
                                                       padding="SAME")
    alphas_training_operations.append(alphas_training_op1)
    conv1 = ABCLayer1(x_image)
    pool1 = max_pool_2x2(conv1)
    # NOTE(review): tf.layers.batch_normalization defaults to training=False,
    # so this normalises with the moving statistics (which nothing in this
    # cell updates) rather than with batch statistics -- confirm intended.
    bn_conv1 = tf.layers.batch_normalization(pool1, axis=-1)
    h_conv1 = tf.nn.relu(bn_conv1)

    # Convolution Layer 2: same structure as layer 1.
    W_conv2 = tf.Variable(values["W_conv2"], name="W_conv2")
    b_conv2 = tf.Variable(values["b_conv2"], name="b_conv2")
    alphas_training_op2, ABCLayer2, alphas_loss2 = ABC(W_conv2, b_conv2,
                                                       no_binary_filters=5,
                                                       no_ApproxConvLayers=5,
                                                       padding="SAME")
    alphas_training_operations.append(alphas_training_op2)
    conv2 = ABCLayer2(h_conv1)
    pool2 = max_pool_2x2(conv2)
    # NOTE(review): same training=False caveat as for bn_conv1.
    bn_conv2 = tf.layers.batch_normalization(pool2, axis=-1)
    h_conv2 = tf.nn.relu(bn_conv2)

    # Flatten the conv2 output. The size is computed once and shared by the
    # reshape and the first dense layer. It equals the original 8*8*64 when
    # image_size == 32.
    # Assumes each max_pool_2x2 halves the spatial size and that conv2 emits
    # 64 channels -- TODO confirm against max_pool_2x2 / values["W_conv2"].
    flat_dim = (image_size // 4) * (image_size // 4) * 64
    h_conv2_flat = tf.reshape(h_conv2, shape=(-1, flat_dim))

    # Dense layer 1
    W_fc1 = weight_variable([flat_dim, 1024])
    b_fc1 = bias_variable([1024])
    h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, W_fc1) + b_fc1)

    # Dropout; keep_prob is fed at run time (the training loop feeds 0.5 for
    # training and 1.0 for evaluation).
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # Output layer: one logit per class, matching the width of `y`.
    W_fc2 = weight_variable([1024, num_classes])
    b_fc2 = bias_variable([num_classes])
    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

    # Loss, optimizer and accuracy metric.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Created last so it covers every variable defined above, including the
    # optimizer's slot variables.
    graph_init = tf.global_variables_initializer()

In [None]:
n_epochs = 5
batch_size = 32
alpha_training_epochs = 200
# Mini-batches per training epoch; the same count is used for the
# end-of-epoch evaluation pass.
n_iterations_per_epoch = 200


def _train_alphas(session):
    """Run every ABC alpha-fitting op `alpha_training_epochs` times."""
    for alpha_training_op in alphas_training_operations:
        for _ in range(alpha_training_epochs):
            session.run(alpha_training_op)


def _get_batch(step):
    """Return the (data, labels) training slice selected by `step`.

    The offset wraps around so a full batch of `batch_size` rows is always
    available.
    """
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    data = train_dataset[offset:(offset + batch_size), :, :, :]
    labels = train_labels[offset:(offset + batch_size), :]
    return data, labels


with tf.Session(graph=custom_graph) as sess:
    sess.run(graph_init)
    for epoch in range(n_epochs):
        for iteration in range(1, n_iterations_per_epoch + 1):
            # Re-fit the alphas before each weight update.
            _train_alphas(sess)

            # Use a step counter that keeps growing across epochs. Indexing
            # by `iteration` alone restarts at the same offsets every epoch,
            # so the model would only ever see the first
            # n_iterations_per_epoch batches of the training set.
            global_step = epoch * n_iterations_per_epoch + iteration
            batch_data, batch_labels = _get_batch(global_step)

            # Run one optimizer step and fetch the loss (dropout active).
            _, loss_train = sess.run([train_step, cross_entropy],
                                     feed_dict={x: batch_data, y: batch_labels, keep_prob: 0.5})

            print("\rIteration: {}/{} ({:.1f}%)  Loss: {:.5f}".format(
                      iteration, n_iterations_per_epoch,
                      iteration * 100 / n_iterations_per_epoch,
                      loss_train),
                  end="")

        # At the end of each epoch, measure loss and accuracy.
        # Re-fit the alphas first so the evaluation uses alphas that match
        # the latest weights.
        _train_alphas(sess)

        # NOTE(review): the batches scored here are slices of the *training*
        # arrays, although the summary line reports them as "Val accuracy".
        # Switch to a held-out set if one is available.
        # The evaluation offsets depend only on `iteration`, so every epoch
        # is scored on the same fixed batches.
        loss_vals = []
        acc_vals = []
        for iteration in range(1, n_iterations_per_epoch + 1):
            batch_data, batch_labels = _get_batch(iteration)

            # Dropout disabled (keep_prob = 1.0) for evaluation.
            acc_val, loss_val = sess.run([accuracy, cross_entropy],
                                         feed_dict={x: batch_data, y: batch_labels, keep_prob: 1.0})
            loss_vals.append(loss_val)
            acc_vals.append(acc_val)
            print("\rEvaluating the model: {}/{} ({:.1f}%)".format(
                      iteration, n_iterations_per_epoch,
                      iteration * 100 / n_iterations_per_epoch),
                  end=" " * 10)
        loss_val = np.mean(loss_vals)
        acc_val = np.mean(acc_vals)
        print("\rEpoch: {}  Val accuracy: {:.4f}%  Loss: {:.6f}".format(
            epoch + 1, acc_val * 100, loss_val))

Iteration: 28/200 (14.0%)  Loss: 112.48172