In [None]:
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Flatten, BatchNormalization
import tensorflow as tf
from keras.datasets import fashion_mnist
from keras.utils import np_utils
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Densely Connected Networks (DenseNet)

ResNet significantly changed the view of how to parametrize the functions in deep networks. *DenseNet* (dense convolutional network) is to some extent the logical extension of this.
As a result,
DenseNet 
is characterized by
both the connectivity pattern where
each layer connects to all the preceding layers
and the concatenation operation (rather than the addition operator in ResNet) to preserve and reuse features
from earlier layers.


## From ResNet to DenseNet

ResNet decomposes functions into

$$f(\mathbf{x}) = \mathbf{x} + g(\mathbf{x}).$$

That is, ResNet decomposes $f$ into a simple linear term and a more complex
nonlinear one.
What if we want to capture (not necessarily add) information beyond two terms?
One solution was
DenseNet.

![The main difference between ResNet (left) and DenseNet (right) in cross-layer connections: use of addition and use of concatenation. ](https://github.com/d2l-ai/d2l-tensorflow-colab/blob/master/img/densenet-block.svg?raw=1)


As shown in the figure, the key difference between ResNet and DenseNet is that in the latter case outputs are *concatenated* (denoted by $[,]$) rather than added.
As a result, we perform a mapping from $\mathbf{x}$ to its values after applying an increasingly complex sequence of functions:

$$\mathbf{x} \to \left[
\mathbf{x},
f_1(\mathbf{x}),
f_2([\mathbf{x}, f_1(\mathbf{x})]), f_3([\mathbf{x}, f_1(\mathbf{x}), f_2([\mathbf{x}, f_1(\mathbf{x})])]), \ldots\right].$$

In the end, all these functions are combined in an MLP to reduce the number of features again. In terms of implementation this is quite simple:
rather than adding terms, we concatenate them. The name DenseNet arises from the fact that the dependency graph between variables becomes quite dense. The last layer of such a chain is densely connected to all previous layers. The dense connections are shown below.

![Dense connections in DenseNet.](http://d2l.ai/_images/densenet.svg)

The main components that compose a DenseNet are ***dense blocks*** and ***transition layers***. The former define how the inputs and outputs are concatenated, while the latter control the number of channels so that it is not too large.

The differences between DenseNet and ResNet are shown below.
![](https://i.imgur.com/z3n0Afg.png)




## **Dense Blocks**

First we go step by step using individual cells, and then we wrap the same steps up in a reusable layer class.

In [None]:
# Dummy input to experiment with: a batch of 4 tensors, each 8x8x3,
# filled with uniformly distributed random values
X = tf.random.uniform(shape=(4, 8, 8, 3))

In [None]:
# The first operation is to normalise the input: create a batch-normalisation
# layer (it is only created here; it is applied to X further below)
bn = tf.keras.layers.BatchNormalization()

In [None]:
# The second operation: a 3x3 convolution with 16 filters followed by a ReLU.
# 'same' padding keeps the spatial size of the input unchanged.
conv = tf.keras.layers.Conv2D(
    filters=16,
    kernel_size=(3, 3),
    padding='same',
    activation='relu',
)

In [None]:
# These are the two operations, listed in the order they will be applied
listLayers = [bn, conv]

In [None]:
# Start the chain from the raw input X
y = X

# Feed the running result through each operation in turn; printing the layer
# makes the order of application visible
for layer in listLayers:
    print(layer)
    y = layer(y)

# Batch normalisation is applied to X first, and conv2d is then applied to
# that result, so after the loop: y = conv2d(batchNorm(X))

<keras.layers.normalization.batch_normalization.BatchNormalization object at 0x7f9a65ab2690>
<keras.layers.convolutional.conv2d.Conv2D object at 0x7f9a65741ed0>


In [None]:
# Now we concatenate the input X with y = conv2d(batchNorm(X)) along the
# last axis (axis=-1)
y = tf.keras.layers.concatenate([X,y], axis=-1)
# y = [X, conv2d(batchNorm(X))]

In [None]:
# the width and height are unchanged, which makes sense since we used 'same' padding
# we created 16 filters so this means we have 16 feature maps, and X had a depth of
# 3, so 16 + 3 = 19.
y.shape

TensorShape([4, 8, 8, 19])

In [None]:
class ConvBlock(tf.keras.layers.Layer):
    """One convolution block of a dense block.

    The input goes through batch normalisation, a ReLU and two consecutive
    3x3 'same'-padded convolutions with ``num_channels`` filters each. The
    result is then concatenated with the block's own input along the last
    axis, so the output has ``num_channels`` more entries on that axis than
    the input.

    Args:
        num_channels: Number of filters of each of the two convolutions.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, num_channels, **kwargs):
        super(ConvBlock, self).__init__(**kwargs)

        # The batch normalisation layer
        self.bn = tf.keras.layers.BatchNormalization()

        # A relu activation
        self.relu = tf.keras.layers.ReLU()

        # Two linear 3x3 convolutions (no activation of their own)
        self.conv = tf.keras.layers.Conv2D(
            filters=num_channels, kernel_size=(3, 3), padding='same')
        self.conv2 = tf.keras.layers.Conv2D(
            filters=num_channels, kernel_size=(3, 3), padding='same')

        # Order of operations: BN, then relu, then conv2d, then conv2d again
        self.listLayers = [self.bn, self.relu, self.conv, self.conv2]

    def call(self, x):
        """Return ``[x, Conv2d(Conv2d(ReLU(BN(x))))]`` joined on the last axis."""
        # First set y to be the input x
        y = x

        # Apply the operations in order. We iterate over the list itself
        # instead of a `.layers` attribute of the list: that attribute is not
        # part of the public Keras API and is missing in newer versions.
        for layer in self.listLayers:
            y = layer(y)

        # Finally, concatenate the untouched input x with the new features y
        return tf.keras.layers.concatenate([x, y], axis=-1)

A *dense block* consists of multiple convolution blocks, each using the same number of output channels. In the forward propagation, however, we concatenate the input and output of each convolution block on the channel dimension.


In [None]:
class DenseBlock(tf.keras.layers.Layer):
    """A dense block: ``num_convs`` ConvBlocks applied one after another.

    Every ConvBlock returns its input concatenated with its newly computed
    features, so each block sees everything produced before it, and the
    output has ``num_convs * num_channels`` more channels than the input.

    Args:
        num_convs: How many ConvBlocks to chain.
        num_channels: Number of channels each ConvBlock adds (the growth rate).
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, num_convs, num_channels, **kwargs):
        super(DenseBlock, self).__init__(**kwargs)

        # Create the ConvBlocks up front,
        # e.g. [ConvBlock(10), ConvBlock(10)] for num_convs=2, num_channels=10
        self.listLayers = [ConvBlock(num_channels) for _ in range(num_convs)]

    def call(self, x):
        """Pass ``x`` through every ConvBlock in order and return the result."""
        # Chain the ConvBlocks: for num_convs = 2 this is ConvBlock(ConvBlock(x)).
        # After the first block x holds [x, new features]; the second block
        # receives that whole tensor and appends its own features to it.
        #
        # We iterate over the list itself instead of a `.layers` attribute of
        # the list: that attribute is not part of the public Keras API and is
        # missing in newer versions.
        for layer in self.listLayers:
            x = layer(x)
        return x

In the following example,
we [**define a `DenseBlock` instance**] with 2 convolution blocks of 10 output channels.
When using an input with 3 channels, we will get an output with $23$ channels. 

[X, Conv2d(ReLU(BN(X)))]

[3, 10] which gets concatenated = 13

[13, 10] which gets concatenated = 23


The number of convolution block channels controls the growth in the number of output channels relative to the number of input channels. This is also referred to as the *growth rate*.


In [None]:
# A dense block made of 2 convolution blocks, each adding 10 channels
blk = DenseBlock(num_convs=2, num_channels=10)

# Same kind of dummy input as before: 4 tensors of 8x8x3
X = tf.random.uniform(shape=(4, 8, 8, 3))

# Apply the block and inspect the resulting shape
Y = blk(X)
Y.shape

TensorShape([4, 8, 8, 23])

## **Transition Layers**

* Since each dense block will increase the number of channels, adding too many of them will lead to an excessively complex model. 

* A *transition layer* is used to control the complexity of the model. 

* It reduces the number of channels by using a $1\times 1$ convolutional layer and halves the height and width using an average pooling layer with a stride of 2, further reducing the complexity of the model.


In [None]:
class TransitionBlock(tf.keras.layers.Layer):
    """Transition layer placed between dense blocks.

    Applies batch normalisation, ReLU, a 1x1 convolution with
    ``num_channels`` filters and finally 2x2 average pooling with stride 2.

    Args:
        num_channels: Number of filters of the 1x1 convolution.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, num_channels, **kwargs):
        super().__init__(**kwargs)

        # Normalisation followed by the non-linearity
        self.batch_norm = tf.keras.layers.BatchNormalization()
        self.relu = tf.keras.layers.ReLU()

        # The 1x1 convolution sets the depth of the tensor to num_channels
        self.conv = tf.keras.layers.Conv2D(num_channels, kernel_size=1)

        # Pooling with stride 2 reduces the width and height
        self.avg_pool = tf.keras.layers.AvgPool2D(pool_size=2, strides=2)

    def call(self, x):
        """Return avg_pool(conv(relu(batch_norm(x))))."""
        normalised = self.batch_norm(x)
        activated = self.relu(normalised)
        projected = self.conv(activated)
        return self.avg_pool(projected)

[**Apply a transition layer**] with 10 channels to the output of the dense block in the previous example.  This reduces the number of output channels to 10, and halves the height and width.


In [None]:
# Transition layer with 10 output channels, applied to the dense block output Y
blk = TransitionBlock(num_channels=10)
blk(Y).shape

TensorShape([4, 4, 4, 10])

## **DenseNet Model**

Next, we will construct a DenseNet model. DenseNet first uses the same single convolutional layer and max-pooling layer as in ResNet.


In [None]:
# Stem of the network: a strided 7x7 convolution with 64 filters, batch
# normalisation, ReLU and a strided 3x3 max-pooling layer.
# The model expects 96x96 inputs with a single channel.
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(
        64, kernel_size=7, strides=2, padding='same', input_shape=(96, 96, 1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
    tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same'),
])

* Then, similar to the four modules made up of residual blocks that ResNet uses,
DenseNet uses four dense blocks.
Similar to ResNet, we can set the number of convolutional layers used in each dense block. 

* Here, we set it to 4, consistent with the ResNet-18 model. 

* Furthermore, we set the number of channels (i.e., growth rate) for the convolutional layers in the dense block to 32, so 128 channels will be added to each dense block.

* In ResNet, the height and width are reduced between each module by a residual block with a stride of 2. 

* Here, we use the transition layer to halve the height and width and halve the number of channels. Similar to ResNet, a global pooling layer and a fully connected layer are connected at the end to produce the output.


In [None]:
# Channel count coming out of the stem; tracked by hand so that every
# transition layer can halve it
num_channels = 64
# Channels added by each ConvBlock inside a dense block
growth_rate = 32

# First, second and third dense block, each followed by a transition layer
for _ in range(3):
    model.add(DenseBlock(4, growth_rate))
    # 4 ConvBlocks each add growth_rate channels
    num_channels = num_channels + (4 * growth_rate)
    # Halve the channels. Integer division keeps num_channels an int:
    # Conv2D expects an integer number of filters.
    num_channels = num_channels // 2
    model.add(TransitionBlock(num_channels))

# Fourth dense block: it is not followed by a transition layer
model.add(DenseBlock(4, growth_rate))

# Head: BN + ReLU, global average pooling, then a 10-way softmax classifier
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.GlobalAvgPool2D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(10, activation="softmax"))

In [None]:
# Print every layer with its output shape and number of parameters
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 48, 48, 64)        3200      
                                                                 
 batch_normalization_4 (Batc  (None, 48, 48, 64)       256       
 hNormalization)                                                 
                                                                 
 re_lu_3 (ReLU)              (None, 48, 48, 64)        0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 24, 24, 64)       0         
 )                                                               
                                                                 
 dense_block_1 (DenseBlock)  (None, 24, 24, 192)       130944    
                                                                 
 transition_block_1 (Transit  (None, 12, 12, 96)       1

## Load the dataset

In [None]:
# Load Fashion-MNIST, which comes as (images, labels) pairs for train and test
(X_train, Y_train), (X_test, Y_test) = tf.keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


## Find the unique numbers from the train labels

In [None]:
# Sorted distinct label values found in the training labels
classes = np.unique(Y_train)

# How many different classes there are
nClasses = classes.size

print('Total number of outputs : ', nClasses)
print('Output classes : ', classes)

Total number of outputs :  10
Output classes :  [0 1 2 3 4 5 6 7 8 9]


## Reshape needed

Keras wants to know the depth of an image. 

For CNNs, Keras wants the data in the following format: [batch, height, width, depth]. 

In this case the colour channel/depth of the images is 1. Currently the shape is `(num_images, 28, 28)`, e.g. `(60000, 28, 28)` for the training images.

But this doesn't have a depth value. So we can reshape it

In [None]:
# Append a trailing depth axis of size 1: (N, H, W) -> (N, H, W, 1)
X_train = X_train.reshape(X_train.shape[:3] + (1,))
X_test = X_test.reshape(X_test.shape[:3] + (1,))

## Convert from categorical labels to one-hot encoded vectors

In this case there are 10 classes so we can tell the function to convert into a vector of length 10

In [None]:
# Number of classes, defined first so the conversions below can use it
num_classes = 10

# Turn the integer labels into one-hot vectors of length num_classes.
# tf.keras.utils.to_categorical is the public API, consistent with the
# tf.keras calls used in the rest of this notebook.
Y_train = tf.keras.utils.to_categorical(Y_train, num_classes)
Y_test = tf.keras.utils.to_categorical(Y_test, num_classes)

## Small twist!

API: https://www.tensorflow.org/api_docs/python/tf/data/Dataset

In [None]:
# Wrap the arrays in tf.data Datasets; each element is one (image, label) pair
train_ds = tf.data.Dataset.from_tensor_slices((X_train, Y_train))
test_ds = tf.data.Dataset.from_tensor_slices((X_test, Y_test))

In [None]:
def resize_images(image, label):
    """Standardise one image and resize it to 96x96; the label is untouched.

    Args:
        image: The image tensor of one example.
        label: The label of that example, returned unchanged.

    Returns:
        A ``(image, label)`` tuple holding the preprocessed image.
    """
    # Scale the image to zero mean and unit standard deviation
    standardised = tf.image.per_image_standardization(image)

    # Bring it to the 96x96 resolution the network was built for
    resized = tf.image.resize(standardised, (96, 96))
    return resized, label

In [None]:
# Training pipeline: preprocess every image, shuffle using a 10000-element
# buffer, then form batches of 64 (an incomplete final batch is dropped)
train_ds = train_ds.map(resize_images)
train_ds = train_ds.shuffle(buffer_size=10000)
train_ds = train_ds.batch(batch_size=64, drop_remainder=True)

# Test pipeline: same preprocessing, no shuffling, and every example is kept
test_ds = test_ds.map(resize_images)
test_ds = test_ds.batch(batch_size=32, drop_remainder=False)

In [None]:
# Configure training: Adam optimiser, cross-entropy on the one-hot labels,
# and accuracy reported as the metric
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

## Begin training

In [None]:
# train_ds is already batched (64 per batch), so no batch_size is passed here:
# Keras asks not to specify batch_size when the input is a dataset.
model.fit(train_ds, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9a502fbbd0>

## Predict on all the test data

In [None]:
# Run the model over the batched test set; the softmax output gives one
# row of class scores per test image
predictions = model.predict(test_ds)



In [None]:
# One row per test image, one column per class
predictions.shape

(10000, 10)

In [None]:
# Convert the one-hot test labels back to class indices
correct_values = Y_test.argmax(axis=-1)

# The predicted class is the index of the highest score in each row
predicted_classes = predictions.argmax(axis=-1)

In [None]:
# Test accuracy in percent; accuracy_score expects (y_true, y_pred) in that order
accuracy_score(correct_values, predicted_classes) * 100

86.26

# Tasks

* explore different architecture structures by modifying the number of blocks

* attempt to modify the Conv block

* attempt to modify the transition block

* Here we apply ReLU after batch norm; try applying it before batch norm instead, like we did with ResNet

* Modify the overall network so that not every layer is connected to every previous layer, but rather it skips between two consecutive layers. So layer i is connected to i-2.
