# Implementing Transfer Learning


In [None]:
import tensorflow as tf
import numpy as np
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.datasets import cifar10
from tensorflow.keras import Input
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.models import Model

## Load the dataset

In [None]:
# Download (on first use) and load CIFAR-10 as separate train and test splits
cifar_train, cifar_test = cifar10.load_data()
X_train, Y_train = cifar_train
X_test, Y_test = cifar_test

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [None]:
# Keep only 10% of the CIFAR-10 training images for training and hold out the
# remaining 90% for validation; random_state pins the shuffle.
# The label subset is bound to `Y_train` (capital Y) because the one-hot
# encoding cell below reads `Y_train`. Binding it to `y_train` would leave
# `Y_train` pointing at the full label set, out of step with `X_train`.
X_train, X_val, Y_train, y_val = train_test_split(
    X_train, Y_train, train_size=0.10, random_state=42
)

## Normalise

In [None]:
# Scale 8-bit pixel intensities to the range [0, 1].
# The validation images are scaled as well so that every split fed to the
# model is on the same scale. Run this cell only once per kernel session:
# re-running it would divide by 255 again.
X_train = X_train / 255
X_val = X_val / 255
X_test = X_test / 255

# One hot encoding

In [None]:
# CIFAR-10 has 10 classes; define the constant first so it is used everywhere below
num_classes = 10

# Convert integer class labels to one-hot rows of length `num_classes`.
# The validation labels are encoded too, so they have the same format as the
# training and test labels.
Y_train = np_utils.to_categorical(Y_train, num_classes)
y_val = np_utils.to_categorical(y_val, num_classes)
Y_test = np_utils.to_categorical(Y_test, num_classes)

## Task:

Take a moment to look at the various models you can import in Keras: https://keras.io/applications/ and look for "Documentation for individual models". Spend some time looking at the documentation for the VGG16 model. Pay attention to "include_top" and "weights". 

What is the difference between including or not including the top?

What is the difference between using the imagenet weights and not using them?

What is imagenet?

## Import VGG16

In [None]:
from tensorflow.keras.applications import VGG16

Pay attention to  
```
include_top = False
```



In [None]:
vgg_conv = VGG16(weights='imagenet', # load the weights obtained by pre-training on ImageNet
                  include_top=False, # leave out the fully connected classifier layers at the top of the network
                  input_shape=(32, 32, 3)) # shape of the images we will feed in: (height, width, channels)
                  # NOTE: input_shape only declares the size of the inputs the model expects; it does
                  # not make Keras resize the data for us. Images must already be 32x32 (the smallest
                  # size VGG16 accepts), and the depth must be 3 to match the pre-trained weights.
                  # (** this is an important point - worth discussing further **)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


## View what we have so far

In [None]:
# Layer-by-layer overview; with 32x32 inputs each pooling layer halves the spatial size (32 -> 16 -> 8 ...)
vgg_conv.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 block1_conv1 (Conv2D)       (None, 32, 32, 64)        1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 32, 32, 64)        36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 16, 16, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 16, 16, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 16, 16, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 8, 8, 128)         0     

# Import VGG16 but this time include the top of the network as well. Compare the differences.

In [None]:
# Full VGG16 this time: ImageNet weights together with the original classifier head
vgg_conv = VGG16(
    include_top=True,
    weights="imagenet",
    input_shape=(224, 224, 3),
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


## Have a look at the summary now and compare the number of parameters. Where do the majority of the parameters come from, and why?

In [None]:
# Same summary for the full model; compare the parameter totals with the include_top=False version above
vgg_conv.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

# Freezing layers

## We can specify which layers to freeze using:



```
layer.trainable = False
```



## Task: write a single line to freeze the first convolutional layer of the model.

But first, print out the number of trainable parameters for the model so that you can later compare things. For this task, load the weights for VGG16 and set `include_top=False`

In [None]:
# Pre-trained convolutional base only (no classifier head), sized for 224x224 RGB inputs
vgg_conv = VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
)

Check the parameters to make sure you did this correctly

In [None]:
# Note the trainable-parameter count now, before anything is frozen, so it can be compared later
vgg_conv.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

You can freeze layer *i* by setting its `trainable` attribute to `False`: `vgg_conv.layers[i].trainable = False`.

In our case, we want to freeze the first convolutional layer, is this layer 0 or layer 1? To find out, look at the summary above.

In [None]:
# Index 0 is the InputLayer, so the first convolutional layer (block1_conv1) is index 1
vgg_conv.layers[1].trainable = False

## Task: verify that you were successful by printing the summary of the model. Have a look at the number of parameters in the first convolutional layer and compare that to the number of non-trainable parameters. Are they the same?

In [None]:
# block1_conv1 has 1,792 parameters, so the non-trainable count should now be 1,792
vgg_conv.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

## Write a for loop and iterate over the layers of the network and freeze some of them. Freeze only the first 6 layers.

Remember that to freeze a particular layer *i* we can achieve that with the following code: `vgg_conv.layers[i].trainable = False`

Are layers numbered from 0 or from 1?

In [None]:
# Layers are numbered from 0, so the slice [:n_frozen] covers indices 0 to 5
n_frozen = 6
for frozen_layer in vgg_conv.layers[:n_frozen]:
    frozen_layer.trainable = False

In [None]:
# One line per layer, in network order: False = frozen, True = trainable
for is_trainable in [layer.trainable for layer in vgg_conv.layers]:
    print(is_trainable)

False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True


## Now take a look at the number of parameters, what is different?

In [None]:
# Layers 0-5 hold 1,792 + 36,928 + 73,856 + 147,584 = 260,160 parameters,
# which should now be counted as non-trainable
vgg_conv.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

## Task: actually, let's not freeze the first 6 layers, let's only freeze the first 5 layers. How can we undo the freezing of the 6th layer that we just did?

In [None]:
# Index 5 (block2_conv2) is the 6th entry in vgg_conv.layers; make it trainable again
vgg_conv.layers[5].trainable = True

## Task: now take a look at the number of parameters, what is different?

In [None]:
# Re-enabling block2_conv2 moves its 147,584 parameters back to the trainable count
# (260,160 - 147,584 = 112,576 remain frozen)
vgg_conv.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

# Transfer learning without fine-tuning the feature extractor

We will use the functional API approach.

* We want to use VGG16

* We want to use the pre-trained weights

* We won't include the classifier part, rather we will make our own

In [None]:
# ImageNet-pre-trained convolutional base without the classifier head, for 32x32 RGB inputs
vgg_conv = VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(32, 32, 3),
)

In [None]:
# First we set the entire feature extractor to non-trainable, so that only the
# layers we add on top of it afterwards are updated during training
vgg_conv.trainable = False

In [None]:
# New classifier head: flatten the convolutional features, one hidden layer, 10-way softmax
flat_features = Flatten()(vgg_conv.output)
x = Dense(64, activation="relu")(flat_features)
output = Dense(10, activation="softmax")(x)

In [None]:
# Tie the VGG16 input to the new softmax output to obtain the complete model
model = Model(vgg_conv.input, output)

In [None]:
# The summary should list the VGG16 layers followed by the new Flatten and two Dense layers
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 block1_conv1 (Conv2D)       (None, 32, 32, 64)        1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 32, 32, 64)        36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 16, 16, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 16, 16, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 16, 16, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 8, 8, 128)         0     

## You can obtain the output of a particular layer as follows:

In this case we use the vgg_conv model and get the layers from the input to ```block2_pool``` (or any other layer). 


* Here we extract a particular layer by its name

* Then we flatten

* And then create the model

* This would be useful if you wanted to pass the output of a trained model at a given layer to another machine learning algorithm (e.g. SVM)

* Look at the summary() of vgg_conv to see this in action. For this task, we take the output of `block2_pool`.

In [None]:
# Look the layer up by name, take its output tensor and flatten it in one step
x = Flatten()(vgg_conv.get_layer('block2_pool').output)

In [None]:
# Model that maps the VGG16 input to the flattened block2_pool features
intermediate_layer_model = Model(vgg_conv.input, x)

In [None]:
# This model stops at block2_pool plus the Flatten layer; the deeper VGG16 blocks are not part of it
intermediate_layer_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 block1_conv1 (Conv2D)       (None, 32, 32, 64)        1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 32, 32, 64)        36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 16, 16, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 16, 16, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 16, 16, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 8, 8, 128)         0   

You can have a look at the layers of a model as follows:

In [None]:
# Eight layers: the input, everything up to block2_pool, and the Flatten layer added above
intermediate_layer_model.layers

[<keras.engine.input_layer.InputLayer at 0x7fa9706ee590>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa9706ee9d0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa9706ee8d0>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x7fa9706d28d0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa9706f5a10>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa97071f810>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x7fa9706bd210>,
 <keras.layers.reshaping.flatten.Flatten at 0x7fa97065e750>]

# Transfer learning with fine-tuning the feature extractor

We will use the functional API approach.

* We want to use VGG16

* We want to use the pre-trained weights

* We won't include the classifier part, rather we will make our own

In [None]:
# Fresh copy of the pre-trained convolutional base (no classifier head) for 32x32 RGB inputs
vgg_conv = VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(32, 32, 3),
)

In [None]:
# This time we leave the entire feature extractor trainable, so the pre-trained
# convolutional weights are fine-tuned together with the new classifier head
vgg_conv.trainable = True

In [None]:
# Same head as before: flattened features -> 64 ReLU units -> 10-way softmax
flattened = Flatten()(vgg_conv.output)
x = Dense(64, activation="relu")(flattened)
output = Dense(10, activation="softmax")(x)

In [None]:
# Build the end-to-end model from the VGG16 input tensor to the new softmax output
model = Model(inputs=vgg_conv.input, outputs=output)

In [None]:
# Same layers as the frozen variant; the difference is in the trainable / non-trainable parameter counts
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 block1_conv1 (Conv2D)       (None, 32, 32, 64)        1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 32, 32, 64)        36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 16, 16, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 16, 16, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 16, 16, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 8, 8, 128)         0   

# Transfer learning without fine-tuning the feature extractor and only deleting last dense layer (fine tune dense layers only)

* We want to use VGG16

* We want to use the pre-trained weights

* We will include the classifier head, but then we will delete the last softmax layer and add our own.

* We will freeze the feature extractor

In [None]:
# Complete VGG16 including its classifier head; its final softmax layer is swapped out below
vgg_conv = VGG16(
    include_top=True,
    weights="imagenet",
    input_shape=(224, 224, 3),
)

In [None]:
# With include_top=True the summary also shows the classifier head, ending in the softmax layer we will replace
vgg_conv.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
# Get the index and name of each layer.
# Printing index/name pairs (rather than the raw layer objects) makes it easy
# to see which index to pass to vgg_conv.layers[...] in the cells that follow.
for index, layer in enumerate(vgg_conv.layers):
    print(index, layer.name)

[<keras.engine.input_layer.InputLayer at 0x7fa9707bab10>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa984a3cdd0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa984a3c850>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x7fa9f588ef90>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa9706eb4d0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa9706bad50>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x7fa9705dddd0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa9705846d0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa970584fd0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa9706cfad0>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x7fa970671f90>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa970786f10>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa970613890>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7fa9705e5250>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x7fa9705842d0>,
 <keras.layers.convolutional.con

In [None]:
# Freeze feature extractor: indices 0-18, i.e. the input plus the convolutional and
# pooling layers (assuming the standard VGG16 layout, where the classifier head starts at index 19)
n_frozen = 19
for frozen_layer in vgg_conv.layers[:n_frozen]:
    frozen_layer.trainable = False

In [None]:
# Output of the second-to-last layer, i.e. the network without its original final softmax layer
x = vgg_conv.layers[-2].output 

In [None]:
# New 10-class softmax layer that takes the place of the original output layer.
# We will name it the same as it was before "predictions"
output = Dense(10, activation="softmax", name='predictions')(x)

In [None]:
# Assemble the model: original VGG16 input, new 10-class output
model = Model(inputs=vgg_conv.input, outputs=output)

In [None]:
# Only the layers after the frozen feature extractor, including the new 10-way 'predictions' layer, should count as trainable
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [None]:
# Check: the trainable parameters are those of the three dense layers left unfrozen.
# Each dense layer has (inputs * units) weights plus one bias per unit.
fc1_params = 25088 * 4096 + 4096         # first dense layer: 25,088 (= 7*7*512) flattened inputs -> 4,096 units
fc2_params = 4096 * 4096 + 4096          # second dense layer: 4,096 -> 4,096 units
predictions_params = 4096 * 10 + 10      # new softmax layer: 4,096 -> 10 classes
fc1_params + fc2_params + predictions_params

119586826

## 4 cases:

Assume that a model X is trained on some data, which we call the original dataset.


**1)** new dataset is small and similar to the original: 

Solution: Download the weights. Freeze the majority of the lower end of the network, replace the original fully connected layers with new ones, and train only those new fully connected layers.

<br>

**2)** new dataset is large and similar to the original:

Solution: with more data there isn't a need to worry about overfitting. All of the layers can be set to trainable. Download the weights and fine-tune them by training on the new dataset.

<br>

**3)** new dataset is small and has different data to the original:

Solution: the higher level features in the network will not be useful since the datasets are different. Keep (freeze) the lower level features and remove the higher level ones. Add new fully connected layers.


<br>

**4)** new dataset is large and has different data to the original:

Solution: overfitting is not an issue here. Fine-tune or re-train the model. Fine-tuning is easier since the weights are not randomly initialised and less time is spent learning simple lower level filters

**Case 1**: example on setting layers to non-trainable

In [None]:
# freeze feature extractor parts: here just the first three entries (indices 0-2)
for early_layer in vgg_conv.layers[:3]:
    early_layer.trainable = False

**Case 2 and 4**: example on setting layers to trainable

In [None]:
# fine-tune feature extractor parts: make the same three entries (indices 0-2) trainable again
for early_layer in vgg_conv.layers[:3]:
    early_layer.trainable = True

## Stacking layers

Sometimes our data might not match the input dimensions (depth) expected by the model. E.g. MNIST data has a depth of 1, whereas most models expect a depth of 3. This is one way of dealing with this:

In [None]:
from keras.datasets import fashion_mnist

# Load Fashion-MNIST through the loader imported above (the same way cifar10 is
# used earlier) instead of importing it and then calling a different path.
# NOTE: this rebinds X_train / Y_train / X_test / Y_test, replacing the CIFAR-10 arrays.
(X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data()

# Keep 10% of the training images for training; the other 90% become the validation set
X_train, X_val, Y_train, y_val = train_test_split(
    X_train, Y_train, train_size=0.10, random_state=42
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
# One-hot encode the training and validation labels (10 classes)
Y_train, y_val = (
    np_utils.to_categorical(labels, 10) for labels in (Y_train, y_val)
)
num_classes = 10

In [None]:
# Repeat each greyscale image three times along a new last axis to fake an RGB depth of 3
X_train_stacked = np.stack((X_train,) * 3, axis=-1)
X_val_stacked = np.stack((X_val,) * 3, axis=-1)

In [None]:
# 6,000 images (10% of the 60,000 training images), 28x28 pixels, now with 3 identical channels
X_train_stacked.shape

(6000, 28, 28, 3)

In [None]:
# One one-hot row of length 10 per training image
Y_train.shape

(6000, 10)

In [None]:
# The remaining 54,000 images form the validation set
X_val_stacked.shape

(54000, 28, 28, 3)

In [None]:
# One one-hot row of length 10 per validation image
y_val.shape

(54000, 10)

In [None]:
# VGG16 accepts nothing smaller than 32x32, so the 28x28 images are resized up to that size.
# Only the spatial size changes here; no pixel-value scaling is applied in this cell.
target_size = [32, 32]
X_train_stacked = tf.image.resize(X_train_stacked, target_size)
X_val_stacked = tf.image.resize(X_val_stacked, target_size)

In [None]:
# After resizing: still 6,000 images, now 32x32x3 and held as a TensorFlow tensor
X_train_stacked.shape

TensorShape([6000, 32, 32, 3])

# A quick full example

In [None]:
# Frozen VGG16 feature extractor (ImageNet weights, no classifier head) for 32x32 RGB inputs
vgg_conv = VGG16(include_top=False, weights="imagenet", input_shape=(32, 32, 3))
vgg_conv.trainable = False

# Small trainable head: flatten -> 64 ReLU units -> 10-way softmax
flat_features = Flatten()(vgg_conv.output)
x = Dense(64, activation="relu")(flat_features)
output = Dense(10, activation="softmax")(x)

# Tie the VGG16 input to the new output to obtain the full model
model = Model(inputs=vgg_conv.input, outputs=output)

In [None]:
# Configure training: Adam optimiser, cross-entropy over one-hot labels, accuracy as the reported metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 20 epochs on the stacked and resized training images.
# vgg_conv was set to non-trainable when the model was built, so only the new Dense layers are updated.
model.fit(X_train_stacked, Y_train, epochs=20, batch_size=32, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa970514c90>

In [None]:
# Softmax outputs for every validation image: one row of 10 class probabilities each
predictions = model.predict(X_val_stacked)



In [None]:
# Collapse the one-hot labels and the predicted probabilities to class indices
correct_values, predicted_classes = (
    np.argmax(scores, axis=-1) for scores in (y_val, predictions)
)

In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy as a percentage.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
# labels go first and the predictions second.
accuracy_score(correct_values, predicted_classes) * 100

79.86851851851851