In [158]:
import tensorflow.keras as tf 
import tensorflow as tf1

Lenet is the first Neural Network model to be implemented for the Computer Vision task, but it is useful for the grayscale images. It is used for the bank system to identify the handwritten digits. 

In [159]:
# Architecture of LeNet

In [160]:
lenet_model = tf.Sequential()

In [161]:
lenet_model.add(
    tf.layers.Input(shape = (28, 28, 1))
)

In [162]:
lenet_model.add(
    tf.layers.Conv2D(
        kernel_size = (5, 5), strides = (1, 1), activation = 'relu', filters = 6
    )
)

In [163]:
lenet_model.add(
    tf.layers.AveragePooling2D(pool_size = (2, 2), strides = (2, 2))
)

In [164]:
lenet_model.add(
    tf.layers.Conv2D(
        kernel_size = (5, 5), strides = (1, 1), activation = 'relu', filters = 16
    )
)

In [165]:
lenet_model.add(
    tf.layers.AveragePooling2D(pool_size = (2, 2), strides = (2, 2))
)

In [166]:
lenet_model.add(
    tf.layers.Flatten()
)

In [167]:
lenet_model.add(
    tf.layers.Dense(units = 120, activation = 'relu')
)

In [168]:
lenet_model.add(
    tf.layers.Dense(units = 84, activation = 'relu')
)

In [169]:
lenet_model.add(
    tf.layers.Dense(units = 10,)
)

In [170]:
lenet_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_21 (Conv2D)          (None, 24, 24, 6)         156       
                                                                 
 average_pooling2d_10 (Avera  (None, 12, 12, 6)        0         
 gePooling2D)                                                    
                                                                 
 conv2d_22 (Conv2D)          (None, 8, 8, 16)          2416      
                                                                 
 average_pooling2d_11 (Avera  (None, 4, 4, 16)         0         
 gePooling2D)                                                    
                                                                 
 flatten_9 (Flatten)         (None, 256)               0         
                                                                 
 dense_19 (Dense)            (None, 120)              

In [171]:
# Prepare the train and test dataset.
batch_size = 64
(x_train, y_train), (x_test, y_test) = tf.datasets.mnist.load_data()

# Normalize data
x_train = x_train.astype("float32") / 255.0
x_train = np.reshape(x_train, (-1, 28, 28, 1))

x_test = x_test.astype("float32") / 255.0
x_test = np.reshape(x_test, (-1, 28, 28, 1))


In [172]:
lenet_model.compile(loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True), metrics=[tf.metrics.SparseCategoricalAccuracy()], optimizer = 'adam')

In [173]:
history = lenet_model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


We are going to use the model of lenet as a teacher and we are going to create a student model with the knowledge of the teacher 

In [174]:
lenet_model.save('lenet.h5')


In [175]:
import os 

(os.stat('/content/lenet.h5').st_size / 1024)/1024

0.5589599609375

Not a Quite big model, but this is enough for the demonstration


In [176]:
student = tf.Sequential(
    [
        tf.layers.Input(shape=(28, 28, 1)),
        tf.layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"),
        tf.layers.LeakyReLU(alpha=0.2),
        tf.layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        tf.layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
        tf.layers.Flatten(),
        tf.layers.Dense(10),
    ],
    name="student",
)


In [177]:
# Creating a copy of the model

student_copy = tf.models.clone_model(student)

In [178]:
# model summary of the student


student.summary()

Model: "student"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_23 (Conv2D)          (None, 14, 14, 16)        160       
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 14, 14, 16)        0         
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 14, 14, 16)       0         
 2D)                                                             
                                                                 
 conv2d_24 (Conv2D)          (None, 7, 7, 32)          4640      
                                                                 
 flatten_10 (Flatten)        (None, 1568)              0         
                                                                 
 dense_22 (Dense)            (None, 10)                15690     
                                                           

Parameter Wise, Teacher Model has a 60k parameters, Student has a 20k parameters. So there is some difference in the number of parameter between the teacher and the student. 
We are going to use the distiller cklass which is defined in keras. https://keras.io/examples/vision/knowledge_distillation/ . This is the link for the example of knowledge distillation, take a look at it. This is created based on the example from the keras offical page

In [179]:
class Distiller(tf.Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf1.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self.distillation_loss_fn(
                tf1.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf1.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results


In [180]:
distiller = Distiller(student=student, teacher=lenet_model)
distiller.compile(
    optimizer=tf.optimizers.Adam(),
    metrics=[tf.metrics.SparseCategoricalAccuracy()],
    student_loss_fn=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=tf.losses.KLDivergence(),
    alpha=0.1,
    temperature=10,
)

# Distill teacher to student
distiller.fit(x_train, y_train, epochs=3)

# Evaluate student on test dataset
distiller.evaluate(x_test, y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.9779000282287598, 0.0018058405257761478]

In [181]:
# Train student as doen usually
student_copy.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.metrics.SparseCategoricalAccuracy()],
)

# Train and evaluate student trained from scratch.
student_copy.fit(x_train, y_train, epochs=3)
student_copy.evaluate(x_test, y_test)


Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.0636371374130249, 0.9789000153541565]

In [182]:
# Both the student from scratch model and distilled model provides the similar result. But this is not the case in the complex dataset like a image 
# dataset, classication, object detection, in those tasks, knowledge distillation performs well

In [183]:
(os.path.getsize(filename='/content/lenet.h5')/(1024 * 1024))

0.5589599609375

In [184]:
# Converting tf model into tf-lite model 
# step1 : Create a convertor
# Extension of tf-lite model is .tflite

tflite_filename = "tflite-model.tflite"

# This will convert our model into a tflite model. We defined the class object, inorder to convert, we need to call the method .convert
tflite_convertor = tf1.lite.TFLiteConverter.from_keras_model(lenet_model)
tflite_model = tflite_convertor.convert()


INFO:tensorflow:Assets written to: /tmp/tmp7dyjgy5e/assets


INFO:tensorflow:Assets written to: /tmp/tmp7dyjgy5e/assets


In [185]:
open(tflite_filename, "wb").write(tflite_model)

180988

In [186]:
(os.path.getsize(filename='/content/tflite-model.tflite')/(1024 * 1024))

0.17260360717773438

In [187]:
# Post Quantization ----> This will even make our model size smaller 

In [190]:
tflite_convertor = tf1.lite.TFLiteConverter.from_keras_model(lenet_model)

tflite_convertor.optimizations = [tf1.lite.Optimize.DEFAULT]


tflite_quant_model = tflite_convertor.convert()

INFO:tensorflow:Assets written to: /tmp/tmp82uxt25q/assets


INFO:tensorflow:Assets written to: /tmp/tmp82uxt25q/assets


In [191]:
quantizered_filename = 'quantizered_model.tflite'

open(quantizered_filename, "wb").write(tflite_quant_model)

52064

In [192]:
(os.path.getsize(filename='/content/quantizered_model.tflite')/(1024 * 1024))

0.049652099609375

In [None]:
# But when we convert the tf model into tf-lite model, this will reduce the accuracy of the model by little

# And also, when we use quantizer optimization in the tf-lite, it further decreases the accuracy of the model, this is tradeoff of accuracy - size of 
# the model.