In [1]:
"""
Title: Knowledge Distillation
Author: [Kenneth Borup](https://twitter.com/Kennethborup)
Date created: 2020/09/01
Last modified: 2020/09/01
Description: Implementation of classical Knowledge Distillation.
"""

"""
## Introduction to Knowledge Distillation
Knowledge Distillation is a procedure for model
compression, in which a small (student) model is trained to match a large pre-trained
(teacher) model. Knowledge is transferred from the teacher model to the student
by minimizing a loss function, aimed at matching softened teacher logits as well as
ground-truth labels.
The logits are softened by applying a "temperature" scaling function in the softmax,
effectively smoothing out the probability distribution and revealing
inter-class relationships learned by the teacher.
**Reference:**
- [Hinton et al. (2015)](https://arxiv.org/abs/1503.02531)
"""

"""
## Setup
"""

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import onnxruntime

In [2]:
# Load the ONNX teacher network and collect the names of its input and output
# tensors; `session.run` needs them to feed samples and fetch results.
session = onnxruntime.InferenceSession("../benchmarks/ACASXU_run2a_1_2_batch_2000-trunc.onnx")
inname = [node.name for node in session.get_inputs()]
outname = [node.name for node in session.get_outputs()]

In [3]:
# Display the teacher's input tensor names (the first one is used as the feed key below).
inname

['input']

Taken from [this tutorial](https://github.com/keras-team/keras-io/blob/master/examples/vision/knowledge_distillation.py)

In [4]:
# Per-dimension sampling box for the five network inputs:
# `u` holds the upper bounds and `l` the lower bounds.
u, l = (
    [0.679858, 0.500000, 0.500000, 0.500000, -0.450000],
    [0.600000, -0.500000, -0.500000, 0.450000, -0.500000],
)

# Fixed seed so that Restart & Run All reproduces the same dataset (and thus the
# same teacher labels and train/test split) on every run.
SEED = 0
N_SAMPLES = 200000
rng = np.random.default_rng(SEED)

# Draw the samples uniformly from the box and store them as float32, the dtype
# that is later fed to the ONNX teacher and the Keras student.
x_train = np.array(rng.uniform(l, u, (N_SAMPLES, len(l))), dtype=np.float32)

# Show the last sample as a quick sanity check.
x_train[-1]

array([ 0.6030121 , -0.16638894,  0.27088654,  0.45687816, -0.48346344],
      dtype=float32)

In [5]:
# Label every sample with the teacher's response: one ONNX inference call per
# row, each returning the list of requested outputs for that single input.
y_train = np.array(
    [session.run(outname, {inname[0]: sample}) for sample in x_train]
)
# Show the label of the last sample as a quick sanity check.
y_train[-1]

array([[-0.0202845, -0.0193085, -0.019157 , -0.0189128, -0.0189197]],
      dtype=float32)

In [6]:
# Each teacher call returned a one-element list of outputs, so the stacked labels
# carry an extra middle axis. Keep only that single output per sample.
# The `ndim` guard makes this cell safe to re-run: without it, a second execution
# would index a 2-D array with three indices and raise an IndexError.
if y_train.ndim == 3:
    y_train = y_train[:, 0, :]
y_train.shape

(200000, 5)

In [7]:
# Hold out every sample after the first 150,000 as the test set and keep the
# first 150,000 for training. Both right-hand sides are evaluated before the
# names are rebound, so the test slices are taken from the full arrays.
x_train, x_test = x_train[:150000], x_train[150000:]
y_train, y_test = y_train[:150000], y_train[150000:]

In [8]:
# Report the test-set shape followed by the training-set shape, one per line.
print(x_test.shape, x_train.shape, sep="\n")

(50000, 5)
(150000, 5)


In [9]:
"""
## Create student and teacher models
Initially, we create a small student model. It is a fully connected network created
using `Sequential()`, but could be any Keras model. The teacher is not a Keras model
here: it is the pre-trained ONNX network loaded above.
"""

# Build the student: a fully connected network mapping the 5 inputs through
# six hidden ReLU layers of 50 units each to 5 linear outputs.
student = keras.Sequential(
    [
        keras.Input(shape=(5,)),
        *[layers.Dense(50, activation="relu") for _ in range(6)],
        layers.Dense(5),
    ],
    name="student",
)

In [10]:
"""
## Construct `Distiller()` class
The custom `Distiller()` class overrides the `Model` methods `train_step`, `test_step`,
and `compile()`. In order to use the distiller, we need:
- A trained teacher model (here an ONNX Runtime session; its outputs are precomputed
for every sample and passed to `fit()`/`evaluate()` as the targets `y`)
- A student model to train
- A distillation loss function on the difference between the student predictions and
the teacher outputs
- An `alpha` factor and a `temperature`, which `compile()` accepts and stores but which
are not used by `train_step` or `test_step` in this notebook (there is no separate
student loss against ground-truth labels, and no softening of the outputs)
- An optimizer for the student and (optional) metrics to evaluate performance
In the `train_step` method, we perform a forward pass of the student, calculate the
`distillation_loss` between the student predictions and the teacher outputs,
and perform the backward pass. Note: only the student weights are updated,
and therefore we only calculate the gradients for the student weights.
In the `test_step` method, we evaluate the student model on the provided dataset.
"""


class Distiller(keras.Model):
    """Train a student network to reproduce precomputed teacher outputs.

    The training and evaluation data are `(x, y)` pairs in which `y` already
    holds the teacher's output for `x`; the teacher itself is not called by
    `train_step` or `test_step`. The only loss that is optimized is
    `distillation_loss_fn(y, student(x))`.

    Args:
        student: Keras model whose weights are trained.
        teacher: Teacher object. Stored on the instance, not read by any method
            of this class.
        inname: Teacher input names. Stored, not read by this class.
        outname: Teacher output names. Stored, not read by this class.
    """

    def __init__(self, student, teacher, inname, outname):
        super().__init__()
        self.teacher = teacher
        self.student = student
        self.inname = inname
        self.outname = outname

    def compile(
        self,
        optimizer,
        metrics,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights.
            metrics: Keras metrics for evaluation. They are updated with the
                targets and the student predictions on every train and test step.
            distillation_loss_fn: Loss function called as
                `distillation_loss_fn(y, student_predictions)`.
            alpha: Stored as `self.alpha`; not used by `train_step` or
                `test_step`.
            temperature: Stored as `self.temperature`; not used by `train_step`
                or `test_step`.
        """
        # NOTE(review): eager execution is forced here. Presumably a leftover from
        # calling the ONNX teacher on `.numpy()` values inside `train_step`;
        # confirm before removing, since graph mode would be faster.
        super().compile(optimizer=optimizer, metrics=metrics, run_eagerly=True)
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        """Run one optimization step of the student on a batch.

        Args:
            data: Tuple `(x, y)` of inputs and target (teacher) outputs.

        Returns:
            Dict mapping each compiled metric name to its current result, plus
            `"distillation_loss"` for this batch.
        """
        x, y = data

        with tf.GradientTape() as tape:
            # Forward pass of the student, recorded for differentiation.
            student_predictions = self.student(x, training=True)
            distillation_loss = self.distillation_loss_fn(y, student_predictions)

        # Only the student is trained, so gradients are taken with respect to
        # its variables alone.
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(distillation_loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`; without this call their
        # reported values would never change from the initial state.
        self.compiled_metrics.update_state(y, student_predictions)

        results = {m.name: m.result() for m in self.metrics}
        results.update({"distillation_loss": distillation_loss})
        return results

    def test_step(self, data):
        """Evaluate the student on a batch without updating any weights.

        Args:
            data: Tuple `(x, y)` of inputs and target (teacher) outputs.

        Returns:
            Dict mapping each compiled metric name to its current result, plus
            `"student_loss"`, which is `distillation_loss_fn(y, prediction)` for
            this batch.
        """
        x, y = data

        y_prediction = self.student(x, training=False)
        student_loss = self.distillation_loss_fn(y, y_prediction)

        # Keep the compiled metrics in sync with the evaluated batch.
        self.compiled_metrics.update_state(y, y_prediction)

        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [11]:
"""
## Distill teacher to student
We have already trained the teacher model, and we only need to initialize a
`Distiller(student, teacher)` instance, `compile()` it with the desired losses,
hyperparameters and optimizer, and distill the teacher to the student.
"""

# Wrap the student together with the ONNX teacher session and its tensor names.
distiller = Distiller(student, session, inname, outname)

# Training configuration: Adam optimizer, mean-squared-error distillation loss,
# and no additional metrics.
compile_options = dict(
    optimizer=keras.optimizers.Adam(),
    metrics=[],
    distillation_loss_fn=keras.losses.MeanSquaredError(),
    alpha=0.1,
    temperature=10,
)
distiller.compile(**compile_options)

In [12]:
# Baseline: evaluate the student on the held-out test set before any call to `fit`.
distiller.evaluate(x_test, y_test)



[]

In [13]:
# Optional learning-rate schedule, currently disabled and not passed to `fit`
# (constant rate for the first 10 epochs, then multiplied by exp(-0.1) each epoch):
#def scheduler(epoch, lr):
#    if epoch < 10:
#        return lr
#    else:
#        return lr * tf.math.exp(-0.1)
#callback = keras.callbacks.LearningRateScheduler(scheduler)
# Distill teacher to student: fit the student on the precomputed teacher outputs for 2 epochs.
distiller.fit(x_train, y_train, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f9706aa8d90>

In [14]:
# Re-evaluate the student on the held-out test set, now that it has been trained.
distiller.evaluate(x_test, y_test)



[]

In [15]:
# Persist only the trained student model (not the `Distiller` wrapper) to an `.h5` file.
student.save("ACASXU_run2a_1_2_batch_2000-retrain.h5")

In [16]:
from tensorflow.python.keras.models import load_model
import onnx
import keras2onnx

In [17]:
# Reload the saved student from disk, convert it to an ONNX graph named after
# the Keras model, and write the result next to the `.h5` file.
h5_path = 'ACASXU_run2a_1_2_batch_2000-retrain.h5'
onnx_path = "ACASXU_run2a_1_2_batch_2000-retrain.onnx"

model = load_model(h5_path)
onnx_model = keras2onnx.convert_keras(model, model.name)
onnx.save_model(onnx_model, onnx_path)



tf executing eager_mode: True
tf.keras model eager_mode: False
The ONNX operator number change on the optimization: 25 -> 20
The maximum opset needed by this model is only 9.


In [18]:
# Query the original ONNX teacher (not the retrained student) on one hand-picked input point.
session.run(outname,{inname[0]:[0.679858, -0.03432750175414749, 0.385071, 0.5, -0.5]})

[array([-0.0202845, -0.0193085, -0.019157 , -0.0189128, -0.0189197],
       dtype=float32)]