<a href="https://colab.research.google.com/github/sp7412/colab/blob/master/distilling_new_try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Adapted from https://github.com/myatthuhein97/knowledge_distillation_tensorflow

In [None]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import datasets

In [None]:
# Shell out to nvidia-smi to show which GPU (if any) is attached to this runtime.
!nvidia-smi

Wed Aug  5 23:27:34 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    33W /  70W |  14575MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Notebook-wide settings. SEED feeds the training-set shuffle; EPOCH and
# BATCH_SIZE are both assigned again further down (before the teacher fit and
# before the student run respectively).
SEED, EPOCH = 888, 10
BATCH_SIZE, VAL_FREQUENCY = 256, 10

In [None]:
# Fetch CIFAR-10 as (train images, train labels), (test images, test labels).
(train_images, train_labels), (test_images, test_labels) = (
    datasets.cifar10.load_data())

# Normalize pixel values to be between 0 and 1
train_images = train_images / 255.0
test_images = test_images / 255.0

# Class names listed in label-index order.
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

# Training pipeline: seeded shuffle through a 10000-element buffer, then
# batch and prefetch.
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(10000, seed=SEED)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Evaluation pipeline: same batching, no shuffling. The labels captured here
# are the integer labels returned by load_data().
test_ds = (
    tf.data.Dataset.from_tensor_slices((test_images, test_labels))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [None]:
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import (AvgPool2D, BatchNormalization, Conv2D,
                                     Dense, DepthwiseConv2D, Dropout, Flatten,
                                     Input, MaxPooling2D, ReLU, Softmax)


def vgg_3blocks():
    """Build a VGG-style CNN with three convolutional blocks.

    Every block is two 3x3 'same'-padded ReLU convolutions (he_uniform
    initialised, each followed by batch normalization), a 2x2 max-pool and a
    dropout layer. Filter counts per block are 32, 64, 128 and the matching
    dropout rates are 0.2, 0.3, 0.4. The head is Flatten -> Dense(128, relu)
    -> BatchNormalization -> Dropout(0.5) -> Dense(10) named 'logits' with no
    activation. The first convolution declares input_shape=(32, 32, 3).

    Side effect: prints the model summary.

    Returns:
        The un-compiled Sequential model.
    """
    # (filters, dropout rate after pooling) for each block, in order.
    block_specs = ((32, 0.2), (64, 0.3), (128, 0.4))

    model = Sequential()
    for block_idx, (n_filters, drop_rate) in enumerate(block_specs):
        for conv_idx in range(2):
            conv_kwargs = {'activation': 'relu',
                           'kernel_initializer': 'he_uniform',
                           'padding': 'same'}
            # Only the very first layer of the network declares the input shape.
            if block_idx == 0 and conv_idx == 0:
                conv_kwargs['input_shape'] = (32, 32, 3)
            model.add(Conv2D(n_filters, (3, 3), **conv_kwargs))
            model.add(BatchNormalization())
        model.add(MaxPooling2D((2, 2)))
        model.add(Dropout(drop_rate))

    # Classifier head; the last layer has no activation and emits raw scores.
    head = (
        Flatten(),
        Dense(128, activation='relu', kernel_initializer='he_uniform'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(10, name='logits'),
    )
    for layer in head:
        model.add(layer)

    model.summary()
    return model


def vgg_4blocks():
    """Build a VGG-style CNN with four convolutional blocks.

    Every block is two 3x3 'same'-padded ReLU convolutions (he_uniform
    initialised, each followed by batch normalization), a 2x2 max-pool and a
    dropout layer. Filter counts per block are 32, 64, 128, 256 and the
    matching dropout rates are 0.2, 0.4, 0.5, 0.5. The head is Flatten ->
    Dense(128, relu) -> BatchNormalization -> Dropout(0.4) -> Dense(10) named
    'logits' with no activation. The first convolution declares
    input_shape=(32, 32, 3).

    Side effect: prints the model summary.

    Returns:
        The un-compiled Sequential model.
    """
    # (filters, dropout rate after pooling) for each block, in order.
    block_specs = ((32, 0.2), (64, 0.4), (128, 0.5), (256, 0.5))

    model = Sequential()
    for block_idx, (n_filters, drop_rate) in enumerate(block_specs):
        for conv_idx in range(2):
            conv_kwargs = {'activation': 'relu',
                           'kernel_initializer': 'he_uniform',
                           'padding': 'same'}
            # Only the very first layer of the network declares the input shape.
            if block_idx == 0 and conv_idx == 0:
                conv_kwargs['input_shape'] = (32, 32, 3)
            model.add(Conv2D(n_filters, (3, 3), **conv_kwargs))
            model.add(BatchNormalization())
        model.add(MaxPooling2D((2, 2)))
        model.add(Dropout(drop_rate))

    # Classifier head; the last layer has no activation and emits raw scores.
    head = (
        Flatten(),
        Dense(128, activation='relu', kernel_initializer='he_uniform'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(10, name='logits'),
    )
    for layer in head:
        model.add(layer)

    model.summary()
    return model


def vgg_2blocks():
    """Build a VGG-style CNN with two convolutional blocks.

    Every block is two 3x3 'same'-padded ReLU convolutions (he_uniform
    initialised, each followed by batch normalization), a 2x2 max-pool and a
    dropout layer. Filter counts per block are 32, 64 and the matching dropout
    rates are 0.2, 0.4. The head is Flatten -> Dense(128, relu) ->
    BatchNormalization -> Dropout(0.4) -> Dense(10) named 'logits' with no
    activation. The first convolution declares input_shape=(32, 32, 3).

    Side effect: prints the model summary.

    Returns:
        The un-compiled Sequential model.
    """
    # (filters, dropout rate after pooling) for each block, in order.
    block_specs = ((32, 0.2), (64, 0.4))

    model = Sequential()
    for block_idx, (n_filters, drop_rate) in enumerate(block_specs):
        for conv_idx in range(2):
            conv_kwargs = {'activation': 'relu',
                           'kernel_initializer': 'he_uniform',
                           'padding': 'same'}
            # Only the very first layer of the network declares the input shape.
            if block_idx == 0 and conv_idx == 0:
                conv_kwargs['input_shape'] = (32, 32, 3)
            model.add(Conv2D(n_filters, (3, 3), **conv_kwargs))
            model.add(BatchNormalization())
        model.add(MaxPooling2D((2, 2)))
        model.add(Dropout(drop_rate))

    # Classifier head; the last layer has no activation and emits raw scores.
    head = (
        Flatten(),
        Dense(128, activation='relu', kernel_initializer='he_uniform'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(10, name='logits'),
    )
    for layer in head:
        model.add(layer)

    model.summary()
    return model


def mobilenet_block(x, filters, strides):
    """Apply one depthwise-separable convolution block to a Keras tensor.

    The block is a 3x3 depthwise convolution ('same' padding, caller-chosen
    stride) followed by a 1x1 convolution with `filters` output channels; each
    convolution is followed by batch normalization and ReLU.

    Args:
        x: Input tensor the layers are applied to.
        filters: Number of output channels of the 1x1 convolution.
        strides: Stride of the depthwise convolution.

    Returns:
        The tensor produced by the final ReLU.
    """
    # Layers are constructed in the order they are applied.
    stages = (
        DepthwiseConv2D(kernel_size=3, strides=strides, padding='same'),
        BatchNormalization(),
        ReLU(),
        Conv2D(filters=filters, kernel_size=1, strides=1),
        BatchNormalization(),
        ReLU(),
    )

    out = x
    for stage in stages:
        out = stage(out)
    return out


def mobilenet(include_softmax=True):
    """Build a MobileNet-style classifier for 32x32x3 inputs with 10 outputs.

    The network is a 3x3 stem convolution (32 filters, batch norm, ReLU)
    followed by a stack of `mobilenet_block` stages, a 4x4 average pool,
    Flatten and a 10-unit Dense layer named 'logits'.

    Args:
        include_softmax: When True (the default, matching the previous
            behavior) a Softmax layer is appended, so the model outputs class
            probabilities. Pass False to get the raw output of the 'logits'
            layer, which is what the VGG builders in this notebook return and
            what a loss created with from_logits=True or `softmax_with_temp`
            expects. Feeding the softmax output into either would apply
            softmax twice.

    Side effect: prints the model summary.

    Returns:
        The un-compiled functional Model.
    """
    # `inputs` rather than `input`, to avoid shadowing the Python builtin.
    inputs = Input(shape=(32, 32, 3))
    x = Conv2D(filters=32, kernel_size=3, padding='same')(inputs)
    x = BatchNormalization()(x)
    x = ReLU()(x)

    x = mobilenet_block(x, filters=64, strides=1)

    # Each stride-2 block halves the spatial resolution.
    x = mobilenet_block(x, filters=128, strides=2)
    x = mobilenet_block(x, filters=128, strides=1)

    x = mobilenet_block(x, filters=256, strides=2)
    x = mobilenet_block(x, filters=256, strides=1)

    x = mobilenet_block(x, filters=512, strides=2)

    for _ in range(5):
        x = mobilenet_block(x, filters=512, strides=1)

    x = mobilenet_block(x, filters=1024, strides=1)
    x = mobilenet_block(x, filters=1024, strides=1)
    x = AvgPool2D(pool_size=4, strides=1)(x)
    x = Flatten()(x)
    logits = Dense(units=10, name='logits')(x)
    output = Softmax()(logits) if include_softmax else logits

    model = Model(inputs=inputs, outputs=output)
    model.summary()

    return model

In [None]:
# The teacher is the 4-block VGG; vgg_4blocks() also prints the model summary.
teacher_model = vgg_4blocks()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 32, 32, 32)        896       
_________________________________________________________________
batch_normalization_14 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 32, 32, 32)        9248      
_________________________________________________________________
batch_normalization_15 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 16, 16, 32)        0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 16, 16, 64)       

In [None]:
# Teacher training budget; this rebinds the EPOCH constant set near the top.
EPOCH = 100

optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    name='Adam',
)

# The teacher ends in a linear 'logits' layer, so the loss is told to apply
# softmax itself (from_logits=True). Labels in train_ds/test_ds are integers,
# hence the sparse variant.
teacher_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    metrics=['accuracy'],
)

# test_ds doubles as the per-epoch validation set.
history = teacher_model.fit(train_ds, validation_data=test_ds, epochs=EPOCH)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Score the trained teacher on the test pipeline; evaluate() returns the loss
# followed by the compiled metric (accuracy).
test_loss, test_acc = teacher_model.evaluate(test_ds, verbose=2)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

40/40 - 0s - loss: 0.6482 - accuracy: 0.8880
Test Loss: 0.6482346057891846
Test Accuracy: 0.8880000114440918


In [None]:
# Run the trained teacher over the normalized training images. The teacher's
# last layer is the activation-free 'logits' Dense, so despite the variable
# name these are raw logits; train_step softens them with softmax_with_temp.
teacher_soft_logits = teacher_model.predict(train_images)

In [None]:
def softmax_with_temp(logits, temp=1):
    """Temperature-scaled softmax over the last axis.

    Computes softmax(logits / temp) along axis -1. Larger `temp` values give a
    flatter (softer) distribution; temp=1 is the ordinary softmax.

    Args:
        logits: Tensor of unnormalized scores; classes are on the last axis.
        temp: Temperature the logits are divided by.

    Returns:
        Tensor of the same shape as `logits` whose last axis sums to 1.
    """
    # Shift every row by its OWN maximum before exponentiating. Softmax is
    # unchanged by a per-row shift, and this keeps the largest exponent at
    # exp(0) = 1. Shifting by the maximum of the whole batch instead can make
    # every exponent of a low-scoring row underflow to 0, giving 0 / 0 = NaN.
    row_max = tf.math.reduce_max(logits, axis=-1, keepdims=True)
    scaled = (logits - row_max) / temp

    exp_logits = tf.math.exp(scaled)
    logits_sum = tf.math.reduce_sum(exp_logits, axis=-1, keepdims=True)

    return exp_logits / logits_sum


def custom_cross_entropy(y_true, y_soft, y_pred, y_soft_pred, alpha=0.5):
    """Weighted sum of a hard-label and a soft-label cross-entropy.

    Each term is -mean(sum(target * log(prediction), axis=-1)). Predictions
    and the soft targets are clipped to [1e-7, 1 - 1e-7] before use; `y_true`
    is used as given.

    Args:
        y_true: Hard targets for the first term.
        y_soft: Soft targets for the second term.
        y_pred: Predicted probabilities paired with `y_true`.
        y_soft_pred: Predicted probabilities paired with `y_soft`.
        alpha: Weight of the soft term; the hard term gets (1 - alpha).

    Returns:
        Scalar tensor alpha * soft_term + (1 - alpha) * hard_term.
    """
    eps = 1e-7

    def _clipped(probs):
        # Keep probabilities away from 0 and 1 so log() stays finite.
        return tf.clip_by_value(probs, eps, 1 - eps)

    def _mean_ce(targets, probs):
        # Cross-entropy per example, then averaged over the remaining axes.
        per_example = tf.math.reduce_sum(
            targets * tf.math.log(probs), axis=-1, keepdims=False)
        return -tf.math.reduce_mean(per_example)

    hard_term = _mean_ce(y_true, _clipped(y_pred))
    soft_term = _mean_ce(_clipped(y_soft), _clipped(y_soft_pred))

    return alpha * soft_term + (1 - alpha) * hard_term


def kl_divergence_cross_entropy(y_true, y_soft, y_pred, y_soft_pred,cross_entropy,soft_kl_divergence, alpha=0.5,temp=1):
    """Distillation loss: weighted KL divergence plus categorical cross-entropy.

    Returns
        alpha * soft_kl_divergence(y_soft, y_soft_pred)
        + (1 - alpha) * cross_entropy(y_true, y_pred)

    Args:
        y_true: Hard targets for the cross-entropy term.
        y_soft: Softened teacher probabilities (target of the KL term).
        y_pred: Student probabilities paired with `y_true`.
        y_soft_pred: Softened student probabilities paired with `y_soft`.
        cross_entropy: Loss callable for the hard term. Pass None to have a
            default tf.keras.losses.CategoricalCrossentropy() created.
        soft_kl_divergence: Loss callable for the soft term. Pass None to have
            a default tf.keras.losses.KLDivergence() created.
        alpha: Weight of the soft (KL) term; the hard term gets (1 - alpha).
        temp: Accepted for interface compatibility but not used in the
            computation.
            NOTE(review): the soft term is not rescaled by temp ** 2 here;
            confirm whether that scaling is wanted before adding it, since it
            changes the loss values.

    Returns:
        Scalar loss tensor.
    """
    # Use the loss objects supplied by the caller instead of rebuilding them on
    # every call; fall back to the default losses only when none is given.
    if cross_entropy is None:
        cross_entropy = tf.keras.losses.CategoricalCrossentropy()
    if soft_kl_divergence is None:
        soft_kl_divergence = tf.keras.losses.KLDivergence()

    soft_term = soft_kl_divergence(y_soft, y_soft_pred)
    hard_term = cross_entropy(y_true, y_pred)

    return alpha * soft_term + (1 - alpha) * hard_term

In [None]:
# Optimizer used by train_step for the student (learning rate 1e-4).
opt = tf.keras.optimizers.Adam(
    learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
    name='Adam')

# Define our metrics
# (train_acc and test_acc are created again in a later cell, just before the
# training loop; train_loss is only created here.)
train_acc = tf.keras.metrics.CategoricalAccuracy('train_accuracy')
test_acc = tf.keras.metrics.CategoricalAccuracy('test_accuracy')

# Running mean of the per-step loss values passed to it by train_step.
train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)

# Loss objects that train_step passes to kl_divergence_cross_entropy.
cross_entropy = tf.keras.losses.CategoricalCrossentropy()
soft_kl_divergence = tf.keras.losses.KLDivergence()

In [None]:
# The student is the smaller 2-block VGG; vgg_2blocks() also prints its summary.
student_model = vgg_2blocks()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24 (Conv2D)           (None, 32, 32, 32)        896       
_________________________________________________________________
batch_normalization_28 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 32, 32, 32)        9248      
_________________________________________________________________
batch_normalization_29 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 16, 16, 32)        0         
_________________________________________________________________
dropout_16 (Dropout)         (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 16, 16, 64)       

In [None]:
def train_step(images, labels, teacher_soft_logits):
    """Run one distillation update of the student on a single batch.

    Reads the module-level `student_model`, `opt`, `TEMP`, `cross_entropy`,
    `soft_kl_divergence`, `train_acc` and `train_loss` at call time.

    Args:
        images: Batch of inputs fed to `student_model`.
        labels: Targets for the hard cross-entropy term and for `train_acc`
            (the training loop supplies one-hot labels).
        teacher_soft_logits: Teacher outputs for the same batch; they are
            softened here with `softmax_with_temp(..., TEMP)`.

    Returns:
        None. Side effects: applies one optimizer step to the student's
        trainable variables and updates `train_acc` and `train_loss`.
    """
    with tf.GradientTape() as tape:
        student_logits = student_model(images, training=True)

        # Student probabilities at temperature 1 (hard term) and TEMP (soft term).
        hard_probs = softmax_with_temp(student_logits, 1)
        soft_probs = softmax_with_temp(student_logits, TEMP)

        # Teacher targets softened with the same temperature as the student.
        teacher_probs = softmax_with_temp(teacher_soft_logits, TEMP)

        loss_value = kl_divergence_cross_entropy(
            labels, teacher_probs, hard_probs, soft_probs,
            cross_entropy, soft_kl_divergence, alpha=0.4, temp=TEMP)

    weights = student_model.trainable_variables
    opt.apply_gradients(zip(tape.gradient(loss_value, weights), weights))

    # CategoricalAccuracy is fed the raw student outputs, not the probabilities.
    train_acc(labels, student_logits)
    train_loss(loss_value)

In [None]:
# Settings for the student run.
EPOCHS = 100      # number of times the distillation dataset is repeated
BATCH_SIZE = 128  # rebinds the earlier 256; train_ds/test_ds were already built with 256
TEMP = 5          # softmax temperature read by train_step
AUTO = tf.data.experimental.AUTOTUNE  # prefetch buffer size setting

In [None]:
# Load CIFAR-10 again, rebinding train_images/train_labels/test_images/test_labels
# to freshly loaded arrays (the earlier normalized arrays are replaced).
(train_images, train_labels), (test_images,
                                test_labels) = datasets.cifar10.load_data()

In [None]:
# Rescale pixel values by 1/255. This rebinds the same names, so running this
# cell a second time without reloading the data divides by 255 again.
train_images, test_images = train_images / 255.0, test_images / 255.0

In [None]:
# One-hot encode the labels for CategoricalCrossentropy / CategoricalAccuracy.
# The names are rebound in place, so this cell must only run once per data load.
train_labels = tf.keras.utils.to_categorical(train_labels.astype('float32'))
test_labels = tf.keras.utils.to_categorical(test_labels.astype('float32'))

# Each element pairs an image with its one-hot label and the teacher's output
# for that image. There is no shuffle step, so elements keep the array order.
# repeat() comes before batch(), so batches are cut from the repeated stream.
dataset = (
    tf.data.Dataset.from_tensor_slices(
        (train_images, train_labels, teacher_soft_logits))
    .repeat(EPOCHS)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

In [None]:
# Display the dataset object to check its element shapes and dtypes.
dataset

<PrefetchDataset shapes: ((None, 32, 32, 3), (None, 10), (None, 10)), types: (tf.float64, tf.float32, tf.float32)>

In [None]:
# Give each writer its own sub-directory. TensorBoard treats every directory
# as one run, so two writers sharing './' would put the train and test
# 'epoch_accuracy' series into the same run under the same tag. Pointing
# TensorBoard at './' (or './logs') still discovers both runs.
train_summary_writer = tf.summary.create_file_writer('./logs/train')
test_summary_writer = tf.summary.create_file_writer('./logs/test')

In [None]:
# Create fresh accuracy metrics for the training loop. train_step looks up
# train_acc as a global when it is called, so it uses this new instance.
train_acc = tf.keras.metrics.CategoricalAccuracy('train_accuracy')
test_acc = tf.keras.metrics.CategoricalAccuracy('test_accuracy')

In [None]:
# Counters are initialized here so the cell runs on a fresh kernel instead of
# relying on values left over from an earlier execution.
step = 0       # optimizer steps taken so far
ckpt_step = 0  # completed reporting intervals; used as the TensorBoard step

# Number of full batches in one pass over the training images; a report is
# produced every time this many steps have elapsed.
steps_per_epoch = len(train_images) // BATCH_SIZE

for x, y, soft_logits in dataset:

    train_step(x, y, soft_logits)
    step += 1
    if step % steps_per_epoch == 0:

        # Score the student on the whole test set in a single forward pass.
        test_acc(test_labels, student_model(
            test_images, training=False))
        tf.print("Steps loss     accuracy   test_accuracy")
        tf.print(step, train_loss.result(),
                 train_acc.result(), test_acc.result())

        ckpt_step += 1
        with train_summary_writer.as_default():
            tf.summary.scalar('epoch_accuracy',
                              train_acc.result(), step=ckpt_step)
            tf.summary.scalar(
                'epoch_loss', train_loss.result(), step=ckpt_step)
        with test_summary_writer.as_default():
            tf.summary.scalar('epoch_accuracy',
                              test_acc.result(), step=ckpt_step)

        # Start the next interval with empty metric accumulators.
        train_acc.reset_states()
        test_acc.reset_states()
        train_loss.reset_states()

Steps loss     accuracy   test_accuracy
2340 2.2228694 0.234243691 0.1318
Steps loss     accuracy   test_accuracy
2730 1.72659349 0.385957539 0.4714
Steps loss     accuracy   test_accuracy
3120 1.48683774 0.464362979 0.5574
Steps loss     accuracy   test_accuracy
3510 1.36717403 0.514603376 0.5819
Steps loss     accuracy   test_accuracy
3900 1.27935278 0.547736406 0.605
Steps loss     accuracy   test_accuracy
4290 1.21125829 0.574278831 0.6269
Steps loss     accuracy   test_accuracy
4680 1.15396202 0.599098563 0.6395
Steps loss     accuracy   test_accuracy
5070 1.10401225 0.619771659 0.6484
Steps loss     accuracy   test_accuracy
5460 1.06371403 0.63511616 0.6578
Steps loss     accuracy   test_accuracy
5850 1.03376627 0.646915078 0.6666
Steps loss     accuracy   test_accuracy
6240 1.00761831 0.658173084 0.6687
Steps loss     accuracy   test_accuracy
6630 0.97988975 0.666025639 0.6842
Steps loss     accuracy   test_accuracy
7020 0.952482343 0.677904665 0.688
Steps loss     accuracy   te

In [None]:
# Print the student's layer-by-layer summary.
student_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24 (Conv2D)           (None, 32, 32, 32)        896       
_________________________________________________________________
batch_normalization_28 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 32, 32, 32)        9248      
_________________________________________________________________
batch_normalization_29 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 16, 16, 32)        0         
_________________________________________________________________
dropout_16 (Dropout)         (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 16, 16, 64)       

In [None]:
# Compile the student so that evaluate() has a loss and a metric; the student
# itself was trained by the custom loop above, not by fit().
# `optimizer` is the Adam instance created in the teacher-training cell.
# test_ds carries the integer labels captured when it was built, hence the
# sparse loss; from_logits=True because the student's last layer is linear.
student_model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Score the distilled student on the test pipeline.
# NOTE(review): this rebinds `test_acc` from the CategoricalAccuracy metric
# used by the training loop to a plain number; re-create the metric before
# re-running the training-loop cell.
test_loss, test_acc = student_model.evaluate(test_ds, verbose=2)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

40/40 - 0s - loss: 0.7804 - accuracy: 0.8059
Test Loss: 0.7804117202758789
Test Accuracy: 0.805899977684021


In [None]:
# Student outputs for every test image (the raw output of its final 'logits'
# layer, as the sample shown in the next cell illustrates).
student_soft_logits = student_model.predict(test_images)

In [None]:
# Inspect the 10 class scores predicted for the first test image.
student_soft_logits[0]

array([-3.570874  , -2.519906  , -5.808381  , 11.211636  , -3.1897728 ,
        7.058151  ,  4.684191  , -4.9653907 ,  0.10992634, -1.8421792 ],
      dtype=float32)

In [None]:
# Index of the highest-scoring class for the first test image.
np.argmax(student_soft_logits[0])

3

In [None]:
# test_labels is one-hot at this point, so argmax recovers the class index.
np.argmax(test_labels[0])

3

In [None]:
# Number of output classes, taken from the class-name list defined earlier.
num_classes = len(class_names)

In [None]:
# Experiment: replace the final layer's activation attribute with softmax, in place.
# NOTE(review): the evaluate() call a few cells below reports the same
# loss/accuracy as before this swap, so it does not appear to affect the
# already-compiled model; confirm whether a re-compile is required.
student_model.layers[-1].activation = tf.keras.activations.softmax

In [None]:
# Confirm which activation function the final layer now references.
student_model.layers[-1].activation

<function tensorflow.python.keras.activations.softmax>

In [None]:
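# TODO: re-compile student_model here if the activation swap above is meant to
# take effect in evaluate() (the evaluation in the next cell reports the same
# loss/accuracy as before the swap).
# student_model.compile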

In [None]:
# Re-run the evaluation after the activation swap, to compare with the
# earlier numbers. As before, this rebinds test_loss and test_acc.
test_loss, test_acc = student_model.evaluate(test_ds, verbose=2)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

40/40 - 0s - loss: 0.7804 - accuracy: 0.8059
Test Loss: 0.7804117202758789
Test Accuracy: 0.805899977684021


In [None]:
# Build a probability-emitting variant of the student. Stacking a new Dense
# layer on top of the logits would add randomly initialized, untrained weights
# and scramble the predictions, so a parameter-free Softmax layer is used.
logits_layer = student_model.layers[-1]

# An earlier experiment cell swapped this layer's activation to softmax in
# place. Restore the linear activation so that softmax is applied exactly once.
logits_layer.activation = tf.keras.activations.linear

output_layer = tf.keras.layers.Softmax(name='scores')(logits_layer.output)
new_student_model = tf.keras.Model(student_model.input, output_layer)

In [None]:
# Print the original student's summary again for comparison with the new model.
student_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24 (Conv2D)           (None, 32, 32, 32)        896       
_________________________________________________________________
batch_normalization_28 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 32, 32, 32)        9248      
_________________________________________________________________
batch_normalization_29 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 16, 16, 32)        0         
_________________________________________________________________
dropout_16 (Dropout)         (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 16, 16, 64)       

In [None]:
# Compile for evaluation. The string loss uses its default settings, i.e. it
# expects probabilities, which matches this model's softmax output; the
# integer labels in test_ds call for the sparse variant.
new_student_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the softmax-output model.
new_student_model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24_input (InputLayer) [(None, 32, 32, 3)]       0         
_________________________________________________________________
conv2d_24 (Conv2D)           (None, 32, 32, 32)        896       
_________________________________________________________________
batch_normalization_28 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 32, 32, 32)        9248      
_________________________________________________________________
batch_normalization_29 (Batc (None, 32, 32, 32)        128       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 16, 16, 32)        0         
_________________________________________________________________
dropout_16 (Dropout)         (None, 16, 16, 32)       

In [None]:
# Score the softmax-output model on the test pipeline. As in the earlier
# evaluation cells, this rebinds test_loss and test_acc to plain numbers.
test_loss, test_acc = new_student_model.evaluate(test_ds, verbose=2)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

40/40 - 0s - loss: 5.1993 - accuracy: 0.0712
Test Loss: 5.199324131011963
Test Accuracy: 0.07119999825954437
