In [None]:
# Mount Google Drive inside the Colab VM and make the project folder both
# importable and the current working directory.
import os
import sys

from google.colab import drive
drive.mount('/content/drive')

# Folder inside the mounted Drive that holds the project files.
FOLDERNAME = '/MyDrive/Project/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Build the absolute project path exactly once so that the sys.path entry and
# the working directory can never point at different folders. Stripping the
# slashes avoids the doubled '//' that plain string formatting produced.
PROJECT_DIR = os.path.join('/content/drive', FOLDERNAME.strip('/'))
assert os.path.isdir(PROJECT_DIR), \
    "[!] Project folder not found: {}".format(PROJECT_DIR)

# Let the interpreter import python files stored in the project folder.
# The membership test keeps re-runs of this cell from adding duplicates.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

# Relative paths used later in the notebook resolve against this folder.
os.chdir(PROJECT_DIR)
os.getcwd()

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1XSIbHWFizvrGifGPTwtsmagK2AS8ZE7n/Project


'/content/drive/.shortcut-targets-by-id/1XSIbHWFizvrGifGPTwtsmagK2AS8ZE7n/Project'

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [None]:
# Seed NumPy and TensorFlow with the same value so that weight
# initialisation and shuffling are repeatable between runs.
SEED = 682
for seed_everything in (np.random.seed, tf.random.set_seed):
    seed_everything(SEED)

In [None]:
class Distiller(keras.Model):
    """Distils one teacher into three students that also learn from each other.

    In every training step the students are updated one after another
    (student 1, then 2, then 3). Each update minimises a weighted sum of

    * the supervised loss against the ground-truth labels,
    * a distillation loss against the temperature-softened teacher output,
    * distillation losses against the temperature-softened outputs of the
      two other students ("peers"), which are evaluated in inference mode.

    All four wrapped models are expected to output raw logits, because the
    softmax is applied here after dividing by the temperature.
    """

    def __init__(self, student1, student2, student3, teacher):
        """Store the teacher and the three students.

        Args:
            student1: First student model (updated first in each step).
            student2: Second student model.
            student3: Third student model.
            teacher: Teacher model; only ever called with `training=False`.
        """
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student1 = student1
        self.student2 = student2
        self.student3 = student3
        # Populated by `compile()`; exposed through the `metrics` property.
        self.student_metrics = []

    @property
    def metrics(self):
        """Metrics that Keras resets at the start of each epoch / evaluate().

        The per-student metrics are updated by hand in `train_step` and
        `test_step`, so Keras only knows about them if they are listed here.
        Without this, their state keeps accumulating across epochs and across
        `fit()` -> `evaluate()`, which mixes training and test accuracy.
        """
        return list(getattr(self, "student_metrics", ()))

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
        peer_weight=0.1,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer shared by all three students
            metrics: Sequence of at least three Keras metrics; entry `i` is
                updated with the predictions of student `i + 1`
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between two
                softened probability distributions
            alpha: weight of student_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
            peer_weight: weight of the summed student-to-peer distillation
                losses. The teacher distillation loss receives the remaining
                `1 - alpha - peer_weight`.

        Raises:
            ValueError: If fewer than three metrics are given, or if the
                weights are negative or `alpha + peer_weight` exceeds 1.
        """
        metrics = list(metrics)
        if len(metrics) < 3:
            raise ValueError(
                "Distiller needs one metric per student (3), got {}".format(
                    len(metrics)))

        teacher_weight = 1.0 - alpha - peer_weight
        # Small tolerance so that e.g. alpha=0.9, peer_weight=0.1 is accepted
        # despite floating point rounding.
        if alpha < 0 or peer_weight < 0 or teacher_weight < -1e-9:
            raise ValueError(
                "alpha and peer_weight must be >= 0 and sum to at most 1, "
                "got alpha={} and peer_weight={}".format(alpha, peer_weight))

        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_metrics = metrics
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.peer_weight = peer_weight
        self.teacher_weight = max(teacher_weight, 0.0)
        self.temperature = temperature

    def _soften(self, logits):
        """Turn logits into temperature-softened class probabilities."""
        return tf.nn.softmax(logits / self.temperature, axis=1)

    def _train_student(self, x, y, teacher_soft, student, peers):
        """Run one optimisation step for `student` and return its losses.

        Returns:
            Tuple `(predictions, student_loss, teacher_loss, peer_losses)`
            where `predictions` are the training-mode logits of `student` and
            `peer_losses` is aligned with `peers`.
        """
        # Peers only provide targets for this update, so their forward passes
        # run in inference mode and stay outside the tape.
        peer_soft = [self._soften(peer(x, training=False)) for peer in peers]

        with tf.GradientTape() as tape:
            predictions = student(x, training=True)
            student_soft = self._soften(predictions)

            student_loss = self.student_loss_fn(y, predictions)
            teacher_loss = self.distillation_loss_fn(teacher_soft, student_soft)
            # The student distribution is passed as the first argument of the
            # peer losses; this argument order is kept as originally written.
            peer_losses = [
                self.distillation_loss_fn(student_soft, soft)
                for soft in peer_soft
            ]

            loss = (
                self.alpha * student_loss
                + self.teacher_weight * teacher_loss
                + self.peer_weight * tf.add_n(peer_losses)
            )

        # Only this student's weights are updated.
        trainable_vars = student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        return predictions, student_loss, teacher_loss, peer_losses

    def train_step(self, data):
        """Update the three students in turn on one batch.

        Students are trained sequentially, so student 2 sees the already
        updated student 1 as a peer, and student 3 sees both updated peers.

        Returns:
            Dict with the running metric results followed by the losses of
            this batch (per-batch values, not running averages). Each
            `student_loss_k` comes from student k's own training forward pass.
        """
        # Unpack data
        x, y = data

        # The teacher does not change within a step: one forward pass
        # provides the soft targets for all three students.
        teacher_soft = self._soften(self.teacher(x, training=False))

        students = (self.student1, self.student2, self.student3)
        student_losses = {}
        distillation_losses = {}

        for index, student in enumerate(students):
            tag = index + 1
            peer_tags = [t for t in (1, 2, 3) if t != tag]
            peers = [students[t - 1] for t in peer_tags]

            predictions, student_loss, teacher_loss, peer_losses = (
                self._train_student(x, y, teacher_soft, student, peers))

            # Update the metric that belongs to this student.
            self.student_metrics[index].update_state(y, predictions)

            student_losses["student_loss_{}".format(tag)] = student_loss
            distillation_losses[
                "distillation_loss_t_s{}".format(tag)] = teacher_loss
            for peer_tag, peer_loss in zip(peer_tags, peer_losses):
                distillation_losses[
                    "distillation_loss_s{}_s{}".format(tag, peer_tag)
                ] = peer_loss

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(student_losses)
        results.update(distillation_losses)
        return results

    def test_step(self, data):
        """Evaluate the three students on one batch.

        Returns:
            Dict with the running metric results followed by the supervised
            loss of each student on this batch (not a running average).
        """
        # Unpack the data
        x, y = data

        student_losses = {}
        students = (self.student1, self.student2, self.student3)
        for index, student in enumerate(students):
            # Compute predictions
            predictions = student(x, training=False)

            # Calculate the loss
            student_losses["student_loss_{}".format(index + 1)] = (
                self.student_loss_fn(y, predictions))

            # Update the metrics.
            self.student_metrics[index].update_state(y, predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(student_losses)
        return results


In [None]:
# Load the previously trained teacher network from disk.
# NOTE(review): this is an absolute path at the filesystem root, while the
# recorded output of the save cells points below the project folder
# (.../Project/models/saved_models/...) - confirm the intended location.
teacher = keras.models.load_model('/saved_models/resnet50cifar')

In [None]:
# Student 3

def get_student3():
  """Build student 3: a MobileNet with random weights fed by upsampled images.

  The returned functional model takes 32x32x3 images, enlarges them by a
  factor of 7 per spatial axis to 224x224, applies MobileNet's input
  preprocessing and returns the 10 MobileNet outputs. No classifier
  activation is configured on the MobileNet.
  """

  class Preprocess(tf.keras.layers.Layer):
      """Casts its input to float32 and applies MobileNet preprocessing."""

      def __init__(self):
          super(Preprocess, self).__init__()

      def call(self, inputs):
          as_float = tf.cast(inputs, 'float32')
          return tf.keras.applications.mobilenet.preprocess_input(as_float)

  # The backbone is created before the outer input layer, as in the
  # original cell, so automatically generated layer names stay the same.
  mobilenet_options = dict(
      input_shape=(224, 224, 3),
      alpha=1.0,
      depth_multiplier=1,
      dropout=0.001,
      include_top=True,
      weights=None,
      input_tensor=None,
      pooling=None,
      classes=10,
      classifier_activation=None,
  )
  backbone = tf.keras.applications.MobileNet(**mobilenet_options)

  small_images = tf.keras.layers.Input(shape=(32,32,3))
  upsampled = tf.keras.layers.UpSampling2D(size=(7,7))(small_images)
  preprocessed = Preprocess()(upsampled)
  class_outputs = backbone(preprocessed)

  return tf.keras.Model(inputs=small_images, outputs=class_outputs)

student3 = get_student3()

In [None]:
# Student 2

def get_student2():
  """Build student 2: a LeNet-style CNN that outputs raw class logits.

  The layers come from the `tensorflow.keras` names imported at the top of
  the notebook (`keras`, `layers`), so every model in the notebook is built
  from the same Keras package and no non-public module paths are needed.
  """

  def lenet_model(img_shape=(32, 32, 3), n_classes=10, l2_reg=0.,
    weights=None):
    """Create the LeNet-style network.

    Args:
      img_shape: Shape of one input image (height, width, channels).
      n_classes: Number of output units.
      l2_reg: L2 factor for the kernel regularizer of every conv/dense layer.
      weights: Optional path of a weights file to load into the model.

    Returns:
      An uncompiled `Sequential` model.
    """
    l2 = keras.regularizers.l2

    # Initialize model
    lenet = keras.Sequential()

    # 2 sets of CRP (Convolution, RELU, Pooling)
    lenet.add(layers.Conv2D(20, (5, 5), padding="same",
      input_shape=img_shape, kernel_regularizer=l2(l2_reg)))
    lenet.add(layers.Activation("relu"))
    lenet.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    lenet.add(layers.Conv2D(50, (5, 5), padding="same",
      kernel_regularizer=l2(l2_reg)))
    lenet.add(layers.Activation("relu"))
    lenet.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    # Fully connected layers (w/ RELU)
    lenet.add(layers.Flatten())
    lenet.add(layers.Dense(500, kernel_regularizer=l2(l2_reg)))
    lenet.add(layers.Activation("relu"))

    # Output layer: raw logits. No softmax is added here because the
    # Distiller applies its own temperature-scaled softmax and the
    # supervised loss is configured with from_logits=True.
    lenet.add(layers.Dense(n_classes, kernel_regularizer=l2(l2_reg)))

    if weights is not None:
      lenet.load_weights(weights)

    # Return the constructed network
    return lenet

  # Create the student
  student = lenet_model()
  return student

student2 = get_student2()

In [None]:
# Student 1

def get_student1():
  """Build student 1: an AlexNet-style CNN that outputs raw class logits.

  The layers come from the `tensorflow.keras` names imported at the top of
  the notebook (`keras`, `layers`), so every model in the notebook is built
  from the same Keras package and no non-public module paths are needed.
  """

  def alexnet_model(img_shape=(32, 32, 3), n_classes=10, l2_reg=0., weights=None):
    """Create the AlexNet-style network.

    Args:
      img_shape: Shape of one input image (height, width, channels).
      n_classes: Number of output units of the final dense layer.
      l2_reg: L2 factor for the kernel regularizer. It is applied to every
        conv/dense layer; previously only the first convolution used it and
        all other layers silently ignored the argument.
      weights: Optional path of a weights file to load into the model.

    Returns:
      An uncompiled `Sequential` model whose last layer has no activation.
    """

    def regularizer():
      # A fresh regularizer instance per layer.
      return keras.regularizers.l2(l2_reg)

    # Initialize model
    alexnet = keras.Sequential()

    def add_bn_relu():
      # Every conv/dense layer except the output is followed by BN + ReLU.
      alexnet.add(layers.BatchNormalization())
      alexnet.add(layers.Activation('relu'))

    # Layer 1
    alexnet.add(layers.Conv2D(96, (11, 11), input_shape=img_shape,
      padding='same', kernel_regularizer=regularizer()))
    add_bn_relu()
    alexnet.add(layers.MaxPooling2D(pool_size=(2, 2)))

    # Layer 2
    alexnet.add(layers.Conv2D(256, (5, 5), padding='same',
      kernel_regularizer=regularizer()))
    add_bn_relu()
    alexnet.add(layers.MaxPooling2D(pool_size=(2, 2)))

    # Layer 3
    alexnet.add(layers.ZeroPadding2D((1, 1)))
    alexnet.add(layers.Conv2D(512, (3, 3), padding='same',
      kernel_regularizer=regularizer()))
    add_bn_relu()
    alexnet.add(layers.MaxPooling2D(pool_size=(2, 2)))

    # Layer 4 (no pooling)
    alexnet.add(layers.ZeroPadding2D((1, 1)))
    alexnet.add(layers.Conv2D(1024, (3, 3), padding='same',
      kernel_regularizer=regularizer()))
    add_bn_relu()

    # Layer 5
    alexnet.add(layers.ZeroPadding2D((1, 1)))
    alexnet.add(layers.Conv2D(1024, (3, 3), padding='same',
      kernel_regularizer=regularizer()))
    add_bn_relu()
    alexnet.add(layers.MaxPooling2D(pool_size=(2, 2)))

    # Layer 6
    alexnet.add(layers.Flatten())
    alexnet.add(layers.Dense(3072, kernel_regularizer=regularizer()))
    add_bn_relu()
    alexnet.add(layers.Dropout(0.5))

    # Layer 7
    alexnet.add(layers.Dense(4096, kernel_regularizer=regularizer()))
    add_bn_relu()
    alexnet.add(layers.Dropout(0.5))

    # Layer 8: raw logits, no activation
    alexnet.add(layers.Dense(n_classes, kernel_regularizer=regularizer()))

    if weights is not None:
      alexnet.load_weights(weights)

    return alexnet

  # Create the student
  student = alexnet_model()
  return student

student1 = get_student1()

In [None]:
# Load the CIFAR-10 train and test splits.
# NOTE(review): `batch_size` is not passed to any fit()/evaluate() call in
# the cells visible here, so those calls use the Keras default batch size.
batch_size = 64

cifar10_train, cifar10_test = tf.keras.datasets.cifar10.load_data()
x_train, y_train = cifar10_train
x_test, y_test = cifar10_test

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [None]:
# Wrap the teacher and the three students in a single trainable model.
distiller = Distiller(
    teacher=teacher,
    student1=student1,
    student2=student2,
    student3=student3,
)

# One accuracy tracker per student, in student order.
metric_student1, metric_student2, metric_student3 = [
    keras.metrics.SparseCategoricalAccuracy(name=metric_name)
    for metric_name in ('s1_acc', 's2_acc', 's3_acc')
]

distiller.compile(
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    metrics=[metric_student1, metric_student2, metric_student3],
    optimizer=keras.optimizers.Adam(),
    temperature=10,
    alpha=0.1,
)

# Train the students against the teacher and each other for 5 epochs.
distiller.fit(x_train, y_train, epochs=5)

# Report the students' performance on the held-out test split.
distiller.evaluate(x_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.5799269080162048,
 0.5345500111579895,
 0.7479192018508911,
 1.1325149536132812,
 0.9329811334609985,
 0.6135732531547546]

In [None]:
# Persist each trained student in its own directory.
for trained_student, export_path in (
    (student1, '/saved_models/peers1'),
    (student2, '/saved_models/peers2'),
    (student3, '/saved_models/peers3'),
):
    trained_student.save(export_path)

INFO:tensorflow:Assets written to: /content/drive/MyDrive/Project/models/saved_models/peers1/assets
INFO:tensorflow:Assets written to: /content/drive/MyDrive/Project/models/saved_models/peers2/assets
INFO:tensorflow:Assets written to: /content/drive/MyDrive/Project/models/saved_models/peers3/assets


In [None]:
# Read the three saved students back from disk.
student1_test, student2_test, student3_test = [
    tf.keras.models.load_model(saved_path)
    for saved_path in (
        '/saved_models/peers1',
        '/saved_models/peers2',
        '/saved_models/peers3',
    )
]

# Wrap the reloaded students in a fresh distiller that is only evaluated.
distiller_test = Distiller(
    teacher=teacher,
    student1=student1_test,
    student2=student2_test,
    student3=student3_test,
)

# Fresh accuracy trackers, one per student.
metric_student1, metric_student2, metric_student3 = [
    keras.metrics.SparseCategoricalAccuracy(name=metric_name)
    for metric_name in ('s1_acc', 's2_acc', 's3_acc')
]

distiller_test.compile(
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    metrics=[metric_student1, metric_student2, metric_student3],
    optimizer=keras.optimizers.Adam(),
    temperature=10,
    alpha=0.1,
)

# Score the reloaded students on the test split.
distiller_test.evaluate(x_test, y_test)



[0.675000011920929,
 0.5669000148773193,
 0.8284000158309937,
 1.1325149536132812,
 0.9329811334609985,
 0.6135732531547546]

In [None]:
# Second run: start again from three freshly initialised students.
student1_v2, student2_v2, student3_v2 = (
    get_student1(),
    get_student2(),
    get_student3(),
)

distiller = Distiller(
    teacher=teacher,
    student1=student1_v2,
    student2=student2_v2,
    student3=student3_v2,
)

# One accuracy tracker per student, in student order.
metric_student1, metric_student2, metric_student3 = [
    keras.metrics.SparseCategoricalAccuracy(name=metric_name)
    for metric_name in ('s1_acc', 's2_acc', 's3_acc')
]

distiller.compile(
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    metrics=[metric_student1, metric_student2, metric_student3],
    optimizer=keras.optimizers.Adam(),
    temperature=10,
    alpha=0.1,
)

# Train the new students against the teacher and each other for 5 epochs.
distiller.fit(x_train, y_train, epochs=5)

# Report the students' performance on the held-out test split.
distiller.evaluate(x_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.5608307719230652,
 0.47388461232185364,
 0.7377307415008545,
 0.8109766244888306,
 1.2295775413513184,
 0.5317625999450684]

In [None]:
# Score the current distiller's students on the test split once more.
distiller.evaluate(x=x_test, y=y_test)



[0.565929651260376,
 0.4767037034034729,
 0.7402999997138977,
 0.8109766244888306,
 1.2295775413513184,
 0.5317625999450684]

In [None]:
# Persist each student of the second run in its own directory.
for trained_student, export_path in (
    (student1_v2, '/saved_models/peers1_v2'),
    (student2_v2, '/saved_models/peers2_v2'),
    (student3_v2, '/saved_models/peers3_v2'),
):
    trained_student.save(export_path)

INFO:tensorflow:Assets written to: /content/drive/MyDrive/Project/models/saved_models/peers1_v2/assets
INFO:tensorflow:Assets written to: /content/drive/MyDrive/Project/models/saved_models/peers2_v2/assets
INFO:tensorflow:Assets written to: /content/drive/MyDrive/Project/models/saved_models/peers3_v2/assets


In [None]:
# Read the three saved students of the second run back from disk.
student1_test, student2_test, student3_test = [
    tf.keras.models.load_model(saved_path)
    for saved_path in (
        '/saved_models/peers1_v2',
        '/saved_models/peers2_v2',
        '/saved_models/peers3_v2',
    )
]

# Wrap the reloaded students in a fresh distiller that is only evaluated.
distiller_test = Distiller(
    teacher=teacher,
    student1=student1_test,
    student2=student2_test,
    student3=student3_test,
)

# Fresh accuracy trackers, one per student.
metric_student1, metric_student2, metric_student3 = [
    keras.metrics.SparseCategoricalAccuracy(name=metric_name)
    for metric_name in ('s1_acc', 's2_acc', 's3_acc')
]

distiller_test.compile(
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    metrics=[metric_student1, metric_student2, metric_student3],
    optimizer=keras.optimizers.Adam(),
    temperature=10,
    alpha=0.1,
)

# Score the reloaded students on the test split.
distiller_test.evaluate(x_test, y_test)



[0.6984999775886536,
 0.550000011920929,
 0.8070999979972839,
 0.8109766244888306,
 1.2295775413513184,
 0.5317625999450684]

In [None]:
# Third run: 10 more epochs on `student1`, `student2` and `student3`.
# NOTE(review): these are the same model objects that the first training
# cell already fitted for 5 epochs, so (when the cells run top to bottom)
# this continues from those weights instead of starting from scratch -
# confirm that this is intended.
distiller = Distiller(
    teacher=teacher,
    student1=student1,
    student2=student2,
    student3=student3,
)

# One accuracy tracker per student, in student order.
metric_student1, metric_student2, metric_student3 = [
    keras.metrics.SparseCategoricalAccuracy(name=metric_name)
    for metric_name in ('s1_acc', 's2_acc', 's3_acc')
]

distiller.compile(
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    metrics=[metric_student1, metric_student2, metric_student3],
    optimizer=keras.optimizers.Adam(),
    temperature=10,
    alpha=0.1,
)

# Train the students against the teacher and each other for 10 epochs.
distiller.fit(x_train, y_train, epochs=10)

# Report the students' performance on the held-out test split.
distiller.evaluate(x_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6349921822547913,
 0.690374493598938,
 0.8398725390434265,
 0.5361946225166321,
 2.4807844161987305,
 0.5392907857894897]