# Project B: Knowledge Distillation for Building Lightweight Deep Learning Models in Visual Classification Tasks

In [1]:
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
from typing import Union

tf.enable_v2_behavior()

BATCH_SIZE = 32
NUM_EPOCHS = 10

  from .autonotebook import tqdm as notebook_tqdm


# Data loading

In [11]:
import pandas as pd

# Load train and test splits.
anno_dir = r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\mhist_dataset\annotations.csv' #you should change to your directory
df = pd.read_csv(anno_dir)
df = df[['Image Name','Majority Vote Label','Partition']]
df

Unnamed: 0,Image Name,Majority Vote Label,Partition
0,MHIST_aaa.png,SSA,train
1,MHIST_aab.png,HP,train
2,MHIST_aac.png,SSA,train
3,MHIST_aae.png,HP,train
4,MHIST_aaf.png,SSA,train
...,...,...,...
3147,MHIST_cpn.png,SSA,train
3148,MHIST_cfc.png,SSA,test
3149,MHIST_cgp.png,SSA,test
3150,MHIST_dlf.png,SSA,train


In [15]:
import shutil

hp_test_dir = r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\mhist_dataset\images_sorted\test\HP_test'
hp_train_dir = r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\mhist_dataset\images_sorted\train\HP_train'
ssa_test_dir = r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\mhist_dataset\images_sorted\test\SSA_test'
ssa_train_dir = r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\mhist_dataset\images_sorted\train\SSA_train'

img_root_dir = r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\mhist_dataset\images\images'

for i, row in df.iterrows():
    img_dir = img_root_dir + '\\' + row['Image Name']
    if row['Majority Vote Label'] == 'SSA':
        if row['Partition'] == 'train':
            new_dir = ssa_train_dir + '\\' + row['Image Name']
            shutil.copyfile(img_dir,new_dir)
        else:
            new_dir = ssa_test_dir + '\\' + row['Image Name']
            shutil.copyfile(img_dir,new_dir)
    else:
        if row['Partition'] == 'train':
            new_dir = hp_train_dir + '\\' + row['Image Name']
            shutil.copyfile(img_dir,new_dir)
        else:
            new_dir = hp_test_dir + '\\' + row['Image Name']
            shutil.copyfile(img_dir,new_dir)
    #break

In [2]:
from keras.preprocessing.image import ImageDataGenerator

test_dir = r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\mhist_dataset\images_sorted\test'
train_dir = r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\mhist_dataset\images_sorted\train'

train_datagen = ImageDataGenerator(rescale=1/255.,
shear_range=0.1,
rotation_range=15,
zoom_range=0.2, # zoom
horizontal_flip=True,
vertical_flip=True)

test_datagen = ImageDataGenerator(rescale=1/255.)

train_generator = train_datagen.flow_from_directory(train_dir,
class_mode='binary',
interpolation='bilinear',
target_size=(224, 224),
batch_size=32,
shuffle=True,)

test_generator = test_datagen.flow_from_directory(test_dir,
class_mode='binary',
interpolation='bilinear',
target_size=(224, 224),
batch_size=32,
shuffle=False)

Found 2175 images belonging to 2 classes.
Found 977 images belonging to 2 classes.


In [3]:
import math


# Model creation

In [215]:
import keras
from tensorflow.keras import layers

resnet = tf.keras.applications.resnet_v2.ResNet50V2(weights='imagenet', include_top=False)
resnet.trainable = False

inputs = keras.Input(shape=(244,244,3))
x = resnet(inputs, training=False)
x= layers.GlobalAveragePooling2D()(x)
outputs = layers.Dense(1)(x)


teacher_model = keras.Model(inputs, outputs)
teacher_model.summary()

Model: "model_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_58 (InputLayer)        [(None, 244, 244, 3)]     0         
_________________________________________________________________
resnet50v2 (Functional)      (None, None, None, 2048)  23564800  
_________________________________________________________________
global_average_pooling2d_23  (None, 2048)              0         
_________________________________________________________________
dense_57 (Dense)             (None, 1)                 2049      
Total params: 23,566,849
Trainable params: 2,049
Non-trainable params: 23,564,800
_________________________________________________________________


# Student loss function

In [22]:
#@test {"output": "ignore"}

# Hyperparameters for distillation (need to be tuned).
ALPHA = 0.5 # task balance between cross-entropy and distillation loss
DISTILLATION_TEMPERATURE = 4. #temperature hyperparameter

def distillation_loss(teacher_logits: tf.Tensor, student_logits: tf.Tensor,
                      temperature: Union[float, tf.Tensor]):
  """Compute distillation loss.

  This function computes cross entropy between softened logits and softened
  targets. The resulting loss is scaled by the squared temperature so that
  the gradient magnitude remains approximately constant as the temperature is
  changed. For reference, see Hinton et al., 2014, "Distilling the knowledge in
  a neural network."

  Args:
    teacher_logits: A Tensor of logits provided by the teacher.
    student_logits: A Tensor of logits provided by the student, of the same
      shape as `teacher_logits`.
    temperature: Temperature to use for distillation.

  Returns:
    A scalar Tensor containing the distillation loss.
  """
 # your code start from here for step 3
  soft_targets = tf.nn.sigmoid(teacher_logits)

  return tf.reduce_mean(
      tf.nn.sigmoid_cross_entropy_with_logits(
          soft_targets, student_logits / temperature)) * temperature ** 2

def compute_student_loss(images, labels):
  """Compute class knowledge distillation student loss for given images
     and labels.

  Args:
    images: Tensor representing a batch of images.
    labels: Tensor representing a batch of labels.

  Returns:
    Scalar loss Tensor.
  """
  student_class_logits = student_model(images, training=True)

  # Compute class distillation loss between student class logits and
  # softened teacher class targets probabilities.

  # your code start from here for step 3

  teacher_class_logits = teacher_model(images, training=False)
  distillation_loss_value = distillation_loss(teacher_class_logits, student_class_logits, DISTILLATION_TEMPERATURE)

  # Compute cross-entropy loss with hard targets.

  # your code start from here for step 3
  loss = keras.losses.BinaryCrossentropy(from_logits=True)
  
  cross_entropy_loss_value = loss(labels,student_class_logits).numpy()

  return ALPHA*cross_entropy_loss_value + (1-ALPHA)*distillation_loss_value

# Training models

In [216]:
LEARNING_RATE = 0.001
teacher_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [217]:
teacher_model.fit(train_generator,
                    steps_per_epoch=train_generator.n // BATCH_SIZE,
                    epochs=10,                  
                    shuffle = True,
                    verbose=1,
                    validation_data=test_generator,
                    validation_steps=test_generator.n // BATCH_SIZE,)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x211a580b5b0>

In [218]:
teacher_model.save(r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\models\mhist_model_init.h5')

In [220]:
resnet.trainable = True

LEARNING_RATE = 0.0001
teacher_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
teacher_model.fit(train_generator,
                    steps_per_epoch=train_generator.n // BATCH_SIZE,
                    epochs=25,                  
                    shuffle = True,
                    verbose=1,
                    validation_data=test_generator,
                    validation_steps=test_generator.n // BATCH_SIZE,)

In [None]:
teacher_model.save(r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\models\mhist_model.h5')



In [223]:
print("Evaluate on test data")
results = teacher_model.evaluate_generator(generator=test_generator)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [0.34587690234184265, 0.8526100516319275]


In [238]:


# Build student. NO KD

mobilenet = tf.keras.applications.mobilenet_v2.MobileNetV2(input_shape=(244,244,3), weights='imagenet', include_top=False)
mobilenet.trainable = False

inputs2 = keras.Input(shape=(244,244,3))
y = mobilenet(inputs2, training=False)
y= layers.GlobalAveragePooling2D()(y)
outputs2 = layers.Dense(1)(y)

student_model = keras.Model(inputs2, outputs2)
student_model.summary()


# 

LEARNING_RATE = 0.01
student_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

Model: "model_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_66 (InputLayer)        [(None, 244, 244, 3)]     0         
_________________________________________________________________
mobilenetv2_1.00_224 (Functi (None, 8, 8, 1280)        2257984   
_________________________________________________________________
global_average_pooling2d_27  (None, 1280)              0         
_________________________________________________________________
dense_61 (Dense)             (None, 1)                 1281      
Total params: 2,259,265
Trainable params: 1,281
Non-trainable params: 2,257,984
_________________________________________________________________


In [239]:
student_model.fit(train_generator,
                    steps_per_epoch=train_generator.n // BATCH_SIZE,
                    epochs=10,                  
                    shuffle = True,
                    verbose=1,
                    validation_data=test_generator,
                    validation_steps=test_generator.n // BATCH_SIZE,)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x211c712cb20>

In [208]:
student_model.save(r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\models\mhist_model_stu_init.h5')



In [240]:
mobilenet.trainable = True

LEARNING_RATE = 0.02
student_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [242]:
student_model.fit(train_generator,
                    steps_per_epoch=train_generator.n // BATCH_SIZE,
                    epochs=25,                  
                    shuffle = True,
                    verbose=1,
                    validation_data=test_generator,
                    validation_steps=test_generator.n // BATCH_SIZE)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25

KeyboardInterrupt: 

In [213]:
student_model.save(r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\models\mhist_model_stu_fine.h5')

In [214]:
student_model = (r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\models\mhist_model_stu_fine.h5')

In [222]:
teacher_model = keras.models.load_model(r'D:\Github Repos\ECE1512\-ECE1512_2022W_ProjectRepo_StephanieDiNunzio\Project B\Project_B_Supp\models\mhist_model.h5')

In [9]:
# Hyperparameters for distillation (need to be tuned).
ALPHA = 0.5 # task balance between cross-entropy and distillation loss
DISTILLATION_TEMPERATURE = 2.5 #temperature hyperparameter


# Build student.

mobilenet2 = tf.keras.applications.mobilenet_v2.MobileNetV2(input_shape=(244,244,3), weights='imagenet', include_top=False)
mobilenet2.trainable = False

inputs3 = keras.Input(shape=(244,244,3))
z = mobilenet(inputs3, training=False)
z= layers.GlobalAveragePooling2D()(z)
outputs3 = layers.Dense(1)(z)

student_model_KD = keras.Model(inputs3, outputs3)
student_model_KD.summary()


# 

LEARNING_RATE = 0.001
student_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

0.9857
