# 📊 Performance Comparison of Four 10-Class Classifiers via Knowledge Distillation


In this notebook, we evaluate four 10-class classifiers:

1. **Small CNN trained from scratch** on CIFAR-10.
2. **Transfer learning using a pre-trained VGG16** with the last 2 layers fine-tuned.
3. **Transfer learning using a pre-trained MobileNetV2** with the last 2 layers fine-tuned.
4. **Knowledge Distillation**:
   - From single teacher (best of VGG16 or MobileNetV2) → Small CNN
   - From both teachers jointly → Small CNN


In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D, Resizing
from tensorflow.keras.applications import VGG16, MobileNetV2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import CategoricalCrossentropy, KLDivergence
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


2025-07-28 12:11:29.829448: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-28 12:11:29.897981: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-28 12:11:29.906932: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the CIFAR-10 train/test split bundled with Keras.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
# Scale pixels to [0, 1] in float32. Dividing the uint8 arrays directly would
# promote them to float64, doubling the memory footprint of the image arrays
# for precision the float32 models cannot use.
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0
# One-hot encode the integer labels for the categorical cross-entropy losses used below.
y_train_cat, y_test_cat = to_categorical(y_train, 10), to_categorical(y_test, 10)
# Display names for the ten classes, in label-index order (used for plot labels).
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']


## 🔧 Step 1: Train Small CNN from Scratch

In [3]:
def create_student_model(input_shape=(32, 32, 3), num_classes=10):
    """Build the small CNN used as the baseline and as the distillation student.

    Architecture: one 3x3 conv block (conv -> batch norm -> max pool -> dropout)
    followed by a 128-unit dense layer and a softmax classification head.

    Args:
        input_shape: Shape of a single input image, without the batch axis.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        An uncompiled Keras ``Model`` named "SmallCNN" that outputs class
        probabilities.
    """
    image_in = Input(shape=input_shape)

    # The network is a plain chain, so declare the layers in order and
    # thread the tensor through them.
    pipeline = [
        Conv2D(32, 3, padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling2D(),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]

    tensor = image_in
    for layer in pipeline:
        tensor = layer(tensor)

    return Model(image_in, tensor, name="SmallCNN")

# Baseline: train the small CNN on hard labels only, holding out the last 20%
# of the training set for validation.
student = create_student_model()
student.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
history_student = student.fit(
    x_train,
    y_train_cat,
    batch_size=128,
    epochs=30,
    validation_split=0.2,
    verbose=0,
)
# Read the accuracy by name rather than by position in the results list.
student_acc = student.evaluate(x_test, y_test_cat, verbose=0, return_dict=True)['accuracy']
print("Student Baseline Accuracy:", student_acc)


2025-07-28 12:11:36.485369: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: tesla
2025-07-28 12:11:36.485436: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: tesla
2025-07-28 12:11:36.485677: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
2025-07-28 12:11:36.485763: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 575.64.3
2025-07-28 12:11:37.552438: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 491520000 exceeds 10% of free system memory.


Student Baseline Accuracy: 0.6284999847412109


## 🧠 Step 2: Fine-tune Pretrained CNNs

In [4]:
def create_teacher_model(base_model_class, model_name="Teacher", layers_to_unfreeze=2, num_classes=10):
    """Build a transfer-learning teacher on top of an ImageNet-pretrained backbone.

    The 32x32 input is upsampled to 48x48, passed through the backbone, globally
    average-pooled and classified by a small dense head.

    Args:
        base_model_class: A ``tf.keras.applications`` constructor (e.g. ``VGG16``)
            accepting ``include_top``, ``weights`` and ``input_shape``.
        model_name: Name given to the returned model.
        layers_to_unfreeze: Number of trailing backbone layers left trainable.
            ``0`` (or a negative value) freezes the whole backbone; values larger
            than the backbone depth unfreeze all of it. Layers without weights
            (pooling, activations) count towards this number.
        num_classes: Width of the softmax output layer.

    Returns:
        An uncompiled Keras ``Model`` mapping (32, 32, 3) images to class
        probabilities.

    NOTE(review): images are forwarded to the backbone exactly as supplied. The
    pretrained weights were presumably trained with each application's own
    ``preprocess_input`` scaling -- confirm the caller's pixel range is suitable.
    """
    base_model = base_model_class(include_top=False, weights='imagenet', input_shape=(48, 48, 3))

    # A layer nested inside a non-trainable model contributes no trainable
    # weights, whatever its own `trainable` flag says. So the backbone must stay
    # trainable and the *leading* layers be frozen one by one; freezing the
    # backbone and re-enabling its tail would silently train nothing in it.
    total_layers = len(base_model.layers)
    # Clamp so that 0 means "freeze everything" (a bare [-0:] slice selects all layers).
    unfrozen = min(max(int(layers_to_unfreeze), 0), total_layers)
    base_model.trainable = unfrozen > 0
    for layer in base_model.layers[:total_layers - unfrozen]:
        layer.trainable = False

    inputs = Input(shape=(32, 32, 3))
    x = Resizing(48, 48)(inputs)
    # training=False keeps any BatchNormalization layers in the backbone in
    # inference mode, so fine-tuning cannot overwrite their pretrained statistics.
    x = base_model(x, training=False)
    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    return Model(inputs, outputs, name=model_name)


In [5]:
# Teacher 1: VGG16 backbone, trained with a small learning rate (1e-4).
teacher_vgg = create_teacher_model(VGG16, "Teacher_VGG16")
teacher_vgg.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(1e-4),
)
teacher_vgg.fit(
    x_train,
    y_train_cat,
    batch_size=128,
    epochs=30,
    validation_split=0.2,
    verbose=0,
)
# Read the accuracy by name rather than by position in the results list.
acc_vgg = teacher_vgg.evaluate(x_test, y_test_cat, verbose=0, return_dict=True)['accuracy']
print("Teacher VGG16 Accuracy:", acc_vgg)


2025-07-28 12:31:41.606859: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 491520000 exceeds 10% of free system memory.


KeyboardInterrupt: 

In [None]:
# Teacher 2: MobileNetV2 backbone, trained with a small learning rate (1e-4).
teacher_mobile = create_teacher_model(MobileNetV2, "Teacher_MobileNetV2")
teacher_mobile.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(1e-4),
)
teacher_mobile.fit(
    x_train,
    y_train_cat,
    batch_size=128,
    epochs=30,
    validation_split=0.2,
    verbose=0,
)
# Read the accuracy by name rather than by position in the results list.
acc_mobile = teacher_mobile.evaluate(x_test, y_test_cat, verbose=0, return_dict=True)['accuracy']
print("Teacher MobileNetV2 Accuracy:", acc_mobile)


## 🔄 Step 3: Knowledge Distillation

In [None]:
class Distiller(Model):
    """Train a student network to imitate one or more teacher networks.

    Each training step minimises

        alpha * student_loss(y, student) + (1 - alpha) * T**2 * distillation_loss(teacher_T, student_T)

    where ``teacher_T`` / ``student_T`` are the temperature-softened class
    distributions and the teacher distribution is the mean over all teachers.

    Both the student and the teachers are expected to output class
    *probabilities* (softmax already applied), as the models in this notebook do.
    Only the student's variables are updated.
    """

    def __init__(self, student, teachers):
        """Store the student and normalise ``teachers`` to a non-empty list.

        Args:
            student: Keras model to be trained.
            teachers: A single Keras model, or a list/tuple of models whose
                predictions are averaged.

        Raises:
            ValueError: If ``teachers`` is an empty list/tuple.
        """
        super().__init__()
        self.student = student
        self.teachers = list(teachers) if isinstance(teachers, (list, tuple)) else [teachers]
        if not self.teachers:
            raise ValueError("Distiller needs at least one teacher model.")
        # Running mean of the loss: the combined loss during fit(), the student's
        # hard-label loss during evaluate() / validation.
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Tracked metrics with the loss first, followed by the compiled metrics.

        Listing them here lets Keras reset them between epochs and keeps the
        loss ahead of the compiled metrics in ``metrics_names``.
        """
        others = [m for m in super().metrics if m is not self.loss_tracker]
        return [self.loss_tracker] + others

    def compile(self, optimizer, metrics, student_loss_fn, distillation_loss_fn, alpha=0.1, temperature=3):
        """Configure the distiller.

        Args:
            optimizer: Optimizer applied to the student's variables.
            metrics: Metrics computed on the student's predictions.
            student_loss_fn: Loss between the true labels and the student's output.
            distillation_loss_fn: Loss between the softened teacher and student
                distributions, called as ``fn(teacher, student)``.
            alpha: Weight of the student loss; the distillation loss gets ``1 - alpha``.
            temperature: Softening temperature; must be positive.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive.")
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def call(self, inputs, training=False):
        """Forward pass: the distiller predicts exactly what its student predicts."""
        return self.student(inputs, training=training)

    def _soften(self, probs):
        """Apply temperature scaling to a probability distribution.

        The models emit probabilities, not logits. ``log(p)`` equals the logits up
        to a per-sample constant, which softmax ignores, so
        ``softmax(log(p) / T)`` matches ``softmax(logits / T)``. Feeding the
        probabilities themselves to softmax would squash every distribution
        towards uniform and leave almost no distillation signal.
        """
        log_probs = tf.math.log(tf.clip_by_value(probs, tf.keras.backend.epsilon(), 1.0))
        return tf.nn.softmax(log_probs / self.temperature, axis=-1)

    def train_step(self, data):
        """Run one optimisation step on the student and return the metric logs."""
        x, y = data
        # Teachers only provide targets: inference mode, outside the gradient tape.
        teacher_preds = [t(x, training=False) for t in self.teachers]
        avg_teacher_preds = tf.reduce_mean(tf.stack(teacher_preds, axis=0), axis=0)
        soft_targets = self._soften(avg_teacher_preds)

        with tf.GradientTape() as tape:
            student_preds = self.student(x, training=True)
            s_loss = self.student_loss_fn(y, student_preds)
            # Soft-target gradients shrink as 1/T**2; rescale so the balance set
            # by alpha does not drift with the temperature.
            d_loss = self.distillation_loss_fn(soft_targets, self._soften(student_preds))
            d_loss = d_loss * (self.temperature ** 2)
            loss = self.alpha * s_loss + (1 - self.alpha) * d_loss

        grads = tape.gradient(loss, self.student.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.student.trainable_variables))

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, student_preds)
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        """Evaluate the student on one batch.

        Only the student is run (no teacher forward pass); the reported loss is
        the student's hard-label loss.
        """
        x, y = data
        student_preds = self.student(x, training=False)
        self.loss_tracker.update_state(self.student_loss_fn(y, student_preds))
        self.compiled_metrics.update_state(y, student_preds)
        return {m.name: m.result() for m in self.metrics}


In [None]:
# Distil from whichever teacher scored higher on the test set.
best_teacher = teacher_mobile if acc_mobile > acc_vgg else teacher_vgg
student_d1 = create_student_model()
distiller1 = Distiller(student_d1, best_teacher)
distiller1.compile(optimizer='adam', metrics=['accuracy'],
                   student_loss_fn=CategoricalCrossentropy(),
                   distillation_loss_fn=KLDivergence(), alpha=0.1, temperature=10)

# Train on the first 80% of the training set -- the same samples that
# validation_split=0.2 leaves to the other models -- so the comparison is fair,
# and keep the test set out of fit() entirely: it is reserved for the final score.
n_fit = int(len(x_train) * 0.8)
distiller1.fit(x_train[:n_fit], y_train_cat[:n_fit], epochs=30, batch_size=128, verbose=0)

# Score the distilled student itself, exactly like the baseline student, instead
# of depending on the distiller wrapper's evaluation logic and result ordering.
# compile() only attaches the loss/metric; it does not touch the trained weights.
student_d1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
acc_d1 = student_d1.evaluate(x_test, y_test_cat, verbose=0)[1]
print("Single-Teacher Distillation Accuracy:", acc_d1)


In [None]:
# Distil from both teachers at once; the Distiller averages their predictions.
student_d2 = create_student_model()
distiller2 = Distiller(student_d2, [teacher_vgg, teacher_mobile])
distiller2.compile(optimizer='adam', metrics=['accuracy'],
                   student_loss_fn=CategoricalCrossentropy(),
                   distillation_loss_fn=KLDivergence(), alpha=0.1, temperature=10)

# Train on the first 80% of the training set -- the same samples that
# validation_split=0.2 leaves to the other models -- so the comparison is fair,
# and keep the test set out of fit() entirely: it is reserved for the final score.
n_fit = int(len(x_train) * 0.8)
distiller2.fit(x_train[:n_fit], y_train_cat[:n_fit], epochs=30, batch_size=128, verbose=0)

# Score the distilled student itself, exactly like the baseline student, instead
# of depending on the distiller wrapper's evaluation logic and result ordering.
# compile() only attaches the loss/metric; it does not touch the trained weights.
student_d2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
acc_d2 = student_d2.evaluate(x_test, y_test_cat, verbose=0)[1]
print("Multi-Teacher Distillation Accuracy:", acc_d2)


In [None]:
# One (model, test accuracy) row per classifier, in the order they were trained.
df = pd.DataFrame(
    [
        ("Student (Scratch)", student_acc),
        ("Teacher VGG16", acc_vgg),
        ("Teacher MobileNetV2", acc_mobile),
        ("Distilled (Single Teacher)", acc_d1),
        ("Distilled (Multi-Teacher)", acc_d2),
    ],
    columns=["Model", "Accuracy"],
)
# Display a ranked copy; `df` itself keeps the training order for the plot below.
df.sort_values(by="Accuracy", ascending=False).reset_index(drop=True)


## 📈 Accuracy Comparison Visualization

In [None]:
import matplotlib.pyplot as plt

# Bar chart of test accuracy, one bar per model, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['#1f77b4', '#2ca02c', '#98df8a', '#ff7f0e', '#ffbb78']
bars = ax.bar(df['Model'], df['Accuracy'], color=colors, edgecolor='black')

# Annotate every bar with its accuracy, centred just above the top edge.
for rect in bars:
    top = rect.get_height()
    centre_x = rect.get_x() + rect.get_width() / 2
    ax.text(centre_x, top + 0.01, f"{top:.3f}", ha='center', va='bottom')

ax.set_title("Accuracy Comparison of Classifiers")
ax.set_ylabel("Accuracy")
ax.set_ylim(0, 1.0)
# Tilt the model names slightly so the long labels do not collide.
plt.setp(ax.get_xticklabels(), rotation=15)
ax.grid(axis='y', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()


## 🔍 Confusion Matrix (Student Model)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# Predict on test data with the final distilled model (multi-teacher).
# argmax turns the per-class probabilities into integer class predictions.
y_pred = np.argmax(student_d2.predict(x_test), axis=1)
# Flatten the label array to the 1-D form confusion_matrix expects.
y_true = y_test.flatten()

# Compute confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

# Plot onto axes we create ourselves. Without `ax`, ConfusionMatrixDisplay.plot
# opens its own default-sized figure, so a separately created plt.figure(...)
# would be left blank and its figsize would have no effect on the matrix.
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(ax=ax, cmap=plt.cm.Blues, xticks_rotation=45)
ax.set_title("Confusion Matrix - Final Distilled Student (Multi-Teacher)")
plt.show()
