<h2> Distillation Learning </h2>

Also known as teacher-student learning or model compression. It involves training a small, efficient model (i.e. the student model) to learn from a larger, more complex model (the teacher model).

<b>Applications of distillation learning</b>
<ul>
    <li>Model compression</li>
    <li>Transfer learning</li>
    <li>Ensemble learning</li>
    <li>Multi-task learning</li>
    <li>Language models</li>
    <li>Computer vision</li>
    <li>Anomaly detection</li>
    <li>Generative models</li>
</ul>

In [None]:
import tensorflow as tf

In [None]:
# Fetch MNIST as ((train images, train labels), (test images, test labels))
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Images: flatten each sample to a 784-long row, cast to float32, divide by 255
x_train = x_train.reshape(-1, 784)
x_train = x_train.astype('float32') / 255
x_test = x_test.reshape(-1, 784)
x_test = x_test.astype('float32') / 255

# Labels: convert the integer class ids to one-hot rows
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

In [None]:
#The teacher model -> a larger and more complex neural network
class TeacherModel(tf.keras.Model):
    """Fully connected classifier used as the teacher.

    Four ReLU Dense layers (512, 256, 128, 64 units) followed by a
    10-unit softmax output layer, applied in that order.
    """

    def __init__(self):
        super().__init__()
        # define the layers of the teacher model
        # input_shape must be a tuple: `(784)` is just the int 784, which
        # older tf.keras versions fail to convert with tuple().
        self.layer1 = tf.keras.layers.Dense(512, input_shape=(784,), activation='relu')
        self.layer2 = tf.keras.layers.Dense(256, activation='relu')
        self.layer3 = tf.keras.layers.Dense(128, activation='relu')
        self.layer4 = tf.keras.layers.Dense(64, activation='relu')
        self.last = tf.keras.layers.Dense(10, activation='softmax')

    def call(self, x):
        """Run the forward pass and return the softmax output of the last layer.

        x is passed straight to the first Dense layer (presumably a batch of
        784-feature rows, matching the input_shape above).
        """
        # forward pass for the teacher model
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return self.last(x)

In [None]:
#The student model  -> a smaller and more efficient neural network
class StudentModel(tf.keras.Model):
    """Fully connected classifier used as the student.

    Two ReLU Dense layers (512, 256 units) followed by a 10-unit softmax
    output layer.
    """

    def __init__(self):
        super().__init__()
        # define the layers of the student model
        # input_shape must be a tuple: `(784)` is just the int 784, which
        # older tf.keras versions fail to convert with tuple().
        self.layer1 = tf.keras.layers.Dense(512, input_shape=(784,), activation='relu')
        self.layer2 = tf.keras.layers.Dense(256, activation='relu')
        self.last = tf.keras.layers.Dense(10, activation='softmax')

    def call(self, x):
        """Run the forward pass and return the softmax output of the last layer."""
        x = self.layer1(x)
        # Feed the previous activation forward; the original referenced an
        # undefined name `X` here, which raised NameError on the first call.
        x = self.layer2(x)
        return self.last(x)

In [None]:
# Instantiate the teacher and set up the training hyperparameters
teacher_model = TeacherModel()

# Loss and optimizer objects (default constructor arguments)
loss_fn = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

# Compile the teacher, tracking accuracy alongside the loss
teacher_model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=['accuracy'],
)

# Fit on the training split for 5 epochs, validating on the test split
teacher_model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(x_test, y_test),
)

In [None]:
#freeze the teacher model and use it to train the student model
student_model = StudentModel()

# The teacher only supplies targets from here on, so mark its layers non-trainable
for layer in teacher_model.layers:
    layer.trainable = False

#define the distillation loss function
temp = 5
def distillation_loss(y_true,y_pred):
    """Cross-entropy between temperature-softened targets and predictions.

    Both arguments are treated as class probabilities (each model ends in a
    softmax layer). Temperature scaling is defined on logits, so the
    probabilities are mapped back to log space first:
    softmax(log(p) / temp) equals softmax(logits / temp) when
    p = softmax(logits). Applying softmax directly to p / temp, as before,
    squashes every distribution towards uniform.

    Args:
        y_true: target probabilities (teacher predictions, or one-hot labels
            during validation).
        y_pred: probabilities predicted by the student.

    Returns:
        Scalar tensor: mean categorical cross-entropy over the batch.
    """
    y_true = tf.cast(y_true, y_pred.dtype)
    # Clip before the log so exact zeros (e.g. one-hot labels) stay finite.
    eps = tf.keras.backend.epsilon()
    soft_true = tf.nn.softmax(tf.math.log(tf.clip_by_value(y_true, eps, 1.0)) / temp)
    soft_pred = tf.nn.softmax(tf.math.log(tf.clip_by_value(y_pred, eps, 1.0)) / temp)
    return tf.reduce_mean(tf.keras.losses.categorical_crossentropy(soft_true, soft_pred))

#train the student model
# Use a fresh optimizer: the `optimizer` instance above already holds state
# for the teacher's variables and should not be shared with another model.
student_optimizer = tf.keras.optimizers.Adam()
student_model.compile(optimizer=student_optimizer,loss=distillation_loss,metrics=['accuracy'])

# The teacher's predictions on the training set are the soft targets.
soft_targets = teacher_model.predict(x_train)
student_model.fit(x_train,soft_targets,epochs=5,batch_size=32,validation_data=(x_test,y_test))

In [None]:
# Evaluate the student on the test split; evaluate() returns the loss followed
# by the compiled metrics, which here is just accuracy.
test_loss,test_acc = student_model.evaluate(x_test,y_test)
# Print the measured value; the original printed only a label.
print('Test Accuracy:', test_acc)

# Calculating Self Information

In [1]:
from mxnet import np

ModuleNotFoundError: No module named 'mxnet'