In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets

# Pick the compute device. The `torch.accelerator` namespace is not present in
# every PyTorch build, so probe for it instead of assuming it exists; when it is
# missing fall back to CUDA, and finally to the CPU.
_accelerator = getattr(torch, "accelerator", None)
if _accelerator is not None and _accelerator.is_available():
    device = _accelerator.current_accelerator().type
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Preprocessing for FashionMNIST: convert each image to a tensor, then
# normalize it. The images are grayscale, so mean and std are single values.
transforms_mnist = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.2860], std=[0.3530]),  # typical mean/std for FashionMNIST
])

# Load the FashionMNIST train and test splits (downloaded into ./data when
# needed). Both splits use the same preprocessing pipeline.
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transforms_mnist)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transforms_mnist)

# Dataloaders: batches of 128 with 2 loader workers each. Only the training
# loader shuffles; the test loader keeps a fixed order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

# Adjusted DeepNN and LightNN for single channel input (1 channel instead of 3)

class DeepNN(nn.Module):
    """Larger CNN used as the teacher network.

    Four 3x3 convolutions (1 -> 128 -> 64 -> 64 -> 32 channels) with a 2x2
    max-pool after the second and fourth, followed by a two-layer fully
    connected head that produces ``num_classes`` logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 128, kernel_size=3, padding=1),  # single-channel input
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        head = [
            # 28x28 inputs halved twice by the pools -> 7x7 maps with 32 channels
            nn.Linear(32 * 7 * 7, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        feature_maps = self.features(x)
        flattened = torch.flatten(feature_maps, 1)
        return self.classifier(flattened)

class LightNN(nn.Module):
    """Small CNN used as the student network.

    Two 3x3 convolutions (1 -> 16 -> 16 channels), each followed by ReLU and a
    2x2 max-pool, then a two-layer fully connected head that produces
    ``num_classes`` logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 16, kernel_size=3, padding=1),  # single-channel input
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        head = [
            # 28x28 inputs halved twice by the pools -> 7x7 maps with 16 channels
            nn.Linear(16 * 7 * 7, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        feature_maps = self.features(x)
        flattened = torch.flatten(feature_maps, 1)
        return self.classifier(flattened)

# Training and testing functions remain unchanged

def train(model, train_loader, epochs, learning_rate, device):
    """Train ``model`` in place with cross-entropy loss and the Adam optimizer.

    Args:
        model: network mapping a batch of inputs to class logits.
        train_loader: iterable of ``(inputs, labels)`` batches; must support ``len()``.
        epochs: number of full passes over ``train_loader``.
        learning_rate: learning rate handed to Adam.
        device: device the model and every batch are moved to.

    Prints the mean batch loss after every epoch and returns nothing.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=learning_rate)

    model.to(device)
    model.train()

    for epoch in range(epochs):
        running_loss = 0.0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            adam.zero_grad()
            batch_loss = loss_fn(model(batch_inputs), batch_labels)
            batch_loss.backward()
            adam.step()

            # .item() pulls the scalar out of the graph before accumulating it
            running_loss += batch_loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

def test(model, test_loader, device):
    model.to(device)
    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy

# Set random seed for reproducibility
torch.manual_seed(42)

# Initialize and train teacher (DeepNN) with plain cross-entropy, then record
# its test accuracy as the reference point for the students.
nn_deep = DeepNN(num_classes=10).to(device)
train(nn_deep, train_loader, epochs=10, learning_rate=0.001, device=device)
test_accuracy_deep = test(nn_deep, test_loader, device)

# Initialize student (LightNN). The seed is reset right before construction so
# that every student created this way starts from the same weights.
torch.manual_seed(42)
nn_light = LightNN(num_classes=10).to(device)

# Build a second student from the same seed (trained later with distillation)
# and print the norm of both first-layer weights as a sanity check that the
# two initializations match.
torch.manual_seed(42)
new_nn_light = LightNN(num_classes=10).to(device)
print("Norm of 1st layer of nn_light:", torch.norm(nn_light.features[0].weight).item())
print("Norm of 1st layer of new_nn_light:", torch.norm(new_nn_light.features[0].weight).item())

# Print total parameter counts (with thousands separators) for both networks
total_params_deep = "{:,}".format(sum(p.numel() for p in nn_deep.parameters()))
print(f"DeepNN parameters: {total_params_deep}")
total_params_light = "{:,}".format(sum(p.numel() for p in nn_light.parameters()))
print(f"LightNN parameters: {total_params_light}")

# Train student with cross-entropy only: this is the no-teacher baseline.
# NOTE: after this call nn_light holds trained weights, not the seed-42 init.
train(nn_light, train_loader, epochs=10, learning_rate=0.001, device=device)
test_accuracy_light_ce = test(nn_light, test_loader, device)

print(f"Teacher accuracy: {test_accuracy_deep:.2f}%")
print(f"Student accuracy: {test_accuracy_light_ce:.2f}%")

# Knowledge Distillation training function remains unchanged

def train_knowledge_distillation(teacher, student, train_loader, epochs, learning_rate, T, soft_target_loss_weight, ce_loss_weight, device):
    """Train ``student`` on hard labels plus the teacher's softened predictions.

    For every batch the loss is::

        soft_target_loss_weight * T**2 * KL(teacher_T || student_T)
        + ce_loss_weight * CrossEntropy(student_logits, labels)

    where ``teacher_T`` / ``student_T`` are the softmax distributions of the
    logits divided by the temperature ``T`` and the KL term is summed over
    classes and averaged over the batch.

    Args:
        teacher: network returning logits; run in eval mode without gradients.
        student: network returning logits; only its parameters are optimized.
        train_loader: iterable of ``(inputs, labels)`` batches; must support ``len()``.
        epochs: number of passes over ``train_loader``.
        learning_rate: learning rate handed to Adam.
        T: softmax temperature applied to both sets of logits.
        soft_target_loss_weight: weight of the distillation (KL) term.
        ce_loss_weight: weight of the hard-label cross-entropy term.
        device: device the models and every batch are moved to.

    Prints the mean batch loss after every epoch and returns nothing.
    """
    ce_loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    # Same device handling as train() and train_mse_loss(): the batches are
    # moved to `device`, so both networks must live there as well.
    teacher.to(device)
    student.to(device)
    teacher.eval()
    student.train()

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # The teacher only supplies targets, so no graph is needed for it
            with torch.no_grad():
                teacher_logits = teacher(inputs)

            student_logits = student(inputs)

            soft_targets = nn.functional.softmax(teacher_logits / T, dim=-1)
            soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

            # kl_div treats target entries that are exactly 0 as contributing 0.
            # Spelling the term out as target * (target.log() - input) yields
            # 0 * -inf = NaN once a teacher probability underflows to 0.
            # "batchmean" sums everything and divides by the batch size; the
            # T**2 factor keeps the soft-target gradient scale independent of T.
            soft_targets_loss = nn.functional.kl_div(soft_prob, soft_targets, reduction="batchmean") * (T**2)
            label_loss = ce_loss(student_logits, labels)

            loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

# Run knowledge distillation training: the trained teacher (nn_deep) guides the
# second seed-42 student (new_nn_light), which has not been trained so far.

train_knowledge_distillation(
    teacher=nn_deep,
    student=new_nn_light,
    train_loader=train_loader,
    epochs=10,
    learning_rate=0.001,
    T=2,
    soft_target_loss_weight=0.25,
    ce_loss_weight=0.75,
    device=device
)

# Evaluate the distilled student on the test split
test_accuracy_light_ce_and_kd = test(new_nn_light, test_loader, device)

# Side-by-side summary: teacher, cross-entropy-only student, distilled student
print(f"Teacher accuracy: {test_accuracy_deep:.2f}%")
print(f"Student accuracy without teacher: {test_accuracy_light_ce:.2f}%")
print(f"Student accuracy with CE + KD: {test_accuracy_light_ce_and_kd:.2f}%")


Using cuda device


100%|██████████| 26.4M/26.4M [00:02<00:00, 13.1MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 208kB/s]
100%|██████████| 4.42M/4.42M [00:01<00:00, 3.89MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 23.1MB/s]


Epoch 1/10, Loss: 0.45593280092612515
Epoch 2/10, Loss: 0.2588799115913763
Epoch 3/10, Loss: 0.2103206562335049
Epoch 4/10, Loss: 0.18105123428774794
Epoch 5/10, Loss: 0.1549719602171419
Epoch 6/10, Loss: 0.1327816787352567
Epoch 7/10, Loss: 0.11191160735418039
Epoch 8/10, Loss: 0.0959089513637745
Epoch 9/10, Loss: 0.07870940383134493
Epoch 10/10, Loss: 0.06781428760445829
Test Accuracy: 92.92%
Norm of 1st layer of nn_light: 2.3761112689971924
Norm of 1st layer of new_nn_light: 2.3761112689971924
DeepNN parameters: 938,922
LightNN parameters: 206,010
Epoch 1/10, Loss: 0.5243566697086098
Epoch 2/10, Loss: 0.3378505093265952
Epoch 3/10, Loss: 0.2906147592357481
Epoch 4/10, Loss: 0.2623042359725753
Epoch 5/10, Loss: 0.242506469220622
Epoch 6/10, Loss: 0.22283650443815728
Epoch 7/10, Loss: 0.20786556247264337
Epoch 8/10, Loss: 0.19598372306015446
Epoch 9/10, Loss: 0.18320772083583417
Epoch 10/10, Loss: 0.1710275296987628
Test Accuracy: 91.25%
Teacher accuracy: 92.92%
Student accuracy: 91.2

This section implements feature-based distillation on an intermediate layer. A regressor (a convolution layer applied in the student's forward pass) maps the student's intermediate feature map to the same dimensionality as the teacher's, and an MSE loss is then computed between the two feature maps. A cosine loss could be used instead of MSE.

I have some doubts about how effective or advantageous the regressor is.

Could you please share some reference material on this topic?

In [None]:
# Create a sample input tensor: one random batch of 128 single-channel 28x28
# images, moved to the selected device. It is only used to inspect shapes.
sample_input = torch.randn(128, 1, 28, 28).to(device)

# The shape checks below are disabled. They refer to modified_nn_light and
# modified_nn_deep, which are only created in a later cell of this notebook.

# Pass the input through the student
# logits, hidden_representation = modified_nn_light(sample_input)

# # Print the shapes of the tensors
# print("Student logits shape:", logits.shape) # batch_size x total_classes
# print("Student hidden representation shape:", hidden_representation.shape) # batch_size x hidden_representation_size

# # Pass the input through the teacher
# logits, hidden_representation = modified_nn_deep(sample_input)

# # Print the shapes of the tensors
# print("Teacher logits shape:", logits.shape) # batch_size x total_classes
# print("Teacher hidden representation shape:", hidden_representation.shape) # batch_size x hidden_representation_size

In [None]:
# Pass the sample input only through the convolutional feature extractors.
# Only the output shapes are inspected, so run under no_grad: otherwise both
# forward passes build an autograd graph that stays alive for as long as these
# module-level tensors do.
with torch.no_grad():
    convolutional_fe_output_student = nn_light.features(sample_input)
    convolutional_fe_output_teacher = nn_deep.features(sample_input)

# Print their shapes: the channel counts differ between student and teacher,
# which is why a regressor is needed before the feature maps can be compared.
print("Student's feature extractor output shape: ", convolutional_fe_output_student.shape)
print("Teacher's feature extractor output shape: ", convolutional_fe_output_teacher.shape)

Student's feature extractor output shape:  torch.Size([128, 16, 7, 7])
Teacher's feature extractor output shape:  torch.Size([128, 32, 7, 7])


In [None]:
class ModifiedDeepNNRegressor(nn.Module):
    """Teacher CNN whose forward pass also exposes its last conv feature map.

    The ``features`` / ``classifier`` layout mirrors ``DeepNN``, so a
    ``DeepNN`` state dict can be loaded into this class unchanged.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        head = [
            nn.Linear(32 * 7 * 7, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return ``(logits, conv_feature_map)`` for a batch of images ``x``."""
        conv_feature_map = self.features(x)
        logits = self.classifier(torch.flatten(conv_feature_map, 1))
        return logits, conv_feature_map

class ModifiedLightNNRegressor(nn.Module):
    """Student CNN with an extra regressor branch on its conv features.

    The regressor is a single 3x3 convolution that lifts the student's 16
    feature channels to 32 so its output can be compared with the teacher's
    feature map. The classifier still consumes the un-projected features.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        # Extra regressor: one convolution, 16 -> 32 channels, same spatial size
        self.regressor = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
        )

        head = [
            nn.Linear(16 * 7 * 7, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return ``(logits, regressor_output)`` for a batch of images ``x``."""
        conv_features = self.features(x)
        regressor_output = self.regressor(conv_features)
        logits = self.classifier(torch.flatten(conv_features, 1))
        return logits, regressor_output

In [None]:
def train_mse_loss(teacher, student, train_loader, epochs, learning_rate, feature_map_weight, ce_loss_weight, device):
    """Train ``student`` with cross-entropy plus an MSE feature-matching term.

    Both networks must return a ``(logits, feature_map)`` tuple. The teacher's
    logits are ignored; its feature map is the regression target for the
    student's second output.

    Args:
        teacher: frozen reference network (eval mode, no gradients).
        student: network being optimized with Adam.
        train_loader: iterable of ``(inputs, labels)`` batches; must support ``len()``.
        epochs: number of passes over ``train_loader``.
        learning_rate: learning rate handed to Adam.
        feature_map_weight: weight of the MSE feature-map term.
        ce_loss_weight: weight of the cross-entropy term.
        device: device the models and every batch are moved to.

    Prints the mean batch loss after every epoch and returns nothing.
    """
    label_criterion = nn.CrossEntropyLoss()
    feature_criterion = nn.MSELoss()
    adam = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.to(device)
    student.to(device)
    teacher.eval()   # teacher only provides targets
    student.train()  # student is the model being updated

    for epoch in range(epochs):
        running_loss = 0.0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            adam.zero_grad()

            # Only the teacher's feature map is used; its logits are dropped
            with torch.no_grad():
                teacher_feature_map = teacher(batch_inputs)[1]

            student_logits, regressor_feature_map = student(batch_inputs)

            feature_term = feature_criterion(regressor_feature_map, teacher_feature_map)
            label_term = label_criterion(student_logits, batch_labels)

            # Weighted sum of the feature-matching and hard-label losses
            batch_loss = feature_map_weight * feature_term + ce_loss_weight * label_term
            batch_loss.backward()
            adam.step()

            running_loss += batch_loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

# The test function below is the same as the one used in the previous case, except that the model now returns a tuple and only the logits are kept. We only care about the actual outputs because we measure accuracy.

def test_multiple_outputs(model, test_loader, device):
    model.to(device)
    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs, _ = model(inputs) # Disregard the second tensor of the tuple
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy

# Initialize a ModifiedLightNNRegressor. The seed is reset first, as for the
# other students in this notebook.
torch.manual_seed(42)
modified_nn_light_reg = ModifiedLightNNRegressor(num_classes=10).to(device)

# We do not have to train the modified deep network from scratch of course, we just load its weights from the trained instance
# (the two classes use the same features/classifier layout, so the state dict loads directly).
modified_nn_deep_reg = ModifiedDeepNNRegressor(num_classes=10).to(device)
modified_nn_deep_reg.load_state_dict(nn_deep.state_dict())

# Train the student with CE + feature-map MSE, then evaluate it with the
# tuple-aware test function.
train_mse_loss(teacher=modified_nn_deep_reg, student=modified_nn_light_reg, train_loader=train_loader, epochs=10, learning_rate=0.001, feature_map_weight=0.25, ce_loss_weight=0.75, device=device)
test_accuracy_light_ce_and_mse_loss = test_multiple_outputs(modified_nn_light_reg, test_loader, device)

Epoch 1/10, Loss: 0.6464882302385911
Epoch 2/10, Loss: 0.4519972195630389
Epoch 3/10, Loss: 0.4025828486947871
Epoch 4/10, Loss: 0.37031873355287986
Epoch 5/10, Loss: 0.34820735702382477
Epoch 6/10, Loss: 0.3313671725391071
Epoch 7/10, Loss: 0.3170622596735639
Epoch 8/10, Loss: 0.3026722513599945
Epoch 9/10, Loss: 0.29128484631270996
Epoch 10/10, Loss: 0.2833304999987962
Test Accuracy: 91.28%


In [None]:
# Summary of the accuracies recorded so far: teacher, cross-entropy-only
# student, logit-distilled student, and feature-regressor student.
print(f"Teacher accuracy: {test_accuracy_deep:.2f}%")
print(f"Student accuracy without teacher: {test_accuracy_light_ce:.2f}%")
print(f"Student accuracy with CE + KD: {test_accuracy_light_ce_and_kd:.2f}%")

print(f"Student accuracy with CE + RegressorMSE: {test_accuracy_light_ce_and_mse_loss:.2f}%")

Teacher accuracy: 92.92%
Student accuracy without teacher: 91.25%
Student accuracy with CE + KD: 91.46%
Student accuracy with CE + RegressorMSE: 91.28%


Relation-based distillation using the FSP concept and an MSE loss

An FSP matrix is computed from the feature vectors of the fully connected layer of the teacher and of the student, and the loss is the MSE between those two matrices.

I also use a regressor to project the teacher's fully connected features to the same dimensionality as the student's, since the two models have different architectures.

In [None]:
class ModifiedDeepNNCosine(nn.Module):
    """Teacher CNN that also returns the output of its first linear layer.

    Same convolutional stack as ``DeepNN``; the head is stored as separate
    ``fc1`` / ``fc2`` attributes so the pre-activation ``fc1`` output can be
    returned alongside the logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.fc1 = nn.Linear(32 * 7 * 7, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return ``(logits, feat1)`` where ``feat1`` is the raw ``fc1`` output."""
        flattened = torch.flatten(self.features(x), 1)
        feat1 = self.fc1(flattened)
        # ReLU and dropout are applied only on the path to the logits
        activated = self.dropout(self.relu(feat1))
        return self.fc2(activated), feat1


class ModifiedLightNNCosine(nn.Module):
    """Student CNN that also returns the output of its first linear layer.

    Same convolutional stack as ``LightNN``; the head is stored as separate
    ``fc1`` / ``fc2`` attributes so the pre-activation ``fc1`` output can be
    returned alongside the logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.fc1 = nn.Linear(16 * 7 * 7, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return ``(logits, feat1)`` where ``feat1`` is the raw ``fc1`` output."""
        flattened = torch.flatten(self.features(x), 1)
        feat1 = self.fc1(flattened)
        # ReLU and dropout are applied only on the path to the logits
        activated = self.dropout(self.relu(feat1))
        return self.fc2(activated), feat1


In [None]:
class TeacherRegressor(nn.Module):
    """Single linear projection from ``in_dim`` to ``out_dim`` features.

    With the defaults it maps 512-dimensional vectors to 256 dimensions.
    """

    def __init__(self, in_dim=512, out_dim=256):
        super().__init__()
        self.proj = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        """Apply the projection to ``x`` and return the result."""
        projected = self.proj(x)
        return projected


In [None]:
def compute_fsp_matrix(A, B):
    """Return the batch-averaged product of ``A`` transposed with ``B``.

    Args:
        A: tensor of shape (batch_size, d1).
        B: tensor of shape (batch_size, d2).

    Returns:
        Tensor of shape (d1, d2): ``A.T @ B`` divided by the batch size.
    """
    batch_size = A.shape[0]
    summed_products = A.T @ B
    return summed_products / batch_size

def train_relation_distillation(teacher, student, regressor, train_loader, epochs, learning_rate, alpha, ce_weight, device):
    """Train ``student`` with cross-entropy plus an MSE loss between FSP matrices.

    ``teacher`` and ``student`` must each return ``(logits, features)``. The
    teacher features are passed through ``regressor`` to bring them to the
    student's feature width. ``compute_fsp_matrix`` is then applied to each
    feature batch with itself, and the two resulting matrices are compared
    with MSE.

    Args:
        teacher: frozen reference network (eval mode, no gradients).
        student: network being optimized.
        regressor: projection applied to the teacher features; it is optimized
            together with the student.
        train_loader: iterable of ``(images, labels)`` batches; must support ``len()``.
        epochs: number of passes over ``train_loader``.
        learning_rate: learning rate handed to Adam.
        alpha: weight of the FSP/MSE term.
        ce_weight: weight of the cross-entropy term.
        device: device the modules and every batch are moved to.

    Prints the mean batch loss after every epoch and returns nothing.
    """
    criterion = nn.CrossEntropyLoss()
    # Loss modules are stateless, so build the MSE criterion once rather than
    # once per batch.
    relation_criterion = nn.MSELoss()
    optimizer = optim.Adam(list(student.parameters()) + list(regressor.parameters()), lr=learning_rate)

    # Same device handling as train() and train_mse_loss(): the batches are
    # moved to `device`, so every module must live there as well.
    teacher.to(device)
    student.to(device)
    regressor.to(device)

    teacher.eval()
    student.train()
    regressor.train()

    for epoch in range(epochs):
        total_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Only the teacher's features are needed; its logits are discarded
            with torch.no_grad():
                _, teacher_feat = teacher(images)

            student_logits, student_feat = student(images)

            # Project teacher_feat to match student_feat dimensionality.
            # NOTE(review): the regressor is trainable and sits on the target
            # side, so the "teacher" matrix below can move towards the student
            # as well as the other way round. Confirm this is intended.
            teacher_feat_projected = regressor(teacher_feat)

            # Relation matrices: each feature batch against itself
            fsp_teacher = compute_fsp_matrix(teacher_feat_projected, teacher_feat_projected)
            fsp_student = compute_fsp_matrix(student_feat, student_feat)

            mse_loss = relation_criterion(fsp_student, fsp_teacher)
            ce_loss = criterion(student_logits, labels)

            loss = alpha * mse_loss + ce_weight * ce_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(train_loader):.4f}")


In [None]:
def test(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs, _ = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy


In [None]:
# Earlier setup, kept commented out for reference. A direct load_state_dict
# from nn_deep would not line up here: ModifiedDeepNNCosine stores its linear
# layers as fc1/fc2, while DeepNN keeps them under classifier.*. Presumably
# that is why the manual copy below replaced it.
# # Assuming nn_deep is already trained and available
# modified_nn_deep = ModifiedDeepNNCosine(num_classes=10).to(device)
# modified_nn_deep.load_state_dict(nn_deep.state_dict())

# # Fresh student model
# torch.manual_seed(42)
# modified_nn_light = ModifiedLightNNCosine(num_classes=10).to(device)

# # Initialize a ModifiedLightNNRegressor
# torch.manual_seed(42)
# modified_nn_light_reg = ModifiedLightNNRegressor(num_classes=10).to(device)

# # We do not have to train the modified deep network from scratch of course, we just load its weights from the trained instance
# modified_nn_deep_reg = ModifiedDeepNNRegressor(num_classes=10).to(device)
# modified_nn_deep_reg.load_state_dict(nn_deep.state_dict())

# Teacher, step 1: initialize ModifiedDeepNNCosine
modified_nn_deep = ModifiedDeepNNCosine(num_classes=10).to(device)

# Teacher, step 2: copy the trained weights from nn_deep layer by layer
with torch.no_grad():
    # Copy convolutional layers (same structure)
    modified_nn_deep.features.load_state_dict(nn_deep.features.state_dict())

    # Copy classifier layers manually: classifier[0] -> fc1, classifier[3] -> fc2
    modified_nn_deep.fc1.weight.copy_(nn_deep.classifier[0].weight)
    modified_nn_deep.fc1.bias.copy_(nn_deep.classifier[0].bias)

    modified_nn_deep.fc2.weight.copy_(nn_deep.classifier[3].weight)
    modified_nn_deep.fc2.bias.copy_(nn_deep.classifier[3].bias)


# Student, step 1: initialize the modified student model
modified_nn_light = ModifiedLightNNCosine(num_classes=10).to(device)

# Student, step 2: copy weights from LightNN to ModifiedLightNNCosine.
# NOTE(review): nn_light was already trained with cross-entropy earlier in this
# notebook, so this student is warm-started rather than beginning from the
# seed-42 initialization the other students use. Its accuracy is therefore not
# directly comparable with theirs. Confirm this is intended.
with torch.no_grad():
    # Copy convolutional layers (features)
    modified_nn_light.features.load_state_dict(nn_light.features.state_dict())

    # Copy classifier layers manually: classifier[0] -> fc1, classifier[3] -> fc2
    modified_nn_light.fc1.weight.copy_(nn_light.classifier[0].weight)
    modified_nn_light.fc1.bias.copy_(nn_light.classifier[0].bias)

    modified_nn_light.fc2.weight.copy_(nn_light.classifier[3].weight)
    modified_nn_light.fc2.bias.copy_(nn_light.classifier[3].bias)



# Initialize the regressor to project teacher features (512) to student feature size (256)
regressor = TeacherRegressor(in_dim=512, out_dim=256).to(device)

# Train student with FSP-based relation distillation
train_relation_distillation(
    teacher=modified_nn_deep,
    student=modified_nn_light,
    regressor=regressor,
    train_loader=train_loader,
    epochs=10,
    learning_rate=0.001,
    alpha=0.5,      # weight for FSP loss
    ce_weight=0.5,  # weight for cross-entropy
    device=device
)

# Test student. `test` here is the most recent definition in this file, the one
# that unpacks a (logits, features) tuple.
test_accuracy_relation_kd = test(modified_nn_light, test_loader, device)
print(f"Student Accuracy with FSP-based KD: {test_accuracy_relation_kd:.2f}%")


Epoch [1/10], Loss: 0.7992
Epoch [2/10], Loss: 0.1290
Epoch [3/10], Loss: 0.1062
Epoch [4/10], Loss: 0.0925
Epoch [5/10], Loss: 0.0823
Epoch [6/10], Loss: 0.0745
Epoch [7/10], Loss: 0.0677
Epoch [8/10], Loss: 0.0621
Epoch [9/10], Loss: 0.0564
Epoch [10/10], Loss: 0.0532
Test Accuracy: 91.66%
Student Accuracy with FSP-based KD: 91.66%
