In [1]:
!pip install pandas

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import torch

print("PyTorch version:", torch.__version__)


PyTorch version: 2.1.0+cu118


In [3]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, random_split
import torch.nn.functional as F
import torchvision.models as models
import time

In [4]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
print(device)

cuda


In [6]:
# Data augmentation for training
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),  # Randomly flip the image horizontally
    transforms.RandomCrop(32, padding=4),  # Randomly crop the image
    transforms.Resize(224),  # Resize to 224x224 for ResNet
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# No augmentation for validation and test
val_test_transform = transforms.Compose([
    transforms.Resize(224),  # Resize to 224x224 for ResNet
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load CIFAR-10 dataset
train_val_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=val_test_transform)

# Split train_val_dataset into train and validation sets (80% train, 20% validation)
train_size = int(0.8 * len(train_val_dataset))
val_size = len(train_val_dataset) - train_size
train_dataset, val_dataset = random_split(train_val_dataset, [train_size, val_size])

# Apply val_test_transform to the validation set
val_dataset.dataset.transform = val_test_transform

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# Load pretrained ResNet-50 (Teacher Model)
teacher = models.resnet50(pretrained=True)

# Modify the final fully connected layer for 10 classes (CIFAR-10)
teacher.fc = nn.Linear(teacher.fc.in_features, 10)
# Move models to device
teacher = teacher.to(device)



In [8]:

model_path = 'Best_Teacher.pth'
# Load the model weights
teacher.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [9]:
# Load pretrained ResNet-18 (Student Model)
student = models.resnet18(pretrained=True)
# Modify the final fully connected layer for 10 classes (CIFAR-10)
student.fc = nn.Linear(student.fc.in_features, 10)
student = student.to(device)



In [10]:

model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [11]:
# Logits normalization function
def normalize(logit):
    mean = logit.mean(dim=-1, keepdim=True)
    stdv = logit.std(dim=-1, keepdim=True)
    return (logit - mean) / (1e-7 + stdv)


In [12]:
# CA-KLD Loss for Classification
def cakld_loss(student_logits, teacher_logits, beta_prob):
    # Forward KL (student || teacher)
    student_log_prob = F.log_softmax(student_logits, dim=1)
    teacher_prob = F.softmax(teacher_logits, dim=1)
    forward_kl = F.kl_div(student_log_prob, teacher_prob, reduction='batchmean')

    # Reverse KL (teacher || student)
    teacher_log_prob = F.log_softmax(teacher_logits, dim=1)
    student_prob = F.softmax(student_logits, dim=1)
    reverse_kl = F.kl_div(teacher_log_prob, student_prob, reduction='batchmean')

    # Combined KL loss
    kl_loss = beta_prob * reverse_kl + (1 - beta_prob) * forward_kl
    return kl_loss


In [13]:
def evaluate(model, test_loader, device):
    model = model.to(device)  # Ensure model is on the correct device
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total


In [14]:
def calculate_sparsity(model):
    total_zeros = 0
    total_params = 0
    for name, param in model.named_parameters():
        if 'weight' in name:
            total_zeros += torch.sum(param == 0).item()
            total_params += param.numel()
    return total_zeros / total_params

In [15]:
import torch
import time
def measure_inference_time(model, test_loader, num_runs=5):
    device = torch.device('cpu')
    model.eval()
    model.to(device)

    # Warm-up (one batch to avoid startup cost)
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device)
            _ = model(inputs)
            break

    total_time = 0
    total_images = 0

    with torch.no_grad():
        for _ in range(num_runs):
            for inputs, _ in test_loader:
                inputs = inputs.to(device)
                batch_size = inputs.size(0)
                start_time = time.time()
                _ = model(inputs)
                end_time = time.time()

                total_time += (end_time - start_time)
                total_images += batch_size

    avg_time_per_image = total_time / total_images
    return avg_time_per_image


In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

def calculate_model_size(model, filename="temp.pth"):
    torch.save(model.state_dict(), filename)
    size = os.path.getsize(filename) / (1024 * 1024)  # Size in MB
    os.remove(filename)
    return size

def compare_model_sizes(teacher, student, pruned_student):
    # Count parameters
    teacher_params = count_parameters(teacher)
    student_params = count_parameters(student)
    pruned_params = count_parameters(pruned_student)
    
    # Calculate disk size
    teacher_size = calculate_model_size(teacher, "teacher.pth")
    student_size = calculate_model_size(student, "student.pth")
    pruned_size = calculate_model_size(pruned_student, "pruned_student.pth")
    
    # Print comparison
    print("\n--- Model Size Comparison ---")
    print(f"Teacher Model: {teacher_params} parameters, {teacher_size:.2f} MB")
    print(f"Student Model (Before Pruning): {student_params} parameters, {student_size:.2f} MB")
    print(f"Student Model (After Pruning): {pruned_params} parameters, {pruned_size:.2f} MB")
    
    # Calculate compression ratio
    compression_ratio = student_size / pruned_size
    print(f"\nCompression Ratio: {compression_ratio:.2f}x")

In [17]:
def train_model(model, train_loader, val_loader, epochs=10, lr=0.001, patience=3):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    
    best_val_accuracy = 0.0
    best_model_state = None
    patience_counter = 0  # Counter for early stopping
    
    for epoch in range(epochs):
        print(epoch)
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        # Evaluate on the validation set
        val_accuracy = evaluate(model, val_loader, device)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {running_loss/len(train_loader):.4f} | Val Accuracy: {val_accuracy:.2f}%")
        
        # Early stopping logic
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = model.state_dict()
            patience_counter = 0  # Reset patience counter
            torch.save(model.state_dict(), 'best_teacher_model.pth')  # Save the best model
            print(f" New best model saved with validation accuracy: {best_val_accuracy:.2f}%")
        else:
            patience_counter += 1
            print(f" No improvement in validation accuracy ({patience_counter}/{patience})")
            
            # Stop training if no improvement for 'patience' epochs
            if patience_counter >= patience:
                print(f"\nEarly stopping triggered! No improvement for {patience} epochs.")
                break
    
    # Load the best model state
    model.load_state_dict(torch.load('best_teacher_model.pth'))
    print("\nLoading the best model for final evaluation.")
    
    # Evaluate on the test set
    test_accuracy = evaluate(model, test_loader, device)
    print(f"Test Accuracy with Best Model: {test_accuracy:.2f}%")
    
    return model



In [18]:
def compute_gradient_importance(
    teacher, student, data_loader, device, temperature=4.0, alpha=0.5, beta_prob=0.5, accumulation_epochs=3
):
    importance_scores = {}

    # Initialize importance score storage for conv layer weights only
    for name, param in student.named_parameters():
        if 'weight' in name and len(param.shape) == 4:  # Conv weights only
            importance_scores[name] = torch.zeros_like(param.data, device=device)

    teacher.to(device).eval()
    student.to(device).train()

    # Add momentum for gradient accumulation smoothing
    momentum = 0.9  # Controls exponential moving average
    accumulated_batches = 0  # Track for bias correction

    for epoch in range(accumulation_epochs):
        print(f"Accumulation Epoch {epoch+1}/{accumulation_epochs}")
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            student.zero_grad()

            with torch.no_grad():
                teacher_logits = teacher(inputs)

            student_logits = student(inputs)

            # Temperature scaling
            student_logits_temp = student_logits / temperature
            teacher_logits_temp = teacher_logits / temperature


            # Compute losses
            distillation_loss = cakld_loss(student_logits_temp, teacher_logits_temp, beta_prob) * (temperature ** 2)
            ce_loss = F.cross_entropy(student_logits, labels)
            loss = alpha * distillation_loss + (1 - alpha) * ce_loss

            # Modified backward propagation
            loss.backward()

            # Accumulate importance scores with parameter-gradient product
            accumulated_batches += 1
            for name, param in student.named_parameters():
                if name in importance_scores and param.grad is not None:
                    # Key modification: Use parameter-gradient product magnitude
                    grad_product = (param.data * param.grad).abs_()
                    
                    # Exponential moving average with bias correction
                    if accumulated_batches == 1:
                        importance_scores[name] = grad_product
                    else:
                        importance_scores[name] = momentum * importance_scores[name] + (1 - momentum) * grad_product

    # Apply bias correction for EMA
    for name in importance_scores:
        importance_scores[name] /= (1 - momentum**accumulated_batches)

    return importance_scores

In [19]:
def gradient_based_global_prune(model, importance_scores, prune_ratio=0.95):
    all_scores = torch.cat([score.flatten() for score in importance_scores.values()])
    threshold = torch.topk(all_scores, k=int(prune_ratio * all_scores.numel()), largest=False)[0][-1]

    for name, param in model.named_parameters():
        if name in importance_scores:
            mask = (importance_scores[name] > threshold).float()
            param.data.mul_(mask)

    return model


In [20]:
import torch
import torch.nn.functional as F
import torch.optim as optim

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def retrain_with_sparsity(student, train_loader, val_loader, epochs=5, save_path="retrained_student_model.pt", patience=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = optim.SGD(student.parameters(), lr=0.01, momentum=0.9)

    # 1. Store masks AND zero momentum buffers for pruned weights
    masks = {}
    for name, param in student.named_parameters():
        if 'weight' in name and param.dim() == 4:  # Consider only conv layers
            mask = (param != 0).float().to(device)
            masks[name] = mask
            # Zero momentum buffers for pruned weights
            if optimizer.state.get(param, None) and 'momentum_buffer' in optimizer.state[param]:
                optimizer.state[param]['momentum_buffer'] *= mask

    student = student.to(device)
    best_val_acc = 0.0
    best_model = None
    patience_counter = 0  # Counter for early stopping

    # 2. Add gradient clipping to prevent NaN
    max_grad_norm = 1.0

    for epoch in range(epochs):
        student.train()
        total_loss = 0.0
        correct, total = 0, 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = student(inputs)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()

            # Apply masks to gradients
            for name, param in student.named_parameters():
                if name in masks:
                    param.grad.data *= masks[name]

            # Gradient clipping before optimizer step
            torch.nn.utils.clip_grad_norm_(student.parameters(), max_grad_norm)

            optimizer.step()

            # Reapply masks and update momentum buffers
            for name, param in student.named_parameters():
                if name in masks:
                    param.data *= masks[name]
                    if optimizer.state.get(param, None) and 'momentum_buffer' in optimizer.state[param]:
                        optimizer.state[param]['momentum_buffer'] *= masks[name]

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

        train_loss = total_loss / len(train_loader)
        train_acc = 100.0 * correct / total

        # Validation phase
        student.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = student(inputs)
                loss = F.cross_entropy(outputs, labels)

                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_correct += predicted.eq(labels).sum().item()
                val_total += labels.size(0)

        val_loss /= len(val_loader)
        val_acc = 100.0 * val_correct / val_total

        # Track best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = student.state_dict()
            torch.save(best_model, save_path)
            patience_counter = 0  # Reset patience counter
            print(f"New best model saved with Val Accuracy: {best_val_acc:.2f}%")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}. No improvement for {patience} epochs.")
                break  # Stop training

        # Print results
        sparsity = calculate_sparsity(student)
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Validation Loss: {val_loss:.4f} | Validation Acc: {val_acc:.2f}% | Sparsity: {sparsity*100:.2f}%\n")

    print(f"Best Validation Accuracy: {best_val_acc:.2f}% | Best Model Saved at: {save_path}")
    return student

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time

# KD training with CA-KLD loss and mask-based momentum handling
def retrain_with_KD(teacher, student, train_loader, val_loader, epochs=50,
                    temperature=5.0, alpha=0.5, beta_prob=0.5, patience=5,
                    save_path="student_before_pruning.pth"):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = optim.SGD(student.parameters(), lr=0.01, momentum=0.9)

    # 1. Store masks and zero momentum buffers
    masks = {}
    for name, param in student.named_parameters():
        if 'weight' in name and param.dim() == 4:
            mask = (param != 0).float().to(device)
            masks[name] = mask
            if optimizer.state.get(param, None) and 'momentum_buffer' in optimizer.state[param]:
                optimizer.state[param]['momentum_buffer'] *= mask

    teacher = teacher.to(device).eval()
    student = student.to(device)

    best_val_acc = 0.0
    best_model_state = None
    patience_counter = 0
    start_time = time.time()

    for epoch in range(epochs):
        student.train()
        total_loss, correct, total = 0.0, 0, 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            with torch.no_grad():
                teacher_logits = teacher(inputs)

            student_logits = student(inputs)

            # Apply temperature
            teacher_logits_temp = teacher_logits / temperature
            student_logits_temp = student_logits / temperature

            # Logits normalization
            teacher_logits_temp = normalize(teacher_logits_temp)
            student_logits_temp = normalize(student_logits_temp)


            # CA-KLD loss
            kd_loss = cakld_loss(student_logits_temp, teacher_logits_temp, beta_prob) * (temperature ** 2)
            ce_loss = F.cross_entropy(student_logits, labels)

            loss = alpha * kd_loss + (1 - alpha) * ce_loss
            loss.backward()
            optimizer.step()

            # Reapply masks and update momentum
            for name, param in student.named_parameters():
                if name in masks:
                    param.data *= masks[name]
                    if optimizer.state.get(param, None) and 'momentum_buffer' in optimizer.state[param]:
                        optimizer.state[param]['momentum_buffer'] *= masks[name]

            total_loss += loss.item()
            _, predicted = student_logits.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

        train_loss = total_loss / len(train_loader)
        train_acc = 100.0 * correct / total

        # Validation
        student.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = student(inputs)
                loss = F.cross_entropy(outputs, labels)
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_correct += predicted.eq(labels).sum().item()
                val_total += labels.size(0)

        val_loss /= len(val_loader)
        val_acc = 100.0 * val_correct / val_total
        sparsity = calculate_sparsity(student) * 100.0  # Assuming this function is defined elsewhere

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
              f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | Sparsity: {sparsity:.2f}%")

        # Early stopping logic
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = student.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}. No improvement for {patience} epochs.")
                break

    # Restore and save best model
    student.load_state_dict(best_model_state)
    torch.save(student.state_dict(), save_path)
    print(f"Student model saved before pruning at: {save_path}")
    total_time = time.time() - start_time
    print(f"Total Training Time: {total_time // 60:.0f}m {total_time % 60:.0f}s")

    return student

In [22]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Training function with KD + CA-KLD and logits normalization
def train_kd_pruning(teacher, student, train_loader, val_loader, epochs=50, temperature=5.0, alpha=0.5,
                     beta_prob=0.5, patience=5, save_path="student_before_pruning.pth"):
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = optim.SGD(student.parameters(), lr=0.01, momentum=0.9)

    teacher = teacher.to(device)
    student = student.to(device)
    teacher.eval()  # Freeze teacher

    best_val_acc = 0.0
    best_model_state = None
    patience_counter = 0
    start_time = time.time()

    for epoch in range(epochs):
        student.train()
        total_loss = 0.0
        correct, total = 0, 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.no_grad():
                teacher_logits = teacher(inputs)

            student_logits = student(inputs)

            # Temperature scaling
            teacher_logits_temp = teacher_logits / temperature
            student_logits_temp = student_logits / temperature

            # Logits normalization
            teacher_logits_temp = normalize(teacher_logits_temp)
            student_logits_temp = normalize(student_logits_temp)

            # CA-KLD loss (normalized logits)
            distillation_loss = cakld_loss(student_logits_temp, teacher_logits_temp, beta_prob) * (temperature ** 2)

            # Cross-entropy loss
            ground_truth_loss = F.cross_entropy(student_logits, labels)

            # Combined loss
            loss = alpha * distillation_loss + (1 - alpha) * ground_truth_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = student_logits.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

        train_loss = total_loss / len(train_loader)
        train_acc = 100.0 * correct / total

        # Validation accuracy
        val_acc = evaluate(student, val_loader, device)

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | "
              f"Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")

        # Early stopping
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = student.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}. No improvement for {patience} epochs.")
                break

    # Load best model state and save
    student.load_state_dict(best_model_state)
    torch.save(student.state_dict(), save_path)
    print(f"Student model saved before pruning at: {save_path}")

    total_time = time.time() - start_time
    print(f"Total Training Time: {total_time // 60:.0f}m {total_time % 60:.0f}s")

    return student

## 95% Sparsity

In [23]:

model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [24]:
# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=3.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.9508)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)


Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 29s
Pruning the model
Total Time take to prune the model scores: 0m 0s


In [25]:
# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")

Sparsity: 95.00%


In [26]:

start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=3.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 0.6680 | Train Acc: 91.05% | Val Loss: 0.2156 | Val Acc: 93.42% | Sparsity: 95.00%
Epoch 2/50 | Train Loss: 0.3079 | Train Acc: 96.12% | Val Loss: 0.1725 | Val Acc: 94.64% | Sparsity: 95.00%
Epoch 3/50 | Train Loss: 0.2172 | Train Acc: 97.70% | Val Loss: 0.1779 | Val Acc: 94.52% | Sparsity: 95.00%
Epoch 4/50 | Train Loss: 0.1719 | Train Acc: 98.44% | Val Loss: 0.1378 | Val Acc: 95.55% | Sparsity: 95.00%
Epoch 5/50 | Train Loss: 0.1452 | Train Acc: 98.81% | Val Loss: 0.1471 | Val Acc: 95.51% | Sparsity: 95.00%
Epoch 6/50 | Train Loss: 0.1294 | Train Acc: 98.97% | Val Loss: 0.1434 | Val Acc: 95.49% | Sparsity: 95.00%
Epoch 7/50 | Train Loss: 0.1177 | Train Acc: 99.19% | Val Loss: 0.1342 | Val Acc: 95.68% | Sparsity: 95.00%
Epoch 8/50 | Train Loss: 0.1090 | Train Acc: 99.22% | Val Loss: 0.1325 | Val Acc: 95.59% | Sparsity: 95.00%
Epoch 9/50 | Train Loss: 0.1049 | Train Acc: 99.19% | Val Loss: 0.1319 | Val Acc: 95.85% | Sparsity: 95.00%
Epoch 10/50 | Train Loss: 0.

In [27]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 94.09%


In [28]:

model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [29]:
# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=5.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.9508)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)


Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 28s
Pruning the model
Total Time take to prune the model scores: 0m 0s


In [30]:
# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")

Sparsity: 95.00%


In [None]:

start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=5.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 2.3030 | Train Acc: 87.75% | Val Loss: 0.5408 | Val Acc: 88.66% | Sparsity: 95.00%
Epoch 2/50 | Train Loss: 1.1570 | Train Acc: 93.82% | Val Loss: 0.4443 | Val Acc: 90.19% | Sparsity: 95.00%
Epoch 3/50 | Train Loss: 0.8297 | Train Acc: 95.83% | Val Loss: 0.3078 | Val Acc: 92.52% | Sparsity: 95.00%
Epoch 4/50 | Train Loss: 0.6606 | Train Acc: 96.88% | Val Loss: 0.2449 | Val Acc: 93.40% | Sparsity: 95.00%
Epoch 5/50 | Train Loss: 0.5237 | Train Acc: 97.78% | Val Loss: 0.2710 | Val Acc: 92.98% | Sparsity: 95.00%
Epoch 6/50 | Train Loss: 0.4255 | Train Acc: 98.42% | Val Loss: 0.2072 | Val Acc: 94.40% | Sparsity: 95.00%
Epoch 7/50 | Train Loss: 0.3572 | Train Acc: 98.78% | Val Loss: 0.1955 | Val Acc: 94.47% | Sparsity: 95.00%
Epoch 8/50 | Train Loss: 0.3228 | Train Acc: 98.81% | Val Loss: 0.1756 | Val Acc: 95.06% | Sparsity: 95.00%
Epoch 9/50 | Train Loss: 0.3016 | Train Acc: 99.02% | Val Loss: 0.1736 | Val Acc: 94.83% | Sparsity: 95.00%
Epoch 10/50 | Train Loss: 0.

In [57]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 95.88%


## 90% Sparsity

In [58]:
model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=3.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.9008)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)

# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")


Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 28s
Pruning the model
Total Time take to prune the model scores: 0m 0s
Sparsity: 90.00%


In [59]:
start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=3.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 0.3262 | Train Acc: 95.87% | Val Loss: 0.1165 | Val Acc: 96.43% | Sparsity: 90.00%
Epoch 2/50 | Train Loss: 0.1695 | Train Acc: 98.32% | Val Loss: 0.0930 | Val Acc: 97.19% | Sparsity: 90.00%
Epoch 3/50 | Train Loss: 0.1240 | Train Acc: 98.98% | Val Loss: 0.0776 | Val Acc: 97.63% | Sparsity: 90.00%
Epoch 4/50 | Train Loss: 0.1073 | Train Acc: 99.20% | Val Loss: 0.0784 | Val Acc: 97.53% | Sparsity: 90.00%
Epoch 5/50 | Train Loss: 0.0975 | Train Acc: 99.21% | Val Loss: 0.0801 | Val Acc: 97.51% | Sparsity: 90.00%
Epoch 6/50 | Train Loss: 0.0881 | Train Acc: 99.29% | Val Loss: 0.0755 | Val Acc: 97.67% | Sparsity: 90.00%
Epoch 7/50 | Train Loss: 0.0852 | Train Acc: 99.30% | Val Loss: 0.0753 | Val Acc: 97.68% | Sparsity: 90.00%
Epoch 8/50 | Train Loss: 0.0799 | Train Acc: 99.38% | Val Loss: 0.0761 | Val Acc: 97.64% | Sparsity: 90.00%
Epoch 9/50 | Train Loss: 0.0777 | Train Acc: 99.33% | Val Loss: 0.0731 | Val Acc: 97.77% | Sparsity: 90.00%
Epoch 10/50 | Train Loss: 0.

In [60]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 94.68%


In [70]:
model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=5.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.9008)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)

# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")

Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 28s
Pruning the model
Total Time take to prune the model scores: 0m 0s
Sparsity: 90.00%


In [71]:

start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=5.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 1.3378 | Train Acc: 92.73% | Val Loss: 0.2690 | Val Acc: 93.05% | Sparsity: 90.00%
Epoch 2/50 | Train Loss: 0.7029 | Train Acc: 96.57% | Val Loss: 0.1775 | Val Acc: 94.99% | Sparsity: 90.00%
Epoch 3/50 | Train Loss: 0.4881 | Train Acc: 97.92% | Val Loss: 0.1637 | Val Acc: 95.56% | Sparsity: 90.00%
Epoch 4/50 | Train Loss: 0.3708 | Train Acc: 98.60% | Val Loss: 0.1260 | Val Acc: 96.32% | Sparsity: 90.00%
Epoch 5/50 | Train Loss: 0.2967 | Train Acc: 98.97% | Val Loss: 0.1204 | Val Acc: 96.33% | Sparsity: 90.00%
Epoch 6/50 | Train Loss: 0.2589 | Train Acc: 99.06% | Val Loss: 0.1070 | Val Acc: 96.83% | Sparsity: 90.00%
Epoch 7/50 | Train Loss: 0.2348 | Train Acc: 99.07% | Val Loss: 0.1014 | Val Acc: 96.82% | Sparsity: 90.00%
Epoch 8/50 | Train Loss: 0.2183 | Train Acc: 99.11% | Val Loss: 0.1056 | Val Acc: 96.78% | Sparsity: 90.00%
Epoch 9/50 | Train Loss: 0.2065 | Train Acc: 99.09% | Val Loss: 0.0995 | Val Acc: 96.87% | Sparsity: 90.00%
Epoch 10/50 | Train Loss: 0.

In [72]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 94.83%


## 79% Sparsity

In [73]:
model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=3.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.7907)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)

# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")


Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 28s
Pruning the model
Total Time take to prune the model scores: 0m 0s
Sparsity: 79.00%


In [74]:
start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=3.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 0.1645 | Train Acc: 98.17% | Val Loss: 0.0591 | Val Acc: 98.33% | Sparsity: 79.00%
Epoch 2/50 | Train Loss: 0.1029 | Train Acc: 99.00% | Val Loss: 0.0507 | Val Acc: 98.44% | Sparsity: 79.00%
Epoch 3/50 | Train Loss: 0.0830 | Train Acc: 99.30% | Val Loss: 0.0477 | Val Acc: 98.63% | Sparsity: 79.00%
Epoch 4/50 | Train Loss: 0.0745 | Train Acc: 99.32% | Val Loss: 0.0457 | Val Acc: 98.58% | Sparsity: 79.00%
Epoch 5/50 | Train Loss: 0.0690 | Train Acc: 99.40% | Val Loss: 0.0457 | Val Acc: 98.65% | Sparsity: 79.00%
Epoch 6/50 | Train Loss: 0.0644 | Train Acc: 99.38% | Val Loss: 0.0467 | Val Acc: 98.62% | Sparsity: 79.00%
Epoch 7/50 | Train Loss: 0.0638 | Train Acc: 99.37% | Val Loss: 0.0454 | Val Acc: 98.67% | Sparsity: 79.00%
Epoch 8/50 | Train Loss: 0.0601 | Train Acc: 99.38% | Val Loss: 0.0464 | Val Acc: 98.66% | Sparsity: 79.00%
Epoch 9/50 | Train Loss: 0.0583 | Train Acc: 99.40% | Val Loss: 0.0462 | Val Acc: 98.72% | Sparsity: 79.00%
Epoch 10/50 | Train Loss: 0.

In [75]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 95.68%


In [76]:
model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=5.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.7907)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)

# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")


Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 28s
Pruning the model
Total Time take to prune the model scores: 0m 0s
Sparsity: 79.00%


In [77]:

start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=5.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 0.6436 | Train Acc: 96.80% | Val Loss: 0.1194 | Val Acc: 96.77% | Sparsity: 79.00%
Epoch 2/50 | Train Loss: 0.3933 | Train Acc: 98.37% | Val Loss: 0.0922 | Val Acc: 97.27% | Sparsity: 79.00%
Epoch 3/50 | Train Loss: 0.2880 | Train Acc: 98.88% | Val Loss: 0.0750 | Val Acc: 97.75% | Sparsity: 79.00%
Epoch 4/50 | Train Loss: 0.2278 | Train Acc: 99.08% | Val Loss: 0.0632 | Val Acc: 98.08% | Sparsity: 79.00%
Epoch 5/50 | Train Loss: 0.1989 | Train Acc: 99.14% | Val Loss: 0.0607 | Val Acc: 98.02% | Sparsity: 79.00%
Epoch 6/50 | Train Loss: 0.1831 | Train Acc: 99.14% | Val Loss: 0.0594 | Val Acc: 98.22% | Sparsity: 79.00%
Epoch 7/50 | Train Loss: 0.1708 | Train Acc: 99.15% | Val Loss: 0.0570 | Val Acc: 98.27% | Sparsity: 79.00%
Epoch 8/50 | Train Loss: 0.1612 | Train Acc: 99.11% | Val Loss: 0.0580 | Val Acc: 98.36% | Sparsity: 79.00%
Epoch 9/50 | Train Loss: 0.1528 | Train Acc: 99.16% | Val Loss: 0.0529 | Val Acc: 98.44% | Sparsity: 79.00%
Epoch 10/50 | Train Loss: 0.

In [78]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 95.52%


## 59% Sparsity

In [79]:
model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=3.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.5905)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)

# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")


Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 30s
Pruning the model
Total Time take to prune the model scores: 0m 0s
Sparsity: 59.00%


In [80]:
start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=3.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 0.1113 | Train Acc: 98.66% | Val Loss: 0.0461 | Val Acc: 98.64% | Sparsity: 59.00%
Epoch 2/50 | Train Loss: 0.0738 | Train Acc: 99.22% | Val Loss: 0.0401 | Val Acc: 98.87% | Sparsity: 59.00%
Epoch 3/50 | Train Loss: 0.0610 | Train Acc: 99.40% | Val Loss: 0.0398 | Val Acc: 98.84% | Sparsity: 59.00%
Epoch 4/50 | Train Loss: 0.0569 | Train Acc: 99.36% | Val Loss: 0.0385 | Val Acc: 98.90% | Sparsity: 59.00%
Epoch 5/50 | Train Loss: 0.0532 | Train Acc: 99.43% | Val Loss: 0.0380 | Val Acc: 98.84% | Sparsity: 59.00%
Epoch 6/50 | Train Loss: 0.0511 | Train Acc: 99.43% | Val Loss: 0.0385 | Val Acc: 98.82% | Sparsity: 59.00%
Epoch 7/50 | Train Loss: 0.0497 | Train Acc: 99.42% | Val Loss: 0.0396 | Val Acc: 98.82% | Sparsity: 59.00%
Epoch 8/50 | Train Loss: 0.0472 | Train Acc: 99.41% | Val Loss: 0.0380 | Val Acc: 98.83% | Sparsity: 59.00%
Epoch 9/50 | Train Loss: 0.0469 | Train Acc: 99.44% | Val Loss: 0.0376 | Val Acc: 98.86% | Sparsity: 59.00%
Early stopping triggered at 

In [81]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 95.84%


In [82]:
model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=5.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.5905)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)

# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")

Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 28s
Pruning the model
Total Time take to prune the model scores: 0m 0s
Sparsity: 59.00%


In [83]:

start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=5.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 0.3811 | Train Acc: 98.25% | Val Loss: 0.0721 | Val Acc: 98.08% | Sparsity: 59.00%
Epoch 2/50 | Train Loss: 0.2529 | Train Acc: 98.89% | Val Loss: 0.0452 | Val Acc: 98.69% | Sparsity: 59.00%
Epoch 3/50 | Train Loss: 0.1925 | Train Acc: 99.10% | Val Loss: 0.0468 | Val Acc: 98.67% | Sparsity: 59.00%
Epoch 4/50 | Train Loss: 0.1669 | Train Acc: 99.11% | Val Loss: 0.0466 | Val Acc: 98.71% | Sparsity: 59.00%
Epoch 5/50 | Train Loss: 0.1516 | Train Acc: 99.15% | Val Loss: 0.0449 | Val Acc: 98.67% | Sparsity: 59.00%
Epoch 6/50 | Train Loss: 0.1403 | Train Acc: 99.14% | Val Loss: 0.0411 | Val Acc: 98.87% | Sparsity: 59.00%
Epoch 7/50 | Train Loss: 0.1319 | Train Acc: 99.17% | Val Loss: 0.0433 | Val Acc: 98.72% | Sparsity: 59.00%
Epoch 8/50 | Train Loss: 0.1256 | Train Acc: 99.14% | Val Loss: 0.0415 | Val Acc: 98.83% | Sparsity: 59.00%
Epoch 9/50 | Train Loss: 0.1213 | Train Acc: 99.19% | Val Loss: 0.0401 | Val Acc: 98.80% | Sparsity: 59.00%
Epoch 10/50 | Train Loss: 0.

In [84]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 95.80%


## 36% Sparsity

In [85]:
model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=3.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.3603)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)

# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")


Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 29s
Pruning the model
Total Time take to prune the model scores: 0m 0s
Sparsity: 36.00%


In [90]:
start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=3.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 0.0591 | Train Acc: 99.16% | Val Loss: 0.0370 | Val Acc: 98.94% | Sparsity: 36.00%
Epoch 2/50 | Train Loss: 0.0507 | Train Acc: 99.38% | Val Loss: 0.0354 | Val Acc: 98.97% | Sparsity: 36.00%
Epoch 3/50 | Train Loss: 0.0478 | Train Acc: 99.38% | Val Loss: 0.0350 | Val Acc: 98.94% | Sparsity: 36.00%
Epoch 4/50 | Train Loss: 0.0452 | Train Acc: 99.45% | Val Loss: 0.0357 | Val Acc: 98.94% | Sparsity: 36.00%
Epoch 5/50 | Train Loss: 0.0430 | Train Acc: 99.42% | Val Loss: 0.0356 | Val Acc: 98.94% | Sparsity: 36.00%
Epoch 6/50 | Train Loss: 0.0416 | Train Acc: 99.48% | Val Loss: 0.0362 | Val Acc: 98.93% | Sparsity: 36.00%
Epoch 7/50 | Train Loss: 0.0410 | Train Acc: 99.42% | Val Loss: 0.0352 | Val Acc: 98.93% | Sparsity: 36.00%
Early stopping triggered at epoch 7. No improvement for 5 epochs.
Student model saved before pruning at: pruned_student_retrain_KD_90%.pth
Total Training Time: 6m 24s
Retraining completed in 6.40 minutes (383.97 seconds)


In [91]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 96.05%


In [92]:
model_path = 'student_before_pruning.pth'
# Load the model weights
student.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Pruning
print("Calculating Important Scores")
start_time = time.time()
importance_scores = compute_gradient_importance(
    teacher, student, train_loader, device, temperature=5.0, alpha=0.7,beta_prob=0.5, accumulation_epochs=3
)
total_time = time.time() - start_time
print(f"Total Time take to calculate Important scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")

print("Pruning the model")
start_time = time.time()
pruned_student = gradient_based_global_prune(student, importance_scores, prune_ratio=0.3603)
total_time = time.time() - start_time
print(f"Total Time take to prune the model scores: {total_time // 60:.0f}m {total_time % 60:.0f}s")
student = student.to(device)

# Calculate sparsity
sparsity = calculate_sparsity(pruned_student)
print(f"Sparsity: {sparsity * 100:.2f}%")


Calculating Important Scores
Accumulation Epoch 1/3
Accumulation Epoch 2/3
Accumulation Epoch 3/3
Total Time take to calculate Important scores: 2m 29s
Pruning the model
Total Time take to prune the model scores: 0m 0s
Sparsity: 36.00%


In [93]:

start_time = time.time()
pruned_student = retrain_with_KD(
    teacher, pruned_student, train_loader, val_loader,
    epochs=50, temperature=5.0, alpha=0.7, beta_prob=0.5,patience=5,save_path="pruned_student_retrain_KD_90%.pth"
)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Retraining completed in {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")

Epoch 1/50 | Train Loss: 0.3268 | Train Acc: 98.47% | Val Loss: 0.0557 | Val Acc: 98.28% | Sparsity: 36.00%
Epoch 2/50 | Train Loss: 0.2215 | Train Acc: 99.00% | Val Loss: 0.0457 | Val Acc: 98.68% | Sparsity: 36.00%
Epoch 3/50 | Train Loss: 0.1718 | Train Acc: 99.08% | Val Loss: 0.0408 | Val Acc: 98.79% | Sparsity: 36.00%
Epoch 4/50 | Train Loss: 0.1502 | Train Acc: 99.16% | Val Loss: 0.0413 | Val Acc: 98.86% | Sparsity: 36.00%
Epoch 5/50 | Train Loss: 0.1366 | Train Acc: 99.17% | Val Loss: 0.0399 | Val Acc: 98.89% | Sparsity: 36.00%
Epoch 6/50 | Train Loss: 0.1278 | Train Acc: 99.19% | Val Loss: 0.0377 | Val Acc: 98.87% | Sparsity: 36.00%
Epoch 7/50 | Train Loss: 0.1159 | Train Acc: 99.17% | Val Loss: 0.0375 | Val Acc: 98.86% | Sparsity: 36.00%
Epoch 8/50 | Train Loss: 0.1123 | Train Acc: 99.22% | Val Loss: 0.0381 | Val Acc: 98.93% | Sparsity: 36.00%
Epoch 9/50 | Train Loss: 0.1091 | Train Acc: 99.18% | Val Loss: 0.0395 | Val Acc: 98.89% | Sparsity: 36.00%
Epoch 10/50 | Train Loss: 0.

In [94]:
student_accuracy = evaluate(pruned_student, test_loader, device)
print(f"Pruned Student Model Test Accuracy(After Retrain): {student_accuracy:.2f}%")

Pruned Student Model Test Accuracy(After Retrain): 95.74%
