In [1]:
import os
# Restrict this kernel to GPU 0. This has to be set through os.environ:
# an IPython `!export ...` line runs in a throwaway subshell, so the variable
# would never reach the Python process that later initialises CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import distiller
import sys
import time
import torch
import torchvision
import vgg
from LiveTune import liveVar
from torch import nn as nn
from torch.nn import functional as F
from torch import optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import models
from utils import *



Extension horovod.torch has not been built: /home/shariff/.local/lib/python3.8/site-packages/horovod/torch/mpi_lib_v2.cpython-39-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.


In [3]:
def checkAccuracy(model, device, testloader):
    """Compute and print the top-1 accuracy (in percent) of `model` on `testloader`.

    Args:
        model: a ``torch.nn.Module`` whose output has the class scores along
            dimension 1.
        device: device the input and label tensors are moved to before the
            forward pass.
        testloader: iterable of batches; item 0 of each batch is the input
            tensor and item 1 the integer label tensor.

    Returns:
        float: percentage of correctly classified samples, or 0.0 when
        `testloader` yields no samples.

    The model is evaluated in eval mode and then put back into whichever mode
    (train or eval) it was in when the function was called, even if the
    evaluation raises.
    """
    was_training = model.training
    correct = 0
    total = 0
    model.eval()
    try:
        with torch.no_grad():
            for data in testloader:
                images, labels = data[0].to(device), data[1].to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        # Restore the caller's mode instead of forcing train mode, so a frozen
        # eval-mode model is not silently switched back to training.
        model.train(was_training)

    if total == 0:
        # An empty loader would otherwise divide by zero below.
        print('No test samples were provided; reporting an accuracy of 0.')
        return 0.0

    accuracy = 100.0 * correct / total
    print('Accuracy of the network on the test images: %.2f %%' % (
        accuracy))
    return accuracy



In [4]:
def trainModelKD(model, modelLocation, teacher, device, trainloader, testloader, alpha, T, opt, startEpoch, totalEpochs, accuracy = 0):
    """Train the student `model` with knowledge distillation from `teacher`.

    The current weights are written to `modelLocation` up front, overwritten
    whenever the test accuracy improves, and reloaded into `model` once
    training has finished.

    Args:
        model: student network being trained.
        modelLocation: path of the checkpoint file for the student state dict.
        teacher: teacher network handed to the distiller KD policy.
        device: device the batches are moved to.
        trainloader: iterable of training batches (item 0 inputs, item 1
            integer class labels).
        testloader: iterable of evaluation batches, passed to `checkAccuracy`.
        alpha: mixing factor; the loss weights are built from `alpha*T*T`,
            `1-alpha` and `0.0`.
        T: temperature passed to the KD policy.
        opt: optimizer factory, called as
            `opt(params, lr=..., weight_decay=5e-4)`.
        startEpoch: first epoch index (inclusive).
        totalEpochs: last epoch index (exclusive).
        accuracy: accuracy of the incoming weights; a checkpoint is only
            overwritten when an epoch beats the best value seen so far.

    Returns:
        None.

    Note:
        The learning rate is read from the module-level callable
        `learning_rate` at the start of every epoch, so it must be defined
        before this function is called.
    """
    # Cross-entropy on integer labels is the same quantity as the batch-mean KL
    # divergence against one-hot targets (zero target entries contribute
    # nothing), and it needs neither a fixed class count nor a one-hot matrix.
    criterion = nn.CrossEntropyLoss()
    # NOTE(review): the three weights are presumably (distillation, student,
    # teacher) in that order - confirm against distiller.DistillationLossWeights.
    dlw = distiller.DistillationLossWeights(alpha*T*T, 1-alpha, 0.0)
    kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, T, dlw)
    kd_policy.active = True
    bestAccuracy = accuracy
    bestEpoch = startEpoch
    # Guarantees a checkpoint exists for the reload after the training loop.
    torch.save(model.state_dict(), modelLocation)
    print(f"model has {get_num_parameters(model, True)/1e6:.2f} M parameters")

    # Build the optimizer once so any internal state (momentum, Adam moments)
    # survives across epochs; only the learning rate is refreshed per epoch.
    optimizer = opt(model.parameters(), lr=learning_rate(), weight_decay=5e-4)
    for epoch in range(startEpoch, totalEpochs):
        start = time.time()
        current_lr = learning_rate()
        for group in optimizer.param_groups:
            group['lr'] = current_lr
        # Do not rely on a previous evaluation call to leave the model in
        # train mode.
        model.train()
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            output = kd_policy.forward(inputs)
            loss = criterion(output, labels)
            # The KD policy combines the student loss with its distillation
            # term; only the combined value is back-propagated.
            loss = kd_policy.before_backward_pass(model, epoch, None, None, loss, None).overall_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if i % 50 == 49:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 50))
                running_loss = 0.0
        accuracy = checkAccuracy(model, device, testloader)
        if accuracy > bestAccuracy:
            torch.save(model.state_dict(), modelLocation)
            bestAccuracy = accuracy
            bestEpoch = epoch+1
        print('Best Accuracy of', bestAccuracy,'at epoch',bestEpoch)
        end = time.time()
        print("time taken:", end-start)

    print('Finished Training Student.')
    try:
        model.load_state_dict(torch.load(modelLocation, map_location=device))
    except Exception as err:
        # Best effort: keep the in-memory weights, but report the failure
        # rather than hiding it.
        print("Could not reload the best checkpoint from", modelLocation, "-", str(err))
    

In [5]:
# ---- Experiment configuration ------------------------------------------------
# Architecture name; also used as the directory that holds the checkpoints.
vgg_type = "VGG19"
if not os.path.isdir(vgg_type):
    os.mkdir(vgg_type)

# Checkpoint files for the teacher and the distilled student.
teacher_model_path = os.path.join(vgg_type, "base_model.pt")
student_model_path = os.path.join(vgg_type, "student_model.pt")

# Class indices 0..999.
classes = list(range(1000))

# Training hyper-parameters. `learning_rate` is a liveVar and is read by
# calling it (`learning_rate()`).
learning_rate = liveVar(0.001, 'learning_rate')
opt = optim.SGD
totalEpochs = 1000
startEpoch = 0
batchSize = 256

# Prefer the first CUDA device and fall back to the CPU when none is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("CUDA Device Detected. GPU will be used.")
else:
    print("No CUDA supported GPU detected. CPU will be used.")


Port number for liveVar dictionary: 51503
CUDA Device Detected. GPU will be used.


In [6]:
# Build the ImageNet loaders once, then expose the 'train' and 'val' entries
# under the names the training and evaluation cells use.
print("Dataset- ImageNet")
dataloader = get_dataloader("imagenet", batch_size=batchSize)
trainloader, testloader = dataloader['train'], dataloader['val']

Dataset- ImageNet



KeyboardInterrupt



In [None]:
# Pretrained torchvision constructor for each supported `vgg_type`.
teacher_builders = {
    'VGG11': models.vgg11_bn,
    'VGG13': models.vgg13_bn,
    'VGG16': models.vgg16_bn,
    'VGG19': models.vgg19_bn,
}

base_model = None
if vgg_type in teacher_builders:
    # Place the model on the selected `device` instead of calling .cuda(), so
    # the CPU fallback chosen in the configuration cell keeps working.
    base_model = teacher_builders[vgg_type](pretrained=True).to(device)
    # Persist the pretrained weights as the teacher checkpoint.
    torch.save(base_model.state_dict(), teacher_model_path)
else:
    # base_model stays None; report which value was rejected and what is valid.
    print(f"ERROR: unsupported vgg_type {vgg_type!r}; expected one of {sorted(teacher_builders)}")


In [None]:
# Distillation hyper-parameters passed to trainModelKD.
alpha = 0.5
T = 2.0

# ---- Student -----------------------------------------------------------------
student_model = vgg.VGG(vgg_type)
if student_model is None:
    # Stop here with a clear error rather than failing on `None.to(...)` below.
    raise ValueError('Invalid Model Type. Aborting.')
student_model = student_model.to(device)
try:
    # Resume from an earlier checkpoint when one exists; map_location lets a
    # checkpoint saved on another device load onto the current one.
    student_model.load_state_dict(torch.load(student_model_path, map_location=device))
    accuracy = checkAccuracy(student_model, device, testloader)
except Exception as e:
    # No usable checkpoint: start from the fresh weights with baseline 0.
    print("Exception loading student model:", str(e))
    accuracy = 0

# ---- Teacher -----------------------------------------------------------------
teacher = base_model
if teacher is None:
    raise ValueError('Invalid Model Type for Teacher. Aborting.')
teacher = teacher.to(device)
try:
    teacher.load_state_dict(torch.load(teacher_model_path, map_location=device))
except Exception as e:
    # Execution continues: `teacher` still holds whatever weights base_model
    # had, so say that instead of announcing an abort that never happens.
    print("Exception loading teacher model:", str(e))
    print('Could not load the saved Teacher weights; keeping the weights already in base_model.')


In [None]:
# Evaluate the teacher before distillation starts.
# NOTE(review): `evaluate_model` is not defined in this notebook - presumably it
# comes from `from utils import *`; confirm there what it prints/returns.
evaluate_model(teacher, dataloader=dataloader, device=device)

In [None]:
# Freeze the teacher: turn off gradient tracking for all of its parameters and
# put it in eval mode, then distil it into the student.
teacher.requires_grad_(False)
teacher.eval()
trainModelKD(
    student_model, student_model_path, teacher, device,
    trainloader, testloader,
    alpha, T, opt,
    startEpoch, totalEpochs, accuracy,
)


In [None]:
# Evaluate the student after training, with the same helper and loaders that
# were used for the teacher.
# NOTE(review): `evaluate_model` is defined outside this notebook (presumably in
# utils) - confirm which split of `dataloader` it uses.
evaluate_model(student_model, dataloader=dataloader, device=device)