In [1]:
import os

# Restrict this kernel to GPU 1. The variable is set in the kernel's own
# environment, before `import torch` in a later cell. A `!export ...` shell
# line is deliberately not used: it only changes a short-lived subshell and
# never reaches the kernel process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
# !pip3 install distiller

In [3]:
import torch
import distiller
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn as nn
import torch.nn.functional as F
import time
from torch.utils.data import DataLoader
from LiveTune import liveVar


In [4]:
# !pip3 freeze|grep Dis

In [5]:
# classes = ('plane', 'car', 'bird', 'cat',
#                'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Class labels are the plain integer indices 0..999.
classes = list(range(1000))
# classes

In [6]:
def countParameters(net):
    """Return the total number of scalar entries over all parameters of ``net``.

    Every tensor yielded by ``net.parameters()`` contributes the product of
    its dimensions; a model without parameters yields 0.
    """
    return sum(tensor.numel() for tensor in net.parameters())

def checkAccuracy(model, device, testloader):
    """Compute and print the top-1 accuracy (in percent) of ``model``.

    Args:
        model: network to evaluate; switched to eval mode for the pass.
        device: device the batches are moved to before the forward pass.
        testloader: iterable of batches where ``batch[0]`` holds the inputs
            and ``batch[1]`` the integer class labels.

    Returns:
        Accuracy as a float in [0, 100]; 0.0 when ``testloader`` yields no
        samples (instead of raising ZeroDivisionError).

    Side effects:
        The model is always left in training mode, even if evaluation raises,
        because the training loops call this between epochs and rely on it.
    """
    correct = 0
    total = 0
    model.eval()
    try:
        with torch.no_grad():
            for data in testloader:
                images, labels = data[0].to(device), data[1].to(device)
                # Index of the largest logit per row is the predicted class.
                predicted = model(images).argmax(dim=1)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        model.train()
    accuracy = 100.0 * correct / total if total else 0.0
    print('Accuracy of the network on the test images: %.2f %%' % (
        accuracy))
    return accuracy



In [7]:
def testModel(net, device, testloader):
    global classes
    correct = 0
    total = 0
    net.eval()
    print('Parameters:', countParameters(net))
    t1 = time.time()
    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
    t2 = time.time()
    print('Top-1 Accuracy of the network on the 10000 test images: %.2f %%' % (
        100 * correct / total))
    print('Average Latency for 10000 test images:', (t2-t1)/10000,'seconds')
    
#     class_correct = list(0. for i in range(1000))
#     class_total = list(0. for i in range(1000))
#     with torch.no_grad():
#         for data in testloader:
#             images, labels = data[0].to(device), data[1].to(device)
#             outputs = net(images)
#             _, predicted = torch.max(outputs, 1)
#             c = (predicted == labels).squeeze()
#             for i in range(len(labels)):
#                 label = labels[i]
#                 class_correct[label] += c[i].item()
#                 class_total[label] += 1
    
#     for i in range(1000):
#         print('Accuracy of %5s : %2f %%' % (
#             classes[i], 100.0 * class_correct[i] / class_total[i]))



In [8]:
def trainModel(net, modelLocation, device, trainloader, testloader, opt, startEpoch, totalEpochs, accuracy = 0):
    """Train ``net`` with cross-entropy and keep the best-scoring checkpoint.

    Args:
        net: model to train (updated in place).
        modelLocation: path the state dict is saved to. It is written once up
            front and again whenever the epoch accuracy is >= the best so far.
        device: device the batches are moved to.
        trainloader: iterable of ``(inputs, labels)`` training batches.
        testloader: iterable of evaluation batches, passed to checkAccuracy
            after every epoch and to testModel at the end.
        opt: optimizer *class*. ``optim.SGD`` gets momentum 0.9 and a step
            schedule (lr 0.1 before epoch 150, 0.01 before epoch 250, 0.001
            afterwards); any other class runs at a constant lr of 0.001.
        startEpoch: first epoch index (0-based).
        totalEpochs: exclusive upper bound of the epoch range.
        accuracy: accuracy already reached by ``net``; the initial best score.

    The optimizer is built once and only its learning rate is updated per
    epoch, so momentum / moment buffers survive across epochs.
    """
    criterion = nn.CrossEntropyLoss()
    bestAccuracy = accuracy
    bestEpoch = startEpoch
    torch.save(net.state_dict(), modelLocation)
    use_step_schedule = opt is optim.SGD
    # Report the size of the network being trained (the `net` argument).
    print(f"model has {get_num_parameters(net, True)/1e6:.2f} M parameters")

    def lr_for_epoch(epoch):
        """Learning rate to use during ``epoch`` for the selected scheme."""
        if not use_step_schedule:
            return 0.001
        if epoch < 150:
            return 0.1
        if epoch < 250:
            return 0.01
        return 0.001

    if use_step_schedule:
        optimizer = opt(net.parameters(), lr=lr_for_epoch(startEpoch), momentum=0.9, weight_decay=5e-4)
    else:
        optimizer = opt(net.parameters(), lr=lr_for_epoch(startEpoch), weight_decay=5e-4)

    for epoch in range(startEpoch, totalEpochs):  # loop over the dataset multiple times
        for group in optimizer.param_groups:
            group['lr'] = lr_for_epoch(epoch)
        # Make sure training-only layers are active whatever mode the caller left the model in.
        net.train()
        running_loss = 0.0

        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

            if i % 100 == 99:    # print every 100 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0

        accuracy = checkAccuracy(net, device, testloader)
        if accuracy >= bestAccuracy:
            torch.save(net.state_dict(), modelLocation)
            bestAccuracy = accuracy
            bestEpoch = epoch+1
        print('Best Accuracy of', bestAccuracy,'at epoch',bestEpoch)

    print('Finished Training Model.')
    try:
        net.load_state_dict(torch.load(modelLocation))
    except Exception as e:
        # Best effort: fall back to the last-epoch weights, but say so.
        print('Could not reload best checkpoint from', modelLocation, '-', str(e))

    testModel(net, device, testloader)



In [9]:
# Learning rate wrapped in a LiveTune variable with initial value 1e-4.
# trainModelKD reads the current value by calling learning_rate() at the start
# of every epoch.
# NOTE(review): presumably the value can be changed from outside the kernel
# while training runs (LiveTune prints a port number) — confirm against the
# LiveTune documentation.
learning_rate = liveVar(0.0001, 'learning_rate')


Port number for liveVar dictionary: 44113


In [10]:
# Sanity check: calling the liveVar yields its current value (a float here).
type(learning_rate())

float

In [11]:
def trainModelKD(model, modelLocation, teacher, device, trainloader, testloader, alpha, T, opt, startEpoch, totalEpochs, accuracy = 0):
    """Train ``model`` (the student) with knowledge distillation from ``teacher``.

    The hard-label term is a batch-mean KL divergence between the student's
    log-softmax and one-hot targets. distiller's KnowledgeDistillationPolicy,
    built with ``DistillationLossWeights(alpha*T*T, 1-alpha, 0.0)`` and
    temperature ``T``, turns that into the overall loss that is optimised.

    Args:
        model: student network (updated in place).
        modelLocation: path the student's state dict is saved to. It is
            written once up front and again whenever the epoch accuracy
            strictly improves on the best so far.
        teacher: teacher network handed to the distillation policy.
        device: device the batches are moved to.
        trainloader: iterable of ``(inputs, labels)`` training batches.
        testloader: iterable of evaluation batches, passed to checkAccuracy
            after every epoch and to testModel at the end.
        alpha: mixing coefficient used in the loss weights above.
        T: distillation temperature.
        opt: optimizer *class*, instantiated with ``weight_decay=5e-4``.
        startEpoch: first epoch index (0-based).
        totalEpochs: exclusive upper bound of the epoch range.
        accuracy: accuracy already reached by ``model``; the initial best score.

    The learning rate comes from the module-level ``learning_rate`` callable
    and is re-read at the start of every epoch. The optimizer itself is built
    once, so any optimizer state survives across epochs.
    """
    criterion = nn.KLDivLoss(reduction="batchmean")
    dlw = distiller.DistillationLossWeights(alpha*T*T, 1-alpha, 0.0)
    kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, T, dlw)
    kd_policy.active = True
    bestAccuracy = accuracy
    bestEpoch = startEpoch
    torch.save(model.state_dict(), modelLocation)
    print(f"model has {get_num_parameters(model, True)/1e6:.2f} M parameters")

    optimizer = opt(model.parameters(), lr=learning_rate(), weight_decay=5e-4)
    for epoch in range(startEpoch, totalEpochs):  # loop over the dataset multiple times
        start = time.time()
        # Pick up the latest live-tuned learning rate without rebuilding the optimizer.
        current_lr = learning_rate()
        for group in optimizer.param_groups:
            group['lr'] = current_lr
        # Only the student is switched to training mode; the teacher is left untouched.
        model.train()
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            output = kd_policy.forward(inputs)
            # One-hot targets built on the same device, sized from the logits
            # rather than a hard-coded class count.
            one_hot_labels = F.one_hot(labels.long(), num_classes=output.size(1)).to(output.dtype)
            output_log = F.log_softmax(output, dim=1)
            loss = criterion(output_log, one_hot_labels)
            loss = kd_policy.before_backward_pass(model, epoch, None, None, loss, None).overall_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if i % 50 == 49:    # print every 50 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 50))
                running_loss = 0.0
        accuracy = checkAccuracy(model, device, testloader)
        if accuracy > bestAccuracy:
            torch.save(model.state_dict(), modelLocation)
            bestAccuracy = accuracy
            bestEpoch = epoch+1
        print('Best Accuracy of', bestAccuracy,'at epoch',bestEpoch)
        end = time.time()
        print("time taken:", end-start)

    print('Finished Training Student.')
    try:
        model.load_state_dict(torch.load(modelLocation))
    except Exception as e:
        # Best effort: fall back to the last-epoch weights, but say so.
        print('Could not reload best checkpoint from', modelLocation, '-', str(e))

    testModel(model, device, testloader)

In [12]:
import sys
# import resnet
import vgg
# import cnn5
import torch
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
# import train
# import avNet
# import netJoin
from utils import *



In [13]:
def get_platform():
    """Return a human-readable OS name derived from ``sys.platform``.

    Known values map to 'Linux', 'OS X' or 'Windows'; anything else is
    returned unchanged. The plain 'linux' key is what Python 3 reports; the
    legacy 'linux1'/'linux2' spellings are kept for older interpreters.
    """
    platforms = {
        'linux' : 'Linux',
        'linux1' : 'Linux',
        'linux2' : 'Linux',
        'darwin' : 'OS X',
        'win32' : 'Windows'
    }
    return platforms.get(sys.platform, sys.platform)

# resDict = {'Resnet18': resnet.ResNet18, 
#                 'Resnet34': resnet.ResNet34,
#                 'Resnet50': resnet.ResNet50,
#                 'Resnet101': resnet.ResNet101,
#                 'Resnet152': resnet.ResNet152}


# models = ['Resnet18', 'Resnet34', 'Resnet50', 'Resnet101', 'Resnet152', 
#               'VGG6AS','VGG6AM','VGG6A','VGG6','VGG7','VGG8','VGG11','VGG13','VGG16','VGG16A','VGG19', '5-CNN']

# Architecture names that createNet accepts (compared case-insensitively).
models = ['VGG16']

def createResnet(ResType):
    """Instantiate the network whose constructor is stored under ``ResType`` in the module-level ``resDict``."""
    return resDict[ResType]()

def createVGG(VGGType):
    """Build the VGG variant named ``VGGType`` via the local ``vgg`` module."""
    network = vgg.VGG(VGGType)
    return network

def create5_CNN():
    """Build the network exposed as ``CNN_5`` by the ``cnn5`` module."""
    network = cnn5.CNN_5()
    return network

def createNet(modelType):
    """Build the network named ``modelType`` (case-insensitive), or return None.

    Names are looked up in the module-level ``models`` list. The factory is
    chosen from the matched name itself, so the result no longer depends on
    where the name sits in that list: names starting with 'resnet' go to
    createResnet, names starting with 'vgg' go to createVGG, and any other
    listed name goes to create5_CNN. The special names 'avg' and 'jn' build
    ``avNet.AvNet()`` and ``netJoin.jointNet()``.

    Returns:
        The constructed network, or None when ``modelType`` is not recognised.
    """
    requested = modelType.lower()

    for name in models:
        family = name.lower()
        if requested != family:
            continue
        if family.startswith('resnet'):
            return createResnet(name)
        if family.startswith('vgg'):
            return createVGG(name)
        return create5_CNN()

    if requested == 'avg':
        return avNet.AvNet()

    if requested == 'jn':
        return netJoin.jointNet()

    return None



In [None]:
# Optimizer *class* handed to trainModel / trainModelKD, which instantiate it
# themselves.
opt = optim.SGD

# Epochs are iterated as range(startEpoch, totalEpochs) by the training loops.
totalEpochs = 1000
startEpoch = 0
# Batch size passed to get_dataloader when the ImageNet loaders are built.
batchSize = 256

In [14]:
# Select the compute device once; later cells reuse `device`.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
if use_gpu:
    print("CUDA Device Detected. GPU will be used.")
else:
    print("No CUDA supported GPU detected. CPU will be used.")

print("Dataset- ImageNet")

# NOTE(review): `workers` is computed here but not passed to get_dataloader
# below — confirm whether the loader helper should receive it.
workers = 0 if get_platform() == 'Windows' else 2

# ImageNet loaders come from the project helper; keep the dict around because
# later cells index it directly (dataloader['val']).
dataloader = get_dataloader("imagenet", batch_size=batchSize)
trainloader = dataloader['train']
testloader = dataloader['val']
#     transform = transforms.Compose(
#         [transforms.ToTensor(),
#          transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])
#     try:
#         testset = torchvision.datasets.CIFAR10(root=sys.argv[1], train=False,
#                                            download=False, transform=transform)
#     except:
#         testset = torchvision.datasets.CIFAR10(root=sys.argv[1], train=False,
#                                            download=True, transform=transform)
        
#     testloader = torch.utils.data.DataLoader(testset, batch_size=1024,
#                                              shuffle=False, num_workers=workers)
    

CUDA Device Detected. GPU will be used.
Dataset- ImageNet


In [15]:
# opt = optim.SGD
# # elif sys.argv[4].lower() == 'sgd':
# #     opt = optim.SGD
# # else:
# #     print('Invalid argument for specifying optimizer. Aborting.')
# #     return

# totalEpochs = 1000
# startEpoch = 0
# batchSize = 256

In [16]:
# Import under a distinct alias: binding this module to `models` would replace
# the `models` list of architecture names that createNet iterates over.
import torchvision.models as tv_models

# Pretrained VGG16, placed on the device selected earlier rather than
# unconditionally on CUDA, and saved so it can be reloaded as the teacher.
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True` — confirm against the installed version before switching.
base_model = tv_models.vgg16(pretrained=True).to(device)
torch.save(base_model.state_dict(), "base_vgg16.pt")

In [17]:
@torch.inference_mode()
def evaluate_topk(
    model: nn.Module,
    dataloader: DataLoader, 
    verbose=True,
    device=None,
    topk = (1,)
) -> float:
  # model.eval()
    mega_list = []
    for inputs, targets in tqdm(dataloader, desc="eval", leave=False, 
                                disable=not verbose):
        # Move the data from CPU to GPU
        if device is None:
            inputs = inputs.cuda()
            targets = targets.cuda()
        else:
            inputs = inputs.to(device)
            targets = targets.to(device)

        # Inference
        outputs = model(inputs)
        maxk = max(topk)
        batch_size = targets.size(0)

        _, y_pred = outputs.topk(k=maxk, dim=1) 
        y_pred = y_pred.t()  

        target_reshaped = targets.view(1, -1).expand_as(y_pred)  
        correct = (y_pred == target_reshaped)  

        list_topk_accs = []  # idx is topk1, topk2, ... etc
        for k in topk:
            ind_which_topk_matched_truth = correct[:k]  # [maxk, B] -> [k, B]
            flattened_indicator_which_topk_matched_truth = ind_which_topk_matched_truth.reshape(-1).float()  # [k, B] -> [kB]
            tot_correct_topk = flattened_indicator_which_topk_matched_truth.float().sum(dim=0, keepdim=True)  # [kB] -> [1]
            topk_acc = tot_correct_topk / batch_size  # topk accuracy for entire batch
            list_topk_accs.append(topk_acc)
        mega_list.append(list_topk_accs)
    return mega_list  # list of topk accuracies for entire batch [topk1, topk2, ... etc]


In [18]:
# Per-batch top-5 results for the pretrained base model on the validation loader.
mega_list = evaluate_topk(base_model, dataloader['val'], topk=(5,))

                                                     

In [19]:
# Use a new name instead of rebinding `mega_list`, so this cell can be re-run
# without first re-running the evaluation above.
top5_per_batch = [batch_accs[0].item() for batch_accs in mega_list]
# Unweighted mean over batches; this equals the dataset-level accuracy only
# when every batch has the same size.
print(sum(top5_per_batch)/len(top5_per_batch))

0.8827665448188782


In [20]:
# Parameter count, top-1 accuracy and latency report for the pretrained base model.
testModel(base_model, device, testloader)

Parameters: 138357544
Top-1 Accuracy of the network on the 10000 test images: 71.22 %
Average Latency for 10000 test images: 0.0007960352420806885 seconds


In [21]:
# elif sys.argv[8].lower() == 'n':

# model = createVGG('VGG16')
# if model is None:
#     print('Invalid Model Type. Aborting.')
# model = model.to(device)
# print(model)
# try:
#     model.load_state_dict(torch.load("base_vgg16_256_scratch.pt"))
#     accuracy = checkAccuracy(model, device, testloader)
#     print("accuracy:", accuracy)
# except Exception as e:
#     print(str(e))
#     accuracy = 0

# trainModel(model, "base_vgg16_256_scratch.pt", device, trainloader, testloader, opt, startEpoch, totalEpochs, accuracy)

In [22]:
# # if sys.argv[8].lower() == 'y': # Use distillation
# try:
# Distillation hyper-parameters forwarded to trainModelKD.
alpha = 0.5
T = 2.0

student_model = createVGG('VGG16')
if student_model is None:
    # Stop here instead of failing later with an AttributeError on None.
    raise ValueError('Invalid Model Type. Aborting.')
print(student_model)
student_model = student_model.to(device)

# Resume from an earlier student checkpoint when one can be loaded; otherwise
# start from the freshly initialised weights with a baseline accuracy of 0.
try:
    student_model.load_state_dict(torch.load("student_vgg.pt"))
except Exception as e:
    print("Exception loading student model:", str(e))
    accuracy = 0
else:
    # Evaluated outside the try block so that a genuine evaluation error is
    # not mistaken for a missing checkpoint.
    accuracy = checkAccuracy(student_model, device, testloader)

teacher = base_model
if teacher is None:
    raise ValueError('Invalid Model Type for Teacher. Aborting.')
teacher = teacher.to(device)
try:
    teacher.load_state_dict(torch.load("base_vgg16.pt"))
except Exception as e:
    print("Exception loading teacher model:", str(e))
    print('The Teacher model does not exists. Aborting.')
    # Actually abort, as the message says, rather than distilling from a
    # teacher whose weights could not be restored.
    raise


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256

In [23]:
# student_model.features[0].weight = teacher.features[0].weight
# student_model.features[0].bias = teacher.features[0].bias
# student_model.features[3].weight = teacher.features[2].weight
# student_model.features[3].bias = teacher.features[2].bias
# student_model.features[7].weight = teacher.features[5].weight
# student_model.features[7].bias = teacher.features[5].bias
# student_model.features[10].weight = teacher.features[7].weight
# student_model.features[10].bias = teacher.features[7].bias
# student_model.features[14].weight = teacher.features[10].weight
# student_model.features[14].bias = teacher.features[10].bias
# student_model.features[17].weight = teacher.features[12].weight
# student_model.features[17].bias = teacher.features[12].bias
# student_model.features[20].weight = teacher.features[14].weight
# student_model.features[20].bias = teacher.features[14].bias
# student_model.features[24].weight = teacher.features[17].weight
# student_model.features[24].bias = teacher.features[17].bias
# student_model.features[27].weight = teacher.features[19].weight
# student_model.features[27].bias = teacher.features[19].bias
# student_model.features[30].weight = teacher.features[21].weight
# student_model.features[30].bias = teacher.features[21].bias
# student_model.features[34].weight = teacher.features[24].weight
# student_model.features[34].bias = teacher.features[24].bias
# student_model.features[37].weight = teacher.features[26].weight
# student_model.features[37].bias = teacher.features[26].bias
# student_model.features[40].weight = teacher.features[28].weight
# student_model.features[40].bias = teacher.features[28].bias


In [24]:
# student_model.classifier[6]

In [25]:
# Freeze the teacher: no gradients for its parameters, and eval mode.
teacher.requires_grad_(False)
teacher.eval()

# Distil into the student; the best student weights are written to student_vgg.pt.
trainModelKD(
    student_model,
    "student_vgg.pt",
    teacher,
    device,
    trainloader,
    testloader,
    alpha,
    T,
    opt,
    startEpoch,
    totalEpochs,
    accuracy,
)
# return


model has 39.81 M parameters
[1,    50] loss: 50.004
[1,   100] loss: 37.365
[1,   150] loss: 35.871
[1,   200] loss: 35.684
[1,   250] loss: 35.204
[1,   350] loss: 34.607
[1,   400] loss: 34.361
[1,   450] loss: 33.546
[1,   500] loss: 33.666
Accuracy of the network on the test images: 0.76 %
Best Accuracy of 0.76 at epoch 1
time taken: 673.0109901428223
[2,    50] loss: 34.235
[2,   100] loss: 33.677
[2,   150] loss: 33.045
[2,   200] loss: 32.949
[2,   250] loss: 32.842
[2,   300] loss: 32.657
[2,   350] loss: 32.048
[2,   400] loss: 31.048
[2,   450] loss: 30.481
[2,   500] loss: 30.416
Accuracy of the network on the test images: 3.18 %
Best Accuracy of 3.18 at epoch 2
time taken: 670.6775522232056
[3,    50] loss: 29.827
[3,   100] loss: 29.792
[3,   150] loss: 29.503
[3,   200] loss: 28.489
[3,   250] loss: 28.211
[3,   300] loss: 28.116
[3,   350] loss: 27.497
[3,   400] loss: 27.211
[3,   450] loss: 26.923
[3,   500] loss: 27.248
Accuracy of the network on the test images: 5.0

[24,    50] loss: 8.457
[24,   100] loss: 8.245
[24,   150] loss: 8.380
[24,   200] loss: 8.394
[24,   250] loss: 8.491
[24,   300] loss: 8.603
[24,   350] loss: 8.622
[24,   400] loss: 8.648
[24,   450] loss: 8.743
[24,   500] loss: 8.798
Accuracy of the network on the test images: 38.30 %
Best Accuracy of 38.3 at epoch 24
time taken: 669.8095555305481
[25,    50] loss: 8.186
[25,   100] loss: 8.089
[25,   150] loss: 8.102
[25,   200] loss: 8.295
[25,   250] loss: 8.470
[25,   300] loss: 8.370
[25,   350] loss: 8.483
[25,   400] loss: 8.550
[25,   450] loss: 8.473
[25,   500] loss: 8.610
Accuracy of the network on the test images: 37.52 %
Best Accuracy of 38.3 at epoch 24
time taken: 669.1401581764221
[26,    50] loss: 8.041
[26,   100] loss: 7.950
[26,   150] loss: 8.085
[26,   200] loss: 8.097
[26,   250] loss: 8.244
[26,   300] loss: 8.238
[26,   350] loss: 8.384
[26,   400] loss: 8.367
[26,   450] loss: 8.413
[26,   500] loss: 8.409
Accuracy of the network on the test images: 39.0

[47,    50] loss: 6.525
[47,   100] loss: 6.406
[47,   150] loss: 6.497
[47,   200] loss: 6.609
[47,   250] loss: 6.694
[47,   300] loss: 6.738
[47,   350] loss: 6.828
[47,   400] loss: 6.848
[47,   450] loss: 6.888
[47,   500] loss: 7.051
Accuracy of the network on the test images: 42.86 %
Best Accuracy of 43.22 at epoch 46
time taken: 664.4743039608002
[48,    50] loss: 6.465
[48,   100] loss: 6.360
[48,   150] loss: 6.470
[48,   200] loss: 6.672
[48,   250] loss: 6.658
[48,   300] loss: 6.704
[48,   350] loss: 6.790
[48,   400] loss: 6.853
[48,   450] loss: 6.800
[48,   500] loss: 7.025
Accuracy of the network on the test images: 42.94 %
Best Accuracy of 43.22 at epoch 46
time taken: 664.1481900215149
[49,    50] loss: 6.560
[49,   100] loss: 6.224
[49,   150] loss: 6.433
[49,   200] loss: 6.550
[49,   250] loss: 6.511
[49,   300] loss: 6.796
[49,   350] loss: 6.806
[49,   400] loss: 6.858
[49,   450] loss: 6.815
[49,   500] loss: 6.844
Accuracy of the network on the test images: 43

[70,    50] loss: 6.073
[70,   100] loss: 5.975
[70,   150] loss: 6.032
[70,   200] loss: 6.014
[70,   250] loss: 6.227
[70,   300] loss: 6.237
[70,   350] loss: 6.313
[70,   400] loss: 6.379
[70,   450] loss: 6.457
[70,   500] loss: 6.455
Accuracy of the network on the test images: 44.62 %
Best Accuracy of 44.62 at epoch 70
time taken: 664.6925337314606
[71,    50] loss: 6.052
[71,   100] loss: 6.022
[71,   150] loss: 6.005
[71,   200] loss: 6.137
[71,   250] loss: 6.209
[71,   300] loss: 6.236
[71,   350] loss: 6.239
[71,   400] loss: 6.321
[71,   450] loss: 6.429
[71,   500] loss: 6.437
Accuracy of the network on the test images: 43.62 %
Best Accuracy of 44.62 at epoch 70
time taken: 664.5104849338531
[72,    50] loss: 6.070
[72,   100] loss: 5.955
[72,   150] loss: 6.017
[72,   200] loss: 6.176
[72,   250] loss: 6.218
[72,   300] loss: 6.167
[72,   350] loss: 6.230
[72,   400] loss: 6.278
[72,   450] loss: 6.325
[72,   500] loss: 6.404
Accuracy of the network on the test images: 43

Best Accuracy of 54.38 at epoch 92
time taken: 670.7643530368805
[93,    50] loss: 3.219
[93,   100] loss: 3.230
[93,   150] loss: 3.204
[93,   200] loss: 3.272
[93,   250] loss: 3.236
[93,   300] loss: 3.257
[93,   350] loss: 3.317
[93,   400] loss: 3.278
[93,   450] loss: 3.283
[93,   500] loss: 3.290
Accuracy of the network on the test images: 54.04 %
Best Accuracy of 54.38 at epoch 92
time taken: 671.6462461948395
[94,    50] loss: 3.237
[94,   100] loss: 3.171
[94,   150] loss: 3.215
[94,   200] loss: 3.183
[94,   250] loss: 3.200
[94,   300] loss: 3.227
[94,   350] loss: 3.243
[94,   400] loss: 3.221
[94,   450] loss: 3.251
[94,   500] loss: 3.226
Accuracy of the network on the test images: 53.92 %
Best Accuracy of 54.38 at epoch 92
time taken: 670.7504572868347
[95,    50] loss: 3.149
[95,   100] loss: 3.178
[95,   150] loss: 3.166
[95,   200] loss: 3.189
[95,   250] loss: 3.198
[95,   300] loss: 3.195
[95,   350] loss: 3.214
[95,   400] loss: 3.219
[95,   450] loss: 3.216
[95, 

[115,   300] loss: 2.894
[115,   350] loss: 2.871
[115,   400] loss: 2.918
[115,   450] loss: 2.895
[115,   500] loss: 2.931
Accuracy of the network on the test images: 55.62 %
Best Accuracy of 55.62 at epoch 115
time taken: 664.8702352046967
[116,    50] loss: 2.805
[116,   100] loss: 2.840
[116,   150] loss: 2.809
[116,   200] loss: 2.840
[116,   250] loss: 2.853
[116,   300] loss: 2.921
[116,   350] loss: 2.892
[116,   400] loss: 2.877
[116,   450] loss: 2.876
[116,   500] loss: 2.896
Accuracy of the network on the test images: 55.60 %
Best Accuracy of 55.62 at epoch 115
time taken: 664.6256408691406
[117,    50] loss: 2.785
[117,   100] loss: 2.848
[117,   150] loss: 2.815
[117,   200] loss: 2.865
[117,   250] loss: 2.876
[117,   300] loss: 2.844
[117,   350] loss: 2.882
[117,   400] loss: 2.879
[117,   450] loss: 2.886
[117,   500] loss: 2.888
Accuracy of the network on the test images: 54.70 %
Best Accuracy of 55.62 at epoch 115
time taken: 664.4155631065369
[118,    50] loss: 2.

[137,   500] loss: 2.765
Accuracy of the network on the test images: 55.56 %
Best Accuracy of 56.52 at epoch 136
time taken: 665.137937784195
[138,    50] loss: 2.649
[138,   100] loss: 2.615
[138,   150] loss: 2.689
[138,   200] loss: 2.651
[138,   250] loss: 2.688
[138,   300] loss: 2.724
[138,   350] loss: 2.706
[138,   400] loss: 2.760
[138,   450] loss: 2.719
[138,   500] loss: 2.742
Accuracy of the network on the test images: 56.00 %
Best Accuracy of 56.52 at epoch 136
time taken: 665.6104555130005
[139,    50] loss: 2.685
[139,   100] loss: 2.621
[139,   150] loss: 2.676
[139,   200] loss: 2.662
[139,   250] loss: 2.685
[139,   300] loss: 2.701
[139,   350] loss: 2.734
[139,   400] loss: 2.734
[139,   450] loss: 2.705
[139,   500] loss: 2.721
Accuracy of the network on the test images: 55.54 %
Best Accuracy of 56.52 at epoch 136
time taken: 665.5196964740753
[140,    50] loss: 2.672
[140,   100] loss: 2.617
[140,   150] loss: 2.640
[140,   200] loss: 2.702
[140,   250] loss: 2.6

[160,    50] loss: 2.565
[160,   100] loss: 2.530
[160,   150] loss: 2.610
[160,   200] loss: 2.564
[160,   250] loss: 2.580
[160,   300] loss: 2.625
[160,   350] loss: 2.641
[160,   400] loss: 2.617
[160,   450] loss: 2.638
[160,   500] loss: 2.613
Accuracy of the network on the test images: 56.06 %
Best Accuracy of 56.52 at epoch 136
time taken: 665.2115385532379
[161,    50] loss: 2.576
[161,   100] loss: 2.544
[161,   150] loss: 2.566
[161,   200] loss: 2.584
[161,   250] loss: 2.589
[161,   300] loss: 2.596
[161,   350] loss: 2.601
[161,   400] loss: 2.609
[161,   450] loss: 2.602
[161,   500] loss: 2.647
Accuracy of the network on the test images: 56.32 %
Best Accuracy of 56.52 at epoch 136
time taken: 665.2492983341217
[162,    50] loss: 2.566
[162,   100] loss: 2.548
[162,   150] loss: 2.558
[162,   200] loss: 2.618
[162,   250] loss: 2.564
[162,   300] loss: 2.610
[162,   350] loss: 2.604
[162,   400] loss: 2.595
[162,   450] loss: 2.614
[162,   500] loss: 2.630
Accuracy of th

[182,   250] loss: 2.514
[182,   300] loss: 2.545
[182,   350] loss: 2.558
[182,   400] loss: 2.519
[182,   450] loss: 2.581
[182,   500] loss: 2.576
Accuracy of the network on the test images: 56.08 %
Best Accuracy of 56.54 at epoch 165
time taken: 663.8703579902649
[183,    50] loss: 2.517
[183,   100] loss: 2.461
[183,   150] loss: 2.503
[183,   200] loss: 2.501
[183,   250] loss: 2.546
[183,   300] loss: 2.550
[183,   350] loss: 2.525
[183,   400] loss: 2.542
[183,   450] loss: 2.553
[183,   500] loss: 2.573
Accuracy of the network on the test images: 56.50 %
Best Accuracy of 56.54 at epoch 165
time taken: 663.7101490497589
[184,    50] loss: 2.488
[184,   100] loss: 2.494
[184,   150] loss: 2.509
[184,   200] loss: 2.508
[184,   250] loss: 2.510
[184,   300] loss: 2.522
[184,   350] loss: 2.561
[184,   400] loss: 2.542
[184,   450] loss: 2.549
[184,   500] loss: 2.576
Accuracy of the network on the test images: 55.86 %
Best Accuracy of 56.54 at epoch 165
time taken: 663.8755629062

Traceback (most recent call last):
  File "/home/shariff/anaconda3/lib/python3.9/multiprocessing/queues.py", line 251, in _feed
    send_bytes(obj)
  File "/home/shariff/anaconda3/lib/python3.9/multiprocessing/connection.py", line 205, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/shariff/anaconda3/lib/python3.9/multiprocessing/connection.py", line 416, in _send_bytes
    self._send(header + buf)
  File "/home/shariff/anaconda3/lib/python3.9/multiprocessing/connection.py", line 373, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe

KeyboardInterrupt



In [None]:
# criterion = nn.KLDivLoss(reduction="batchmean")

In [None]:
# output_log = F.log_softmax(output, dim=1)

In [None]:
# criterion(output, one_hot_labels)