In [1]:
import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt
from time import time
from torchvision import datasets, transforms
from torch import nn, optim
from torch.autograd.functional import hessian
from torch.nn.utils import _stateless

In [2]:
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                              ])

In [3]:
trainset = datasets.MNIST('PATH_TO_STORE_TRAINSET', download=True, train=True, transform=transform)
valset = datasets.MNIST('PATH_TO_STORE_TESTSET', download=True, train=False, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=64, shuffle=True)

In [4]:
dataiter = iter(trainloader)
images, labels = next(dataiter)

# print(images.shape)
# print(labels.shape)

In [6]:
input_size = 784
hidden_sizes = [128, 64]
output_size = 10

model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.LogSoftmax(dim=1))
print(model)

Sequential(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
  (5): LogSoftmax(dim=1)
)


In [7]:
criterion = nn.NLLLoss()
images, labels = next(iter(trainloader))
print(images.shape)
images = images.view(images.shape[0], -1)
print(images.shape)
logps = model(images) 
loss = criterion(logps, labels)
print(loss) 
print(logps.shape)

torch.Size([64, 1, 28, 28])
torch.Size([64, 784])
tensor(2.3046, grad_fn=<NllLossBackward0>)
torch.Size([64, 10])


In [8]:
print('Before backward pass: \n', model[0].weight.grad)
loss.backward()
print('After backward pass: \n', model[4].weight.grad)

Before backward pass: 
 None
After backward pass: 
 tensor([[ 0.0000e+00,  1.5453e-03,  7.4160e-05,  1.4030e-04,  1.4771e-03,
          5.3459e-04, -3.8966e-03,  1.4334e-02,  3.6914e-03,  1.5767e-03,
          5.2435e-03, -2.6516e-03, -2.4836e-04,  1.0339e-03,  4.5706e-05,
          1.2322e-03,  2.1753e-04,  3.4726e-04, -1.7591e-03,  1.1299e-02,
         -1.0000e-02,  3.7687e-03, -5.5512e-03,  1.4427e-03,  5.1039e-03,
         -2.3901e-03,  1.5269e-03,  8.7992e-05,  1.0616e-02,  6.3586e-03,
          7.0162e-04, -3.7762e-04, -5.5781e-03,  2.4017e-05,  0.0000e+00,
         -1.0679e-03,  7.9320e-05,  1.0696e-02,  1.1326e-02, -2.3999e-03,
         -3.7648e-04, -1.1017e-02,  7.3742e-05, -9.5129e-03,  5.6308e-03,
         -9.0303e-04,  2.6498e-03,  0.0000e+00,  7.7746e-05, -8.7291e-04,
          1.1455e-03,  9.7154e-04,  1.8541e-03, -1.0392e-04,  0.0000e+00,
          1.4327e-04,  1.5211e-03,  7.0983e-04,  1.7299e-04, -1.2977e-03,
          1.4544e-04, -4.2536e-03, -9.4178e-04,  2.5260e-03]

In [9]:
## Method 1 to calculate hessian:

# def eval_hessian(loss_grad, model):
#     cnt = 0
#     for g in loss_grad:
#         g_vector = g.contiguous().view(-1) if cnt == 0 else torch.cat([g_vector, g.contiguous().view(-1)])
#         cnt = 1
#     l = g_vector.size(0)
#     hessian = torch.zeros(l, l)
#     for idx in range(l):
#         grad2rd = torch.autograd.grad(g_vector[idx], model.parameters(), create_graph=True)
#         cnt = 0
#         for g in grad2rd:
#             g2 = g.contiguous().view(-1) if cnt == 0 else torch.cat([g2, g.contiguous().view(-1)])
#             cnt = 1
#         hessian[idx] = g2
#     return hessian.cpu().data.numpy()

None


In [8]:
optimizer = optim.SGD(model.parameters(), lr=0.003, momentum=0.9)
time0 = time()
epochs = 5
for e in range(epochs):
    running_loss = 0
    for images, labels in trainloader:
        # Flatten MNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)
    
        # Training pass
        optimizer.zero_grad()
        
        output = model(images)
        loss = criterion(output, labels)
        
        #This is where the model learns by backpropagating
        loss.backward()
        
        #And optimizes its weights here
        optimizer.step()
        
        running_loss += loss.item()
    else:
        print("Epoch {} - Training loss: {}".format(e, running_loss/len(trainloader)))
print("\nTraining Time (in minutes) =",(time()-time0)/60)
print(model[0].weight)

Epoch 0 - Training loss: 0.6344314235836458
Epoch 1 - Training loss: 0.2816658278327507
Epoch 2 - Training loss: 0.22000452187031444
Epoch 3 - Training loss: 0.17839818782230685
Epoch 4 - Training loss: 0.14735293241817432

Training Time (in minutes) = 1.4188337246576945
Parameter containing:
tensor([[-0.0285, -0.0279,  0.0091,  ...,  0.0074, -0.0261, -0.0006],
        [-0.0293, -0.0035,  0.0196,  ...,  0.0311,  0.0285, -0.0139],
        [ 0.0339, -0.0122,  0.0100,  ...,  0.0158, -0.0294,  0.0045],
        ...,
        [-0.0344,  0.0077,  0.0268,  ..., -0.0153, -0.0320, -0.0343],
        [ 0.0209, -0.0103,  0.0107,  ...,  0.0077,  0.0239, -0.0136],
        [ 0.0131, -0.0225,  0.0281,  ...,  0.0109, -0.0218,  0.0003]],
       requires_grad=True)


In [10]:
# print(model[0].weight.grad)
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], -1)
def loss1(*params):
    out: torch.Tensor = _stateless.functional_call(model, {n: p for n, p in zip(names, params)},images)
    return out.square().sum()

names = list(n for n, _ in model.named_parameters())
print(loss1)
print(model.parameters())

print(hessian(loss1, tuple(model.parameters())))
# loss_grad = torch.autograd.grad(loss, model.parameters(),retain_graph = True)
# h = eval_hessian(loss_grad, model)

<function loss1 at 0x7f1b14478700>
<generator object Module.parameters at 0x7f1a557ecb30>


KeyboardInterrupt: ignored

In [4]:
# images, labels = next(iter(valloader))

# img = images[0].view(1, 784)
# with torch.no_grad():
#     logps = model(img)

# ps = torch.exp(logps)
# probab = list(ps.numpy()[0])
# print("Predicted Digit =", probab.index(max(probab)))
# view_classify(img.view(1, 28, 28), ps)

In [None]:
# correct_count, all_count = 0, 0
# for images,labels in valloader:
#   for i in range(len(labels)):
#     img = images[i].view(1, 784)
#     with torch.no_grad():
#         logps = model(img)

    
#     ps = torch.exp(logps)
#     probab = list(ps.numpy()[0])
#     pred_label = probab.index(max(probab))
#     true_label = labels.numpy()[i]
#     if(true_label == pred_label):
#       correct_count += 1
#     all_count += 1

# print("Number Of Images Tested =", all_count)
# print("\nModel Accuracy =", (correct_count/all_count))

In [45]:
import torch
from torch.autograd.functional import hessian
from torch.nn.utils import _stateless

model = torch.nn.Linear(2, 2)
inp = torch.rand(1, 2)

def loss(*params):
    out: torch.Tensor = _stateless.functional_call(model, {n: p for n, p in zip(names, params)}, inp)
    return out.square().sum()
print(model.parameters())
print(loss())
names = list(n for n, _ in model.named_parameters())
print(hessian(loss, tuple(model.parameters())))

<generator object Module.parameters at 0x7fe4b072ef20>
tensor(0.1943, grad_fn=<SumBackward0>)
((tensor([[[[0.0852, 0.1074],
          [0.0000, 0.0000]],

         [[0.1074, 0.1353],
          [0.0000, 0.0000]]],


        [[[0.0000, 0.0000],
          [0.0852, 0.1074]],

         [[0.0000, 0.0000],
          [0.1074, 0.1353]]]]), tensor([[[0.4128, 0.0000],
         [0.5202, 0.0000]],

        [[0.0000, 0.4128],
         [0.0000, 0.5202]]])), (tensor([[[0.4128, 0.5202],
         [0.0000, 0.0000]],

        [[0.0000, 0.0000],
         [0.4128, 0.5202]]]), tensor([[2., 0.],
        [0., 2.]])))
