<a href="https://colab.research.google.com/github/step-cheng/cs496_gradienttheory/blob/main/CS_496_HW_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [91]:
import torch
from torchvision import datasets
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision
from torchvision import transforms
from tqdm import tqdm

In [119]:
# Per-channel mean / standard deviation handed to Normalize, and the settings
# shared by every DataLoader below.
channel_mean = (0.4914, 0.4822, 0.4465)
channel_std = (0.2023, 0.1994, 0.2010)
loader_options = dict(batch_size=100, num_workers=4)

# Deterministic preprocessing applied to every image: tensor conversion, then normalisation.
transformation = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Training-time pipeline: random crop and horizontal flip in front of the shared preprocessing.
augmented_transformation = transforms.Compose([
    transforms.RandomCrop(size=32, padding=4),
    transforms.RandomHorizontalFlip(),
    transformation,
])

# Same training split twice: once with augmentation, once without ("fast").
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=augmented_transformation)
train_dataset_fast = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transformation)
test_dataset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transformation)

# Both training loaders shuffle; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
train_loader_fast = DataLoader(train_dataset_fast, shuffle=True, **loader_options)

test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


# Question 1

In [78]:
class Model(nn.Module):
  """Small convolutional classifier: two conv/ReLU/max-pool stages and a two-layer head.

  The first linear layer expects 125 flattened features, which is what a 3x32x32
  input produces (5 channels x 5 x 5 after the second pooling step). The network
  outputs 10 raw class scores per example.
  """

  def __init__(self):
    super().__init__()
    # Everything stays inside one nn.Sequential called `layers`, in this exact order,
    # so parameter names (`layers.0.weight`, ...) are stable.
    feature_stages = [
      nn.Conv2d(3, 5, 5),
      nn.ReLU(),
      nn.MaxPool2d(2, 2),
      nn.Conv2d(5, 5, 5),
      nn.ReLU(),
      nn.MaxPool2d(2, 2),
    ]
    classifier_head = [
      nn.Flatten(),
      nn.Linear(125, 30),
      nn.ReLU(),
      nn.Linear(30, 10),
    ]
    self.layers = nn.Sequential(*feature_stages, *classifier_head)

  def forward(self, x):
    """Push `x` through the layer stack and return the unnormalised class scores."""
    return self.layers(x)

# Notebook-wide default device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [26]:
def train(iterations=2):
  """Train a fresh `Model` on `train_loader`, printing test accuracy after every epoch.

  Args:
    iterations: Number of full passes over `train_loader`. Defaults to 2.

  Returns:
    The trained `Model`, on the device it was trained on.
  """
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = Model().to(device)
  optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
  criterion = nn.CrossEntropyLoss()

  for _ in range(iterations):
    for x, y in tqdm(train_loader):
      x, y = x.to(device), y.to(device)
      optimizer.zero_grad()
      loss = criterion(model(x), y)
      loss.backward()
      optimizer.step()

    # Evaluation only needs predictions, so skip building autograd graphs.
    matches = 0
    samples = 0
    with torch.no_grad():
      # Iterate the loader directly so tqdm can show the total number of batches.
      for x, y in tqdm(test_loader):
        x, y = x.to(device), y.to(device)
        predictions = model(x).argmax(dim=-1)
        samples += y.shape[0]
        matches += (predictions == y).sum().item()
    print(f"Test Accuracy: {matches / samples}")

  return model

In [27]:
# Train the small CNN; the later Hessian cells (`calculate_hessian`, `HVP`) take this model as input.
trained_model = train()

100%|██████████| 500/500 [00:17<00:00, 28.13it/s]
1it [00:02,  2.49s/it]


Test Accuracy: 0.3871


100%|██████████| 500/500 [00:17<00:00, 28.68it/s]
1it [00:03,  3.20s/it]

Test Accuracy: 0.4435





In [65]:
def calculate_hessian(model, batch=None):
  """Build the dense Hessian of the cross-entropy loss w.r.t. every model parameter.

  The Hessian is assembled one row at a time: the flattened gradient is computed
  with a differentiable graph, and each of its entries is differentiated again.

  Args:
    model: Module whose parameters the loss is differentiated against.
    batch: Optional `(inputs, targets)` pair on which the loss is evaluated.
      When omitted, the first batch drawn from `train_loader_fast` is used.

  Returns:
    A square 2-D tensor with one row and one column per scalar parameter,
    ordered like the flattened `model.parameters()`.
  """
  # Materialise once: the parameters are needed again for every Hessian row.
  params = list(model.parameters())
  # Follow the model's own placement instead of re-detecting CUDA, so a model
  # kept on the CPU still works on a machine that has a GPU.
  device = params[0].device

  if batch is None:
    batch = next(iter(train_loader_fast))
  x, y = batch
  x, y = x.to(device), y.to(device)

  criterion = nn.CrossEntropyLoss()
  loss = criterion(model(x), y)

  # create_graph=True keeps the gradient differentiable for the second pass.
  grads = torch.autograd.grad(loss, params, create_graph=True)
  flat_grad = torch.cat([g.reshape(-1) for g in grads])

  print('computing hessian...')
  rows = []
  for g in flat_grad:
    # retain_graph=True because the same graph is differentiated once per row.
    row = torch.autograd.grad(g, params, retain_graph=True)
    rows.append(torch.cat([r.reshape(-1) for r in row]))
  hessian = torch.stack(rows)
  print('computed hessian')
  print(hessian.shape)

  return hessian

In [66]:
# Dense Hessian of the loss for the trained CNN; the eigenvalue cells below read `H`.
H = calculate_hessian(trained_model)

computing hessian...
computed hessian
torch.Size([5100, 5100])


In [70]:
# H is a Hessian, so it is symmetric up to floating-point error. The symmetric
# solver returns real eigenvalues in ascending order and does not compute the
# eigenvectors, which are not used here. Averaging with the transpose removes
# the residual numerical asymmetry before the decomposition.
eigens = torch.linalg.eigvalsh((H + H.T) / 2)

In [90]:
# Work with the real parts, sorted in ascending order.
eigens_real = eigens.real
lambdas, _ = torch.sort(eigens_real)
smallest_lambdas = lambdas[:5]
largest_lambdas = lambdas[-5:]
# The smoothness constant is the spectral norm of the Hessian, i.e. the largest
# eigenvalue in absolute value. Taking the largest signed eigenvalue would
# under-report it whenever the most negative eigenvalue dominates in magnitude.
smoothness_constant = lambdas.abs().max()

print(f"Smallest eigenvalues: {smallest_lambdas.tolist()}")
print(f"Largest eigenvalues: {largest_lambdas.tolist()}")
print(f"Smoothness constant: {smoothness_constant.item()}")

Smallest eigenvalues: [-3.288841962814331, -2.269296407699585, -2.157702922821045, -2.018486738204956, -1.9371747970581055]
Largest eigenvalues: [17.273191452026367, 18.88569450378418, 23.667661666870117, 29.18709373474121, 56.27843475341797]
Smoothness constant: 56.27843475341797


*   5 smallest eigenvalues: -3.29, -2.27, -2.16, -2.02, -1.94
*   5 largest eigenvalues: 17.27, 18.89, 23.67, 29.19, 56.28
*   The smoothness constant is the largest eigenvalue in magnitude: 56.28

# Question 2

In [113]:
def HVP(model, beta=0.9):
  """Estimate the largest Hessian eigenvalue by momentum-smoothed power iteration.

  For every batch of `train_loader_fast`, one Hessian-vector product `H v` is
  obtained by differentiating `grad(loss) . v`, the Rayleigh quotient `v . (H v)`
  is recorded, and `v` is moved towards the normalised `H v`.

  Args:
    model: Module whose loss Hessian is probed. Its parameters are not updated.
    beta: Weight kept on the previous direction when blending it with the new
      normalised Hessian-vector product. Defaults to 0.9.

  Returns:
    A list of floats, one Rayleigh quotient per batch, in loader order.

  Note:
    All forward passes run in train mode and the model's previous train/eval
    mode is restored on exit. In train mode, layers such as BatchNorm update
    their running statistics on every forward pass, so the function is not
    side-effect free for models that contain them.
  """
  params = list(model.parameters())
  # Follow the model's own placement instead of a notebook-level global.
  param_device = params[0].device
  num_params = sum(p.numel() for p in params)

  # Random unit-norm starting direction.
  v = torch.randn(num_params).to(param_device)
  v = v / torch.norm(v)
  criterion = nn.CrossEntropyLoss()

  lambdas = []

  # Use one mode for every batch. Previously the mode was toggled after the
  # forward pass, so only the first batch ran in the mode the model arrived in.
  was_training = model.training
  model.train()
  try:
    for x, y in tqdm(train_loader_fast):
      x, y = x.to(param_device), y.to(param_device)

      loss = criterion(model(x), y)

      # create_graph=True keeps the gradient differentiable for the second pass.
      grads = torch.autograd.grad(loss, params, create_graph=True)
      flat_grad = torch.cat([g.reshape(-1) for g in grads])

      # v is a constant, so the gradient of (grad . v) is the product H v.
      hv = torch.autograd.grad(torch.dot(flat_grad, v), params)
      u = torch.cat([h.reshape(-1) for h in hv])

      # Rayleigh quotient of the current unit vector on this batch.
      largest_lambda = torch.dot(v, u)
      lambdas.append(largest_lambda.item())

      # Blend the old direction with the new one to damp batch-to-batch noise,
      # then renormalise to unit length.
      w = u / torch.norm(u)
      blended = beta * v + (1 - beta) * w
      v = blended / torch.norm(blended)
  finally:
    model.train(was_training)

  return lambdas


In [93]:
# Run the power iteration on the trained CNN and list the estimate recorded for each batch.
lambdas = HVP(trained_model)
print('lambdas')
for step, estimate in enumerate(lambdas):
  print(f'Iteration {step}: {estimate}')

lambdas
Iteration 0: 0.0305522158741951
Iteration 1: 0.592523992061615
Iteration 2: 1.9779537916183472
Iteration 3: 5.465104103088379
Iteration 4: 5.223464012145996
Iteration 5: 11.303793907165527
Iteration 6: 19.11136245727539
Iteration 7: 19.27696418762207
Iteration 8: 18.090431213378906
Iteration 9: 21.556529998779297
Iteration 10: 23.42266845703125
Iteration 11: 27.26761817932129
Iteration 12: 34.03251647949219
Iteration 13: 39.713348388671875
Iteration 14: 32.801475524902344
Iteration 15: 44.373634338378906
Iteration 16: 36.10469055175781
Iteration 17: 45.42321014404297
Iteration 18: 41.0089111328125
Iteration 19: 50.74760818481445
Iteration 20: 59.04662322998047
Iteration 21: 44.14892578125
Iteration 22: 48.51994705200195
Iteration 23: 40.65997314453125
Iteration 24: 39.01117706298828
Iteration 25: 50.71582794189453
Iteration 26: 36.936302185058594
Iteration 27: 43.978187561035156
Iteration 28: 40.036041259765625
Iteration 29: 53.984947204589844
Iteration 30: 46.79404830932617
It

The estimate of the largest Hessian eigenvalue is approximately 47.43 after one pass through the dataset. This is similar to the largest eigenvalue computed in Question 1, 56.28.

# Question 3

In [122]:
def train_resnet(model, iterations):
  """Train `model` on `train_loader` for `iterations` epochs, then print its test accuracy.

  Args:
    model: Network to optimise in place. It must already be on its target device.
    iterations: Number of full passes over `train_loader`.

  Returns:
    The same `model` object that was passed in, left in eval mode.
  """
  # Follow the model's own placement rather than a notebook-level global.
  model_device = next(model.parameters()).device
  # A new optimizer is built on every call, so its state starts fresh each time.
  optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
  criterion = nn.CrossEntropyLoss()

  model.train()
  for _ in range(iterations):
    for x, y in tqdm(train_loader):
      x, y = x.to(model_device), y.to(model_device)
      optimizer.zero_grad()
      loss = criterion(model(x), y)
      loss.backward()
      optimizer.step()

  model.eval()
  matches = 0
  samples = 0
  # Evaluation only needs predictions, so skip building autograd graphs.
  with torch.no_grad():
    # Iterate the loader directly so tqdm can show the total number of batches.
    for x, y in tqdm(test_loader):
      x, y = x.to(model_device), y.to(model_device)
      predictions = model(x).argmax(dim=-1)
      samples += y.shape[0]
      matches += (predictions == y).sum().item()
  print(f"Test Accuracy: {matches / samples}")

  return model

In [100]:
# ResNet-50 built with torchvision's default constructor arguments and moved to `device`.
# NOTE(review): no arguments are passed, so the output layer presumably keeps its default
# size rather than the 10 CIFAR-10 classes; confirm this is intended.
resnet = torchvision.models.resnet50().to(device)

In [114]:
# Eigenvalue estimate for the ResNet before any training.
# Only the estimate recorded on the final batch is reported.
lambdas = HVP(resnet)
print(f'largest Hessian eigenvalue: {lambdas[-1]}')

100%|██████████| 500/500 [02:04<00:00,  4.01it/s]

largest Hessian eigenvalue: 59950.4921875





In [121]:
# train_resnet returns the model it was given, so `resnet_10iter` is the same object as
# `resnet`, now trained in place for 10 epochs.
resnet_10iter = train_resnet(resnet, iterations=10)

100%|██████████| 500/500 [00:33<00:00, 15.14it/s]
100%|██████████| 500/500 [00:33<00:00, 14.94it/s]
100%|██████████| 500/500 [00:32<00:00, 15.18it/s]
100%|██████████| 500/500 [00:33<00:00, 15.01it/s]
100%|██████████| 500/500 [00:33<00:00, 15.00it/s]
100%|██████████| 500/500 [00:32<00:00, 15.25it/s]
100%|██████████| 500/500 [00:33<00:00, 14.92it/s]
100%|██████████| 500/500 [00:32<00:00, 15.24it/s]
100%|██████████| 500/500 [00:33<00:00, 14.96it/s]
100%|██████████| 500/500 [00:34<00:00, 14.31it/s]
100it [00:03, 30.68it/s]

Test Accuracy: 0.5065





In [134]:
# Eigenvalue estimate after 10 epochs of training.
# Only the estimate recorded on the final batch is reported.
lambdas = HVP(resnet_10iter)
print(f'largest Hessian eigenvalue: {lambdas[-1]}')

100%|██████████| 500/500 [01:59<00:00,  4.17it/s]

largest Hessian eigenvalue: 99.82711791992188





In [136]:
# Continue from the 10-epoch model for another 90 epochs (100 in total).
# train_resnet creates a new SGD optimizer on each call, so optimizer state restarts here.
resnet_100iter = train_resnet(resnet_10iter, iterations=100-10)

100%|██████████| 500/500 [00:33<00:00, 14.81it/s]
100%|██████████| 500/500 [00:32<00:00, 15.28it/s]
100%|██████████| 500/500 [00:33<00:00, 15.06it/s]
100%|██████████| 500/500 [00:33<00:00, 15.05it/s]
100%|██████████| 500/500 [00:33<00:00, 15.11it/s]
100%|██████████| 500/500 [00:33<00:00, 15.00it/s]
100%|██████████| 500/500 [00:32<00:00, 15.22it/s]
100%|██████████| 500/500 [00:33<00:00, 14.94it/s]
100%|██████████| 500/500 [00:33<00:00, 15.11it/s]
100%|██████████| 500/500 [00:33<00:00, 15.09it/s]
100%|██████████| 500/500 [00:33<00:00, 15.07it/s]
100%|██████████| 500/500 [00:32<00:00, 15.34it/s]
100%|██████████| 500/500 [00:33<00:00, 14.96it/s]
100%|██████████| 500/500 [00:33<00:00, 15.05it/s]
100%|██████████| 500/500 [00:32<00:00, 15.32it/s]
100%|██████████| 500/500 [00:33<00:00, 15.02it/s]
100%|██████████| 500/500 [00:32<00:00, 15.32it/s]
100%|██████████| 500/500 [00:33<00:00, 15.02it/s]
100%|██████████| 500/500 [00:33<00:00, 15.10it/s]
100%|██████████| 500/500 [00:32<00:00, 15.23it/s]


Test Accuracy: 0.7746


In [137]:
# Eigenvalue estimate after 100 epochs of training in total.
# Only the estimate recorded on the final batch is reported.
lambdas = HVP(resnet_100iter)
print(f'largest Hessian eigenvalue: {lambdas[-1]}')

100%|██████████| 500/500 [01:58<00:00,  4.20it/s]

largest Hessian eigenvalue: 5.35465669631958





The experiments above give the following estimates of the largest Hessian eigenvalue:

*   Before Training: 59950.49
*   After 10 epochs: 99.83
*   After 100 epochs: 5.35

