<a href="https://colab.research.google.com/github/rahulsm27/ML/blob/main/Newton_Method_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
import torch
import torch.optim
import torch.nn as nn
import torch.nn.functional as F


from utils import compute_stats, get_grad
from LBFGS import FullBatchLBFGS

In [29]:
# Run on the GPU when torch reports that CUDA is usable, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [30]:
# Creating our own LeNet5

class LeNet5(nn.Module):
  """LeNet-5 style CNN: two conv/ReLU/max-pool stages followed by three linear layers.

  With the default arguments the layer sizes are exactly those of the
  previously hard-coded network (3 input channels, 10 output logits).
  ``fc1`` expects ``16 * 5 * 5`` features per sample, which is what two
  5x5 convolutions and two 2x2 poolings produce for 32x32 inputs.

  Args:
    in_channels: Number of channels in the input images.
    num_classes: Number of logits produced by the final linear layer.
  """

  def __init__(self, in_channels=3, num_classes=10):
    super().__init__()
    # nn.Conv2d arguments: in channels, out channels, kernel size
    self.conv1 = nn.Conv2d(in_channels, 6, 5)
    self.relu1 = nn.ReLU()
    self.maxpool1 = nn.MaxPool2d((2, 2))

    self.conv2 = nn.Conv2d(6, 16, 5)
    self.relu2 = nn.ReLU()
    self.maxpool2 = nn.MaxPool2d((2, 2))

    self.fc1 = nn.Linear(16 * 5 * 5, 120)
    self.fc2 = nn.Linear(120, 84)
    self.fc3 = nn.Linear(84, num_classes)

  def forward(self, x):
    """Return the raw (un-normalised) logits, one row per input sample.

    Args:
      x: Batched image tensor of shape ``(batch, in_channels, H, W)``.

    Returns:
      Tensor of shape ``(batch, num_classes)``.
    """
    x = self.maxpool1(self.relu1(self.conv1(x)))
    x = self.maxpool2(self.relu2(self.conv2(x)))

    # Flatten everything except the batch axis. The earlier
    # ``view(-1, nelement / batch)`` divided by the batch size and so raised
    # ZeroDivisionError for an empty batch.
    x = torch.flatten(x, 1)

    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    return self.fc3(x)



# Build the network, move it to the selected device, then re-draw both
# convolution kernels with Xavier (Glorot) uniform initialisation. The linear
# layers keep whatever initialisation nn.Linear gave them.
model = LeNet5()
model = model.to(device)
nn.init.xavier_uniform_(model.conv1.weight)
nn.init.xavier_uniform_(model.conv2.weight)

Parameter containing:
tensor([[[[ 3.0787e-02, -6.2263e-02, -5.1466e-02,  3.8122e-02,  3.2720e-02],
          [ 2.4915e-02, -7.6524e-02, -5.7436e-02, -1.0001e-01, -1.8606e-02],
          [-1.0129e-01,  3.5482e-02,  7.8135e-02, -1.0223e-01, -9.1828e-02],
          [ 2.7611e-02, -4.4325e-02,  5.1430e-02,  1.7977e-02, -6.9935e-02],
          [-4.2182e-03,  4.6309e-02, -1.8666e-02, -7.8248e-02,  2.3244e-02]],

         [[ 9.9645e-02, -6.8276e-02,  4.1207e-02,  1.0047e-01,  9.3279e-02],
          [ 5.1429e-02,  3.5563e-02, -7.3050e-02, -4.5599e-02, -7.8214e-02],
          [-8.7034e-02,  9.5617e-03,  2.2930e-02,  8.6502e-02, -1.0082e-01],
          [-8.8730e-02, -1.5538e-02, -9.0158e-02,  3.5200e-03, -4.2291e-02],
          [-4.1865e-02,  4.3006e-02,  4.0019e-02, -8.6938e-02,  4.0544e-02]],

         [[-3.5522e-02,  9.9218e-02,  4.2517e-02,  4.5931e-03,  8.0518e-02],
          [-5.4824e-02, -3.4293e-02,  7.6069e-02,  9.5703e-02,  3.5839e-02],
          [ 7.0171e-02, -5.3860e-02,  2.1003e-02, 

In [31]:
from tensorflow.keras.datasets import cifar10 # to load dataset

# Fetch the train/test splits.
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

# Cast to float32, divide by 255 (presumably mapping 8-bit pixel values into
# [0, 1] -- confirm against the loader), and move axis 3 to position 1 so the
# arrays are laid out the way nn.Conv2d expects (channels before height/width).
X_train = (X_train.astype('float32') / 255).transpose(0, 3, 1, 2)
X_test = (X_test.astype('float32') / 255).transpose(0, 3, 1, 2)

In [36]:
#criterion = nn.CrossEntropyLoss()
optimizer = FullBatchLBFGS(model.parameters(), lr=1., history_size=10, line_search='Wolfe', debug=True)


def opfun(X):
    """Forward pass: run ``model`` on the batch ``X`` and return its output.

    ``X`` is converted with ``torch.as_tensor`` (no extra copy for a NumPy
    array) and moved to ``device``. The module is called directly rather than
    through ``.forward`` so that any registered hooks still run.
    """
    return model(torch.as_tensor(X).to(device))


def predsfun(op):
    """Turn network outputs into predicted class indices (argmax over axis 1).

    Returns a NumPy array; the tensor is detached and copied to the CPU first.
    """
    return np.argmax(op.detach().cpu().numpy(), 1)


def accfun(op, y):
    """Accuracy in percent of the predictions derived from ``op`` against ``y``.

    ``y`` is squeezed before the comparison (presumably labels arrive with a
    trailing singleton axis -- confirm against the data loader).
    """
    return np.mean(np.equal(predsfun(op), y.squeeze())) * 100


# Main training loop
no_samples = X_train.shape[0]

# compute initial gradient and objective
grad, obj = get_grad(optimizer, X_train, y_train, opfun)



In [37]:
# Parameters for L-BFGS training
max_iter = 200
ghost_batch = 128

# Partition of all sample indices into roughly ghost_batch-sized chunks.
# It never changes between iterations, so it is built once up front.
n_chunks = max(no_samples // ghost_batch, 1)
sample_chunks = np.array_split(np.arange(no_samples), n_chunks)


def closure():
    """Closure for the line search: full-batch mean cross-entropy loss.

    Clears the optimizer's gradients, then accumulates the loss chunk by
    chunk, weighting each chunk by its share of the samples so that the sum
    equals the mean loss over the whole training set. Returns the loss tensor
    (still attached to the autograd graph).
    """
    optimizer.zero_grad()

    loss_fn = torch.zeros((), dtype=torch.float, device=device)

    for subsmpl in sample_chunks:
        ops = opfun(X_train[subsmpl])
        tgts = torch.from_numpy(y_train[subsmpl]).to(device).long().squeeze()
        loss_fn += F.cross_entropy(ops, tgts) * (len(subsmpl) / no_samples)

    return loss_fn


# main loop
for n_iter in range(max_iter):

    # training mode
    model.train()

    # perform line search step; `obj` carries the current loss from one
    # iteration to the next
    options = {'closure': closure, 'current_loss': obj}
    obj, grad, lr, _, _, _, _, _ = optimizer.step(options)

    # compute statistics, chunked with the same ghost_batch as the closure
    # (this used to be a separate hard-coded 128)
    model.eval()
    train_loss, test_loss, test_acc = compute_stats(X_train, y_train, X_test, y_test, opfun, accfun,
                                                    ghost_batch=ghost_batch)

    # print data
    print('Iter:', n_iter + 1, 'lr:', lr, 'Training Loss:', train_loss, 'Test Loss:', test_loss,
          'Test Accuracy:', test_acc)

Iter: 1 lr: 4.0 Training Loss: 2.297409815716743 Test Loss: 2.2972478403329837 Test Accuracy: 9.95
Iter: 2 lr: 0.2 Training Loss: 2.294278284583091 Test Loss: 2.2938123137950894 Test Accuracy: 10.119999999999997
Iter: 3 lr: 1.0 Training Loss: 2.2879178206825257 Test Loss: 2.287668154406547 Test Accuracy: 13.340000000000002
Iter: 4 lr: 2.0 Training Loss: 2.2617658226919177 Test Loss: 2.2625082651138304 Test Accuracy: 14.440000000000003
Iter: 5 lr: 0.2 Training Loss: 2.25482784380436 Test Loss: 2.25677495856285 Test Accuracy: 13.640000000000002
Iter: 6 lr: 1.0 Training Loss: 2.232477621331215 Test Loss: 2.2384168288707738 Test Accuracy: 17.070000000000004
Iter: 7 lr: 1.0 Training Loss: 2.209806303195953 Test Loss: 2.215066414952279 Test Accuracy: 17.18000000000001
Iter: 8 lr: 1.0 Training Loss: 2.1970162607765196 Test Loss: 2.1995690737485893 Test Accuracy: 17.48
Iter: 9 lr: 1.0 Training Loss: 2.180640261464119 Test Loss: 2.1839404413938523 Test Accuracy: 20.989999999999995
Iter: 10 lr: 

KeyboardInterrupt: ignored