# This notebook first demonstrates the basic principle behind our method using autograd, then uses analytic derivatives to compute the NTK in parallel.

# The point is that, for simple architectures, the autograd NTK agrees with the analytic method up to an rtol of typically 1e-2; visual inspection shows that the elements are very close.

# We can also use this notebook to benchmark the FC NTK -- though note that the autograd NTK takes a long time to finish for large networks or for many data points, so consider skipping those cells.

In [1]:
#from layerwise_ntk import compute_NTK_CNN
import numpy as np
import random
#import matplotlib.pyplot as plt

import torch
from torch import nn, optim
from torch import load
from torch.nn import functional as F
from torch import autograd

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import time

from numba import njit

In [2]:
from easy_ntk import compute_NTK_CNN

In [2]:
#a = torch.tensor([0.0])
#a = a.to('cuda')

In [3]:
# Seed passed to torch.manual_seed / random.seed / np.random.seed before each model is built.
SEED = 0
# Number of random test inputs (rows of x_test).
# NOTE(review): a run recorded further down failed trying to allocate 40000000000 bytes,
# which equals 4 * how_many**2 for how_many = 100_000 -- consistent with a single
# how_many x how_many float32 array. Lower this value before running the NTK cells on
# a machine without that much memory.
how_many = 100_000
# Number of units in each hidden layer of the dumb_small* networks.
width = 100

In [4]:
# def activation(x):
#     return torch.tanh(x)


# @njit
# def d_activation(x):
#     return np.cosh(x)**-2

def d_activationt(x):
    """Return ``cosh(x) ** -2`` elementwise for a torch tensor ``x``.

    This is the derivative of tanh (sech^2), i.e. the derivative of the
    ``activation`` function used by the networks in this notebook.
    """
    return torch.pow(torch.cosh(x), -2)

def activation(x):
    """Elementwise tanh nonlinearity used after each hidden layer."""
    squashed = torch.tanh(x)
    return squashed

In [5]:
def NTK_weights(m):
    """Re-initialise a layer's weight and bias in place from N(0, 1).

    Meant to be used as ``model.apply(NTK_weights)``, which calls it once per
    sub-module. Only ``nn.Linear`` and ``nn.Conv2d`` modules are touched; any
    other module is left unchanged. No width-dependent scaling is applied to
    the sampled values.

    Parameters
    ----------
    m : torch.nn.Module
        Module visited by ``Module.apply``.

    Returns
    -------
    None
        Parameters are modified in place. The weight shape of every
        re-initialised layer is printed.
    """
    if not isinstance(m, (nn.Linear, nn.Conv2d)):
        return

    print(m.weight.shape)
    nn.init.normal_(m.weight.data)
    # Identity test: `!= None` on a Parameter is routed through tensor
    # operator overloading, whereas `is not None` asks exactly what is meant.
    if m.bias is not None:
        nn.init.normal_(m.bias.data)

In [6]:
class dumb_small(torch.nn.Module):
    '''
    Small fully connected network for test cases.

    Layout: 784 -> width -> width -> width -> width -> 1, every layer with a
    bias. `activation` follows each of the four hidden layers; the output
    layer `d5` is purely linear. Both `width` and `activation` are read from
    the notebook's global scope.

    Author's note kept from the original: it seems like bias vectors aren't
    trivially added.
    '''
    def __init__(self):
        super().__init__()
        hidden = width

        # Layers are created in order d1..d5 and keep these attribute names.
        self.d1 = torch.nn.Linear(784, hidden, bias=True)
        self.d2 = torch.nn.Linear(hidden, hidden, bias=True)
        self.d3 = torch.nn.Linear(hidden, hidden, bias=True)
        self.d4 = torch.nn.Linear(hidden, hidden, bias=True)
        self.d5 = torch.nn.Linear(hidden, 1, bias=True)

    def forward(self, x_0):
        '''Return only the network output for input batch `x_0`.'''
        hidden_state = x_0
        for layer in (self.d1, self.d2, self.d3, self.d4):
            hidden_state = activation(layer(hidden_state))
        return self.d5(hidden_state)

class dumb_small_layerwise(torch.nn.Module):
    '''
    Small fully connected network for test cases that also exposes its
    intermediate activations.

    Layout: 784 -> width -> width -> width -> width -> 1, every layer with a
    bias. `activation` follows each of the four hidden layers; the output
    layer `d5` is purely linear. Both `width` and `activation` are read from
    the notebook's global scope.

    Author's note kept from the original: it seems like bias vectors aren't
    trivially added.
    '''
    def __init__(self):
        super().__init__()
        hidden = width

        # Layers are created in order d1..d5 and keep these attribute names.
        self.d1 = torch.nn.Linear(784, hidden, bias=True)
        self.d2 = torch.nn.Linear(hidden, hidden, bias=True)
        self.d3 = torch.nn.Linear(hidden, hidden, bias=True)
        self.d4 = torch.nn.Linear(hidden, hidden, bias=True)
        self.d5 = torch.nn.Linear(hidden, 1, bias=True)

    def forward(self, x_0):
        '''
        Return the tuple (x_5, x_4, x_3, x_2, x_1, x_0): the network output
        first, then the activations of each hidden layer from last to first,
        and finally the input itself.
        '''
        per_layer = [x_0]
        for layer in (self.d1, self.d2, self.d3, self.d4):
            per_layer.append(activation(layer(per_layer[-1])))
        x_5 = self.d5(per_layer[-1])
        return (x_5, *reversed(per_layer))

In [7]:
def _seed_everything(seed):
    """Seed the torch, `random` and numpy global RNGs with the same value."""
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)


device = 'cpu'

# Model whose forward pass also returns the per-layer activations.
_seed_everything(SEED)
model = dumb_small_layerwise()
model.apply(NTK_weights)
model.to(device)

# Re-seed so that model_2 is built and initialised from the same RNG state as model.
_seed_everything(SEED)
model_2 = dumb_small()
model_2.apply(NTK_weights)

# (how_many, 784) float32 inputs drawn from a standard normal with numpy's RNG.
x_test = torch.from_numpy(
    np.random.normal(0, 1, (how_many, 784)).astype(np.float32)
)

torch.Size([100, 784])
torch.Size([100, 100])
torch.Size([100, 100])
torch.Size([100, 100])
torch.Size([1, 100])
torch.Size([100, 784])
torch.Size([100, 100])
torch.Size([100, 100])
torch.Size([100, 100])
torch.Size([1, 100])


# Autograd NTK -- uncomment and run if the number of data points used and the width are both small (e.g. both under 100), in order to test that its result and the result of the easy_ntk layerwise algorithm agree with one another.

In [8]:
# model_2.zero_grad()
# y = model_2(x_test)

# #in the future we would iterate over layers instead of like this...
# layer_components_w1 = [] 
# layer_components_w2 = []
# layer_components_w3 = []
# layer_components_w4 = []
# layer_components_w5 = []

# layer_components_b1 = []
# layer_components_b2 = []
# layer_components_b3 = []
# layer_components_b4 = []
# layer_components_b5 = []

# for i in range(len(y)):
#     model_2.zero_grad()
#     y[i].backward(retain_graph=True)
#     #Get the tensors
#     w1_grad = model_2.d1.weight.grad.detach().numpy()
#     w2_grad = model_2.d2.weight.grad.detach().numpy()
#     w3_grad = model_2.d3.weight.grad.detach().numpy()
#     w4_grad = model_2.d4.weight.grad.detach().numpy()
#     w5_grad = model_2.d5.weight.grad.detach().numpy()
    
#     b1_grad = model_2.d1.bias.grad.detach().numpy()
#     b2_grad = model_2.d2.bias.grad.detach().numpy()
#     b3_grad = model_2.d3.bias.grad.detach().numpy()
#     b4_grad = model_2.d4.bias.grad.detach().numpy()
#     b5_grad = model_2.d5.bias.grad.detach().numpy()

#     #reshape and append. deep copy neccessary or else they are the same objects
#     layer_components_w1.append(w1_grad.reshape(-1).copy())
#     layer_components_w2.append(w2_grad.reshape(-1).copy())
#     layer_components_w3.append(w3_grad.reshape(-1).copy())
#     layer_components_w4.append(w4_grad.reshape(-1).copy())
#     layer_components_w5.append(w5_grad.reshape(-1).copy())
    
#     layer_components_b1.append(b1_grad.reshape(-1).copy())
#     layer_components_b2.append(b2_grad.reshape(-1).copy())
#     layer_components_b3.append(b3_grad.reshape(-1).copy())
#     layer_components_b4.append(b4_grad.reshape(-1).copy())
#     layer_components_b5.append(b5_grad.reshape(-1).copy())

# layer_components_w1 = np.array(layer_components_w1)
# layer_components_w2 = np.array(layer_components_w2)
# layer_components_w3 = np.array(layer_components_w3)
# layer_components_w4 = np.array(layer_components_w4)
# layer_components_w5 = np.array(layer_components_w5)

# layer_components_b1 = np.array(layer_components_b1)
# layer_components_b2 = np.array(layer_components_b2)
# layer_components_b3 = np.array(layer_components_b3)
# layer_components_b4 = np.array(layer_components_b4)
# layer_components_b5 = np.array(layer_components_b5)

# autograd_NTK = layer_components_w1 @ layer_components_w1.T+\
#     layer_components_w2 @ layer_components_w2.T+\
#     layer_components_w3 @ layer_components_w3.T+\
#     layer_components_w4 @ layer_components_w4.T+\
#     layer_components_w5 @ layer_components_w5.T+\
#     layer_components_b1 @ layer_components_b1.T+\
#     layer_components_b2 @ layer_components_b2.T+\
#     layer_components_b3 @ layer_components_b3.T+\
#     layer_components_b4 @ layer_components_b4.T+\
#     layer_components_b5 @ layer_components_b5.T

# autograd_NTK

# easy NTK

In [13]:
x_test = x_test.to('cpu')
x_5, x_4, x_3, x_2, x_1, x_0 = model(x_test)

# The five linear layers, first to last; every per-layer list below has one entry per layer.
layers = [model.d1, model.d2, model.d3, model.d4, model.d5]

# Detached weight matrix of each layer (torch tensors).
Ws = [layer.weight.detach() for layer in layers]

# Kernel-matrix placeholders: one separate single-element float32 zero tensor per layer.
Ks = [torch.tensor([0.0], dtype=torch.float32) for _ in layers]

# Input to each layer. Xs are shape (output x #DP); typical python notation is
# reversed, so each activation is transposed here.
Xs = [acts.T.detach() for acts in (x_0, x_1, x_2, x_3, x_4)]

# Used to create arrays -- kept as a plain integer list to play nice with compilers.
ds_int = [1] * len(layers)

# Used by the NTK formulation: one separate float32 tensor holding 1.0 per layer, on `device`.
ds_array = [
    torch.tensor([1.0], dtype=torch.float32).to(device) for _ in layers
]

# Convolution settings, all zero for this fully connected network.
filters = [0] * len(layers)
padding = [0] * len(layers)
strides = [0] * len(layers)

In [16]:
# Time a single CPU evaluation of the layerwise NTK and print the elapsed seconds.
# perf_counter() is the clock meant for measuring intervals: it is monotonic and
# high-resolution, whereas time.time() follows the wall clock and can jump.
now = time.perf_counter()
ntk_components = compute_NTK_CNN(Ws, Ks, Xs, ds_int, ds_array, strides, padding, layers, d_activationt, device="cpu")
print(time.perf_counter() - now)

RuntimeError: [enforce fail at CPUAllocator.cpp:67] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 40000000000 bytes. Error code 12 (Cannot allocate memory)

In [17]:
%%timeit
# Benchmark of the same call with device="cuda".
# NOTE(review): the inputs were assembled with device 'cpu' -- confirm that
# compute_NTK_CNN moves them to the requested device itself.
# NOTE(review): names assigned inside a %%timeit cell are, as far as I know, not kept
# in the notebook namespace, so later cells would still see the ntk_components
# produced by the separately timed CPU cell -- confirm.
ntk_components = compute_NTK_CNN(Ws, Ks, Xs, ds_int, ds_array, strides, padding, layers, d_activationt, device="cuda")

1.45 ms ± 2.37 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [41]:
# Full NTK: stack the per-component kernels along a new leading axis and sum over it.
NTK = torch.stack(ntk_components).sum(dim=0)

In [42]:
# Compare the autograd Gram matrix of the d4-weight gradients with ntk_components[2]
# at a relative tolerance of 1e-3. Needs layer_components_w4, which is only defined
# when the (commented-out) autograd NTK cell has been run.
np.allclose(
    np.matmul(layer_components_w4, layer_components_w4.T),
    ntk_components[2].cpu().numpy(),
    rtol=1e-3,
)

False

In [60]:
# Autograd reference: Gram matrix of the per-sample d4-bias gradients. Needs
# layer_components_b4 from the (commented-out) autograd NTK cell.
a = np.matmul(layer_components_b4, layer_components_b4.T)
a

array([[ 6.3095946 ,  0.11968824,  1.1234529 , ...,  2.032055  ,
         0.55126   ,  1.4060948 ],
       [ 0.11968824,  3.210734  ,  0.16980146, ...,  1.8690039 ,
         0.02743362,  0.8184704 ],
       [ 1.1234529 ,  0.16980146,  2.0710466 , ...,  0.13769273,
         0.22619286,  0.1258586 ],
       ...,
       [ 2.032055  ,  1.8690039 ,  0.13769273, ..., 12.222395  ,
         4.1873646 ,  0.6115483 ],
       [ 0.55126   ,  0.02743362,  0.22619286, ...,  4.1873646 ,
        10.260793  ,  1.1471453 ],
       [ 1.4060948 ,  0.8184704 ,  0.1258586 , ...,  0.6115483 ,
         1.1471453 ,  3.4726658 ]], dtype=float32)

In [59]:
# Component 3 of the layerwise NTK, brought to the CPU as a numpy array.
b = ntk_components[3].to('cpu').numpy()
b

array([[ 6.309475  ,  0.11969332,  1.1234163 , ...,  2.0322046 ,
         0.55130553,  1.4061674 ],
       [ 0.11969332,  3.2107909 ,  0.16979226, ...,  1.8690201 ,
         0.02743386,  0.81847763],
       [ 1.1234163 ,  0.16979226,  2.071053  , ...,  0.13768236,
         0.22619265,  0.12585686],
       ...,
       [ 2.0322046 ,  1.8690201 ,  0.13768236, ..., 12.222448  ,
         4.187412  ,  0.61153996],
       [ 0.55130553,  0.02743386,  0.22619265, ...,  4.187412  ,
        10.260914  ,  1.1472785 ],
       [ 1.4061674 ,  0.81847763,  0.12585686, ...,  0.61153996,
         1.1472785 ,  3.4728098 ]], dtype=float32)

In [63]:
# Fraction of entries of `a` and `b` that agree within a relative tolerance of 1e-2.
np.isclose(a, b, rtol=1e-2).sum() / a.size

1.0

In [69]:
# Fraction of entries of the summed layerwise NTK that agree with the autograd NTK
# within a relative tolerance of 1e-1. Needs autograd_NTK from the (commented-out)
# autograd NTK cell.
np.isclose(NTK.cpu().numpy(), autograd_NTK, rtol=1e-1).sum() / NTK.numel()

0.9988