In [1]:
import torch
from torch import nn
from scipy import stats

class EGNNProject(nn.Module):
    """"
    Implementation of embedding with EGNN(C=0)-level expressivity 

    Input is b x d x n point cloud
    sparse1,2 means using scipy for distribution
    delta is about applying exponential(-x/delta)

    """
    def __init__(self, dim=3, n=6,const=2, batch_size=16, delta = 1., exp=False):
        super().__init__()
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.delta = delta
        self.seed = 42
        self.dim = dim
        self.n = n
        self.batch_size = batch_size
        self.const = const #const of embedding const*d*n +1
        self.embed_dim = self.const*n*dim + 1
        self.dtype = torch.double
        self.exp=exp
        if torch.cuda.is_available():
          self.device=torch.device('cuda')
        else:
          self.device=torch.device('cpu')
        torch.manual_seed(42)
        self.w = nn.Linear(self.n, self.embed_dim)
        self.w.weight.retain_grad()
        self._init_weights(self.w)
        torch.manual_seed(42)
        self.W = nn.Linear(self.n, self.embed_dim)
        self.W.weight.retain_grad()
        self._init_weights(self.W)
        self.to(self.device, dtype=torch.double)
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight, gain=nn.init.calculate_gain('relu'))
            if module.bias is not None:
                module.bias.data.zero_()
    def forward(self, cloud):
        """
        Input is a b x d x n point cloud
        """
        #### create distance matrix + norms on diagonal
        cloud = cloud.transpose(dim0=-1, dim1=-2) # shape b x n x d
        dist_zero_diag = torch.cdist(cloud, cloud, p=2)
        norms = torch.linalg.norm(cloud, ord=None, axis=2, keepdims=False) #checked correct
        #### sort distances
        egnn_style, _ = torch.sort(dist_zero_diag, dim=1)
        ### add norms
        egnn_style[:,0, :] = norms #matrix with sorted distances from each node and node norm unsorted at top, size b x n x n
        if self.exp:
          egnn_style = torch.exp(torch.div((-1)*egnn_style, self.delta)) #exponential scaling
        #### project on 2nd+1 space
        #apply embedding
        sorted, _ = torch.sort( self.w(egnn_style.transpose(dim0=-1,dim1=-2)), dim=1)
        ## matrix multiplication
        embed = self.W(sorted.transpose(dim0=-1,dim1=-2))
        embed = torch.diagonal(embed, offset=0)
        return embed.t()

In [2]:
if torch.cuda.is_available():
  use_cuda=True
  device=torch.device('cuda')
else:
  use_cuda=False
  device=torch.device('cpu')
#first one from Pod. 2020 that egnn should separate
A = torch.tensor([[-2,0,-2],\
                  [2,0,2],\
                  [0,1,1],\
                  [1,1,0],\
                  [-1,-1,0] ], dtype=torch.double).to(device).reshape(1,3,5).expand(2,3,5)

B = torch.tensor([[-2,0,-2],\
                  [2,0,2],\
                  [0,1,-1],\
                  [1,1,0],\
                  [-1,-1,0] ], dtype=torch.double).to(device).reshape(1,3,5).expand(2,3,5)
egnn = EGNNProject(dim=3, n=5, batch_size=2, const=2, delta=-1/2., exp=True)
print(torch.norm(egnn(A)-egnn(B)))

tensor(15784.1585, device='cuda:0', dtype=torch.float64,
       grad_fn=<NormBackward1>)


In [3]:
C = torch.tensor([
    [-2,0,-2],
    [2,0,2],
    [1,1,0],
    [-1,-1,0],
    [1,2,0],
    [-1,2,0],
    [0,0,1]
], dtype=torch.double).to(device).reshape(1,3,7).expand(2,3,7)

D = torch.tensor([
    [-2,0,-2],
    [2,0,2],
    [1,1,0],
    [-1,-1,0],
    [1,2,0],
    [-1,2,0],
    [0,0,-1]
], dtype=torch.double).to(device).reshape(1,3,7).expand(2,3,7)
egnn = EGNNProject(dim=3, n=7, batch_size=2, const=2, delta=-1/2., exp=True)
print(torch.norm(egnn(C)-egnn(D)))

tensor(7775.1785, device='cuda:0', dtype=torch.float64,
       grad_fn=<NormBackward1>)


In [4]:
use_cuda

True

In [5]:
egnn = EGNNProject(dim=3, n=5, batch_size=2, const=2, delta=-1/2., exp=True)


In [6]:
#test permutation invariance
perm=torch.randperm(5)
torch.norm(egnn(A)-egnn(A[:,:,perm]))

tensor(0., device='cuda:0', dtype=torch.float64, grad_fn=<NormBackward1>)

In [7]:
from torch.nn.utils.parametrizations import orthogonal

orth_linear = orthogonal(nn.Linear(3, 3))
Q = orth_linear.weight.expand(1,3,3).to(device, dtype=torch.double)

torch.norm(egnn(A)-egnn(Q@A[:,:,perm]))

tensor(0.0015, device='cuda:0', dtype=torch.float64, grad_fn=<NormBackward1>)

Torch differentiation Profiler

In [8]:
with torch.autograd.profiler.profile(use_cuda=use_cuda) as prf:
    for _ in range(1000):
        egnn(A)
        

print(prf.key_averages().table(sort_by='self_cpu_time_total'))

STAGE:2022-12-22 10:38:57 2290886:2290886 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2022-12-22 10:38:59 2290886:2290886 ActivityProfilerController.cpp:300] Completed Stage: Collection


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             aten::sort        13.33%     188.435ms        27.61%     390.147ms     195.073us     142.784ms         9.71%     372.723ms     186.362us          2000  
                                           aten::matmul        10.76%     152.129ms        24.75%     349.857ms     174.929us     100.496ms         6.84%     358.406ms     179.203us          2000  
         

Look at backpropegation

In [9]:
from torch import nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(
            self,
            batch,
            dim,
            n,
            delta = 1.,
            const=2,
            exp=False,
            l1 = 500,
            l2 = 50,
            l3 = 10,
            l4 = 5,
            l5 = 5,
            l6 = 5,
            l7 = 5,
            l8 = 5
    ):
        super(Net, self).__init__()
        if torch.cuda.is_available():
          self.device = torch.device('cuda')
        else:
          self.device = torch.device('cpu')
        self.n =n
        self.dim = dim
        self.const = const
        self.embed = EGNNProject(dim=dim, n=n, batch_size=batch, const=const, exp=exp, delta=delta)
        self.l1 = torch.tensor(l1).to(dtype=torch.int32,device=self.device)
        self.l2 = torch.tensor(l2).to(dtype=torch.int32,device=self.device)
        self.l3 = torch.tensor(l3).to(dtype=torch.int32,device=self.device)
        self.l4 = torch.tensor(l4).to(dtype=torch.int32,device=self.device)
        self.l5 = torch.tensor(l5).to(dtype=torch.int32,device=self.device)
        self.l6 = torch.tensor(l6).to(dtype=torch.int32,device=self.device)
        self.l7 = torch.tensor(l7).to(dtype=torch.int32,device=self.device)
        self.l8 = torch.tensor(l8).to(dtype=torch.int32,device=self.device)
        
        self.bm1 = nn.BatchNorm1d(self.l1)
        self.bm2 = nn.BatchNorm1d(self.l2)
        self.bm3 = nn.BatchNorm1d(self.l3)
        self.bm3 = nn.BatchNorm1d(self.l3)
        self.bm4 = nn.BatchNorm1d(self.l4)
        self.bm5 = nn.BatchNorm1d(self.l5)
        self.bm6 = nn.BatchNorm1d(self.l6)
        self.bm7 = nn.BatchNorm1d(self.l7)
        self.bm8 = nn.BatchNorm1d(self.l8)

        self.do1 = nn.Dropout(p=0.5)
        self.do2 = nn.Dropout(p=0.5)
        self.do3 = nn.Dropout(p=0.5)
        self.do4 = nn.Dropout(p=0.5)
        self.do5 = nn.Dropout(p=0.5)
        self.do6 = nn.Dropout(p=0.5)
        self.do7 = nn.Dropout(p=0.5)
        self.do8 = nn.Dropout(p=0.5)

        self.nonlin = F.relu
        self.first = self.const*self.dim*self.n + 1
        self.dense0 = nn.Linear(self.first, self.l1)
        self.dense0.weight.retain_grad()
        #self._init_weights(self.dense0)
        self.dense1 = nn.Linear(self.l1, self.l2)
        self.dense1.weight.retain_grad()
        #self._init_weights(self.dense1)
        self.dense2 = nn.Linear(self.l2, self.l3)
        self.dense2.weight.retain_grad()
        #self.dense3 = nn.Linear(self.l3, self.l4)
        #self.dense4 = nn.Linear(self.l4, self.l5)
        #self.dense5 = nn.Linear(self.l5, self.l6)
        #self.dense6 = nn.Linear(self.l6, self.l7)
        #self.dense7 = nn.Linear(self.l7, self.l8)
        self.output = nn.Linear(self.l3, 2) #binary
        self.output.weight.retain_grad()
        self.softmax = nn.Softmax(dim=1)
        self.to(self.device, dtype=torch.double)
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight, gain=nn.init.calculate_gain('relu'))
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, X, **kwargs):
        X = self.embed(X)
        X = self.nonlin(self.dense0(X))
        X = self.nonlin(self.dense1(X))
        X = self.nonlin(self.dense2(X))
        #X = self.nonlin(self.do4(self.bm4(self.dense3(X))))
        #X = self.nonlin(self.do5(self.bm5(self.dense4(X))))
        #X = self.nonlin(self.do6(self.bm6(self.dense5(X))))
        #X = self.nonlin(self.do7(self.bm7(self.dense6(X))))
        #X = self.nonlin(self.do8(self.bm8(self.dense7(X))))
        X = self.output(X)
        X = self.softmax(X)
        return X

In [10]:
model = Net(batch=4, dim=3, n=6, const=5, exp=-1/2)

some_input = torch.randn(4,3,6,dtype=torch.double,device=device)
ideal_output = torch.randn(4,1,dtype=torch.double,device=device)*10

In [11]:
print(model.dense0.weight[0][0:10]) # just a small slice
print(model.dense0.weight.grad)

tensor([-0.0844,  0.0129, -0.0410, -0.0019, -0.0241,  0.0163,  0.0682, -0.0348,
         0.0840,  0.0828], device='cuda:0', dtype=torch.float64,
       grad_fn=<SliceBackward0>)
None


Let’s see how this changes when we run through one training batch. For a
loss function, we’ll just use the square of the Euclidean distance
between our ``prediction`` and the ``ideal_output``, and we’ll use a
basic stochastic gradient descent optimizer.




In [12]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

prediction = model(some_input)

loss = (ideal_output - prediction).pow(2).sum()
print(loss)

tensor(897.0519, device='cuda:0', dtype=torch.float64, grad_fn=<SumBackward0>)


Now, let’s call ``loss.backward()`` and see what happens:




In [13]:
loss.backward()
print(model.dense0.weight[0][0:10])
print(model.dense0.weight.grad[0][0:10])

tensor([-0.0844,  0.0129, -0.0410, -0.0019, -0.0241,  0.0163,  0.0682, -0.0348,
         0.0840,  0.0828], device='cuda:0', dtype=torch.float64,
       grad_fn=<SliceBackward0>)
tensor([ 1.5828e-05,  3.2103e-06, -2.5608e-06, -4.1997e-06,  1.1579e-06,
         1.0487e-06, -3.5487e-06, -3.0139e-06,  4.8274e-06, -2.0324e-05],
       device='cuda:0', dtype=torch.float64)


We can see that the gradients have been computed for each learning
weight, but the weights remain unchanged, because we haven’t run the
optimizer yet. The optimizer is responsible for updating model weights
based on the computed gradients.




In [14]:
optimizer.step()
print(model.dense0.weight[0][0:10])
print(model.dense0.weight.grad[0][0:10])

tensor([-0.0844,  0.0129, -0.0410, -0.0019, -0.0241,  0.0163,  0.0682, -0.0348,
         0.0840,  0.0828], device='cuda:0', dtype=torch.float64,
       grad_fn=<SliceBackward0>)
tensor([ 1.5828e-05,  3.2103e-06, -2.5608e-06, -4.1997e-06,  1.1579e-06,
         1.0487e-06, -3.5487e-06, -3.0139e-06,  4.8274e-06, -2.0324e-05],
       device='cuda:0', dtype=torch.float64)


You should see that ``layer2``\ ’s weights have changed.

One important thing about the process: After calling
``optimizer.step()``, you need to call ``optimizer.zero_grad()``, or
else every time you run ``loss.backward()``, the gradients on the
learning weights will accumulate:




In [15]:
print(model.dense0.weight.grad[0][0:10])
print(model.dense0.weight[0][0:10])

for i in range(0, 100):
    prediction = model(some_input)
    loss = (ideal_output - prediction).pow(2).sum()
    loss.backward()
    
print(model.dense0.weight.grad[0][0:10])
optimizer.step()
print(model.dense0.weight[0][0:10])
optimizer.zero_grad()

print(model.dense0.weight.grad[0][0:10])

tensor([ 1.5828e-05,  3.2103e-06, -2.5608e-06, -4.1997e-06,  1.1579e-06,
         1.0487e-06, -3.5487e-06, -3.0139e-06,  4.8274e-06, -2.0324e-05],
       device='cuda:0', dtype=torch.float64)
tensor([-0.0844,  0.0129, -0.0410, -0.0019, -0.0241,  0.0163,  0.0682, -0.0348,
         0.0840,  0.0828], device='cuda:0', dtype=torch.float64,
       grad_fn=<SliceBackward0>)
tensor([ 0.0016,  0.0003, -0.0003, -0.0004,  0.0001,  0.0001, -0.0004, -0.0003,
         0.0005, -0.0020], device='cuda:0', dtype=torch.float64)
tensor([-0.0844,  0.0129, -0.0410, -0.0019, -0.0241,  0.0163,  0.0682, -0.0348,
         0.0840,  0.0828], device='cuda:0', dtype=torch.float64,
       grad_fn=<SliceBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0',
       dtype=torch.float64)


In [16]:
print(model.embed.w.weight.grad[0][0:10])
print(model.embed.w.weight[0][0:10])

for i in range(0, 100):
    prediction = model(some_input)
    loss = (ideal_output - prediction).pow(2).sum()
    loss.backward()
    
print(model.dense0.weight.grad[0][0:10])
optimizer.step()
print(model.embed.w.weight[0][0:10])
optimizer.zero_grad()


tensor([0., 0., 0., 0., 0., 0.], device='cuda:0', dtype=torch.float64)
tensor([-0.1814, -0.3080, -0.0805,  0.0718, -0.3295,  0.3071], device='cuda:0',
       dtype=torch.float64, grad_fn=<SliceBackward0>)
tensor([ 1.1503e-03,  2.3224e-04, -1.8537e-04, -3.0438e-04,  8.5983e-05,
         8.0847e-05, -2.6324e-04, -2.1862e-04,  3.4867e-04, -1.4753e-03],
       device='cuda:0', dtype=torch.float64)
tensor([-0.1814, -0.3080, -0.0805,  0.0718, -0.3295,  0.3071], device='cuda:0',
       dtype=torch.float64, grad_fn=<SliceBackward0>)


Train Model

In [17]:
#@title Data generation

from math import ceil
from re import L
import torch
from torch import nn
import pickle
######################################################
###################### data  #########################
######################################################
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader

######################################################
###################### math ##########################
######################################################
import itertools as it
from math import factorial

from modulefinder import Module
import torch
from torch import nn
from torch.nn import Module
import pandas as pd
import numpy as np
from torch.nn.utils.parametrizations import orthogonal
import os

import torch
from torch import clamp
from torch.utils.data import Dataset

import matplotlib.pyplot as plt


from torch.utils.data import DataLoader
from modulefinder import Module
import torch
from torch import nn
from torch.nn import Module
import pandas as pd
import numpy as np
from torch.nn.utils.parametrizations import orthogonal
import os

import torch
from torch.utils.data import Dataset

import matplotlib.pyplot as plt


from torch.utils.data import DataLoader

class DatasetGen(Dataset):
    def __init__(self, size, d, n, cloud_1, cloud_2, ce=False, device = 'cpu') -> None:
        super(DatasetGen, self).__init__()
        self.ce = ce
        self.cloud_1 = cloud_1
        self.cloud_2 = cloud_2
        self.d = torch.tensor(d)
        self.n = torch.tensor(n)
        self.device = device
        self.size = torch.tensor(size).to(device=self.device)
        self.dataset = torch.zeros(self.size*2, self.d, self.n).to(device=self.device)
        self.label_set = torch.zeros(self.size*2, 2).to(device=self.device)
        self.labels()
        self.populate()
        self.centralize()
        self.addOrthogonal()
        self.addGaussian(0, 0.1)
        self.addPerm()

        #self.to(self.device)
    def labels(self) -> None:
        for idx in range(self.size):
            self.label_set[idx,:] = torch.tensor([1,0], dtype=torch.float32).to(device=self.device)
            self.label_set[self.size + idx,:] = torch.tensor([0,1], dtype=torch.float32).to(device=self.device)     
    ###############################################################
    ##############initialize dataset ##############################
    ###############################################################
    def populate(self) -> None:
        for idx in range(self.size):
            self.dataset[idx,:,:] = torch.clone(self.cloud_1).to(device=self.device)
            self.dataset[self.size + idx,:,:] = torch.clone(self.cloud_2).to(device=self.device)
    ###############################################################
    #################centralize####################################
    ###############################################################
    def centralize(self) -> None:
        onesis = torch.ones(self.n, self.n, device = self.device)
        for idx in range(self.size):
            self.dataset[idx,:,:] -= torch.div(1, clamp(self.n, min=1))*torch.matmul(self.dataset[idx,:,:] , onesis).to(device=self.device)
            self.dataset[self.size + idx,:,:] -= torch.div(1, clamp(self.n,min=1))*torch.matmul(self.dataset[self.size + idx,:,:], onesis).to(device=self.device)      
            if idx == 999:
                print(self.dataset[idx,:,:])
                print(self.dataset[self.size + idx])
    ###############################################################
    ##############add Gaussian noise ##############################
    ###############################################################
    def addGaussian(self, mean, std) -> None:
        for idx in range(self.size):
           self.dataset[idx,:,:] += (torch.randn(self.d, self.n)*std + mean).to(device=self.device)
           self.dataset[self.size + idx,:,:] += (torch.randn(self.d, self.n)*std + mean).to(device=self.device)
    ###############################################################
    ##############add orthogonal transformation ###################
    ###############################################################
    def addOrthogonal(self) -> None :
        for idx in range(self.size):
           ortho = orthogonal(nn.Linear(self.d, self.d)).weight.to(device=self.device) #new mapping for each index
           self.dataset[idx,:,:] = torch.matmul(ortho, self.dataset[idx,:,:]).to(device=self.device)
           ortho = orthogonal(nn.Linear(self.d, self.d)).weight.to(device=self.device) #new mapping for each index           
           self.dataset[self.size + idx,:,:] = torch.matmul(ortho, self.dataset[self.size + idx,:,:]).to(device=self.device)
    ##################################################################
    #####################permute######################################
    ##################################################################
    def addPerm(self) -> None:
        for idx in range(self.size):
            index_list = torch.randperm(self.n).to(device=self.device)
            self.dataset[idx,:,:] =self.dataset[idx,:,index_list].clone().to(device=self.device)
            index_list = torch.randperm(self.n).to(device=self.device)
            self.dataset[self.size + idx,:,:] = self.dataset[self.size + idx,:,index_list] .clone().to(device=self.device)


    ###############################################################
    ####################Modify for dataloader #####################
    ###############################################################
    def __len__(self):
        return len(self.label_set)
    def __getitem__(self, idx):
        data = self.dataset[idx,:,:].clone().detach().to(device=self.device)
        labels = self.label_set[idx].clone().detach().to(device=self.device)
        data = torch.squeeze(data).to(device=self.device)
        return data, labels




In [18]:
#@title Train function

#example of equal sets of sets of distances https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.125.166001

C = torch.tensor([
    [-2,0,-2],
    [2,0,2],
    [1,1,0],
    [-1,-1,0],
    [1,2,0],
    [-1,2,0],
    [0,0,1]
])

D = torch.tensor([
    [-2,0,-2],
    [2,0,2],
    [1,1,0],
    [-1,-1,0],
    [1,2,0],
    [-1,2,0],
    [0,0,-1]
])
    
def train(epoch, loader, model, loss_fun, lr_scheduler, optimizer, batch_size=1,device='cuda:0', partition = 'train'):
    torch.manual_seed(42)

    res = {'loss': 0, 'counter': 0, 'loss_arr':[], 'accuracy': 0, 'acc_list' : []}
    for i, data in enumerate(loader):
        if partition == 'train':
            model.train() 
            optimizer.zero_grad()

        else:
            model.eval()

        #get data
        label = data[1].to(device)
        label = label.squeeze().to(device)
        data_now = data[0].to(device, dtype=torch.double)
        #predict
        pred = model(data_now).to(device)

        if partition == 'train':

            loss = loss_fun(pred.squeeze().to(torch.float), label.squeeze().to(torch.float))
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
        else:
            loss = loss_fun(pred.squeeze().to(torch.float), label.squeeze().to(torch.float))
        
        res['loss'] += loss.item() * batch_size
        res['counter'] += batch_size
        res['loss_arr'].append(loss.item())
        prefix = ""
        if partition != 'train':
            prefix = ">> %s \t" % partition
        log_interval = 10
        if i % log_interval == 0:
            print(prefix + "Epoch %d \t Iteration %d \t loss %.4f" % (epoch, i, sum(res['loss_arr'][-10:])/len(res['loss_arr'][-10:])))
    return res['loss'] / res['counter'], res['loss_arr']



In [19]:
#@title Point cloud dataset generation
cloud1= C
cloud2=D
dimension, n = 3, 7
batch_size = 1
cloud_1 = torch.clone(cloud1).to(device)
cloud_2 = torch.clone(cloud2).to(device)

train_len = 20000
test_len = 500
dataset_train = DatasetGen( train_len, dimension, n, torch.t(cloud_1), torch.t(cloud_2), ce = True )
dataset_test = DatasetGen( test_len, dimension, n, torch.t(cloud_1), torch.t(cloud_2), ce = True )
dataset_eval = DatasetGen( test_len, dimension, n, torch.t(cloud_1), torch.t(cloud_2), ce= True )

dataloader_train = DataLoader( dataset_train,  batch_size=batch_size, shuffle=True)
dataloader_eval = DataLoader( dataset_eval,  batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader( dataset_test,  batch_size=batch_size, shuffle=True)


tensor([[-2.0000,  2.0000,  1.0000, -1.0000,  1.0000, -1.0000,  0.0000],
        [-0.5714, -0.5714,  0.4286, -1.5714,  1.4286,  1.4286, -0.5714],
        [-2.1429,  1.8571, -0.1429, -0.1429, -0.1429, -0.1429,  0.8571]])
tensor([[-2.0000,  2.0000,  1.0000, -1.0000,  1.0000, -1.0000,  0.0000],
        [-0.5714, -0.5714,  0.4286, -1.5714,  1.4286,  1.4286, -0.5714],
        [-1.8571,  2.1429,  0.1429,  0.1429,  0.1429,  0.1429, -0.8571]])


In [20]:
def main():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    torch.cuda.device(device)
    epochs = 10
    #test_interval = 10


    dataloaders = { 'train' : dataloader_train, 'test' : dataloader_test, 'valid' : dataloader_eval}
    torch.manual_seed(42)
    model = Net(batch=1, dim=3, n=7, const=5, exp=True, delta=-1/2)
    lr = 2e-4
    wd = 1e-6
    #iterations = 1000
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    loss_fun = torch.nn.BCELoss(weight=None, size_average=None, reduce=None, reduction='mean')
    res = {'epochs': [], 'losess': [], 'accuracies': [],'best_val': 1e10, 'best_test': 1e10, 'best_epoch': 0}

    for epoch in range(0, epochs):
        _, _ = train(epoch, dataloaders['train'], model, loss_fun, lr_scheduler, optimizer, device=device, batch_size=batch_size,partition = 'train')
        val_loss,  _ = train(epoch, dataloaders['valid'], model, loss_fun, lr_scheduler, optimizer, device=device,  batch_size=batch_size,partition = 'valid')
        test_loss , _ = train(epoch, dataloaders['test'], model, loss_fun, lr_scheduler, optimizer, device=device,  batch_size=batch_size,partition = 'test')
        print("Epoch :{}, val loss: {}, test loss : {} ".format(epoch, val_loss, test_loss))
        if val_loss < res['best_val']:
            res['best_val'] = val_loss
            res['best_test'] = test_loss
            res['best_epoch'] = epoch
        print("Best: val loss: %.4f \t test loss: %.4f \t epoch %d" % (res['best_val'], res['best_test'], res['best_epoch']))
        


In [None]:
with torch.autograd.profiler.profile(use_cuda=use_cuda) as prf:
    main()
        

print(prf.key_averages().table(sort_by='self_cpu_time_total'))

STAGE:2022-12-22 10:39:46 2290886:2290886 ActivityProfilerController.cpp:294] Completed Stage: Warm Up


Epoch 0 	 Iteration 0 	 loss 100.0000
Epoch 0 	 Iteration 10 	 loss 50.0000
Epoch 0 	 Iteration 20 	 loss 50.0000
Epoch 0 	 Iteration 30 	 loss 40.0000
Epoch 0 	 Iteration 40 	 loss 40.0000
Epoch 0 	 Iteration 50 	 loss 50.0000
Epoch 0 	 Iteration 60 	 loss 40.0000
Epoch 0 	 Iteration 70 	 loss 40.0000
Epoch 0 	 Iteration 80 	 loss 70.0000
Epoch 0 	 Iteration 90 	 loss 30.0000
Epoch 0 	 Iteration 100 	 loss 50.0000
Epoch 0 	 Iteration 110 	 loss 40.0000
Epoch 0 	 Iteration 120 	 loss 40.0000
Epoch 0 	 Iteration 130 	 loss 69.7800
Epoch 0 	 Iteration 140 	 loss 60.0000
Epoch 0 	 Iteration 150 	 loss 48.1004
Epoch 0 	 Iteration 160 	 loss 56.7151
Epoch 0 	 Iteration 170 	 loss 28.8418
Epoch 0 	 Iteration 180 	 loss 39.5621
Epoch 0 	 Iteration 190 	 loss 47.5027
Epoch 0 	 Iteration 200 	 loss 40.0000
Epoch 0 	 Iteration 210 	 loss 46.4463
Epoch 0 	 Iteration 220 	 loss 46.2764
Epoch 0 	 Iteration 230 	 loss 35.7765
Epoch 0 	 Iteration 240 	 loss 32.1720
Epoch 0 	 Iteration 250 	 loss 12.9

[W profiler_kineto.cpp:441] Failed to record CUDA event. /opt/conda/conda-bld/pytorch_1666642969563/work/torch/csrc/profiler/cuda.cpp:44: out of memory
[W collection.cpp:246] Failed to record CUDA event. /opt/conda/conda-bld/pytorch_1666642969563/work/torch/csrc/profiler/cuda.cpp:44: out of memory
[W profiler_kineto.cpp:441] Failed to record CUDA event. /opt/conda/conda-bld/pytorch_1666642969563/work/torch/csrc/profiler/cuda.cpp:44: out of memory
[W profiler_kineto.cpp:441] Failed to record CUDA event. /opt/conda/conda-bld/pytorch_1666642969563/work/torch/csrc/profiler/cuda.cpp:44: out of memory
[W profiler_kineto.cpp:441] Failed to record CUDA event. /opt/conda/conda-bld/pytorch_1666642969563/work/torch/csrc/profiler/cuda.cpp:44: out of memory
[W profiler_kineto.cpp:441] Failed to record CUDA event. /opt/conda/conda-bld/pytorch_1666642969563/work/torch/csrc/profiler/cuda.cpp:44: out of memory
