In [3]:
####################################################################################
##Orvieto A, Raj A, Kersting H, Bach F. Explicit regularization in overparametrized#
##models via noise injection. 2023, arXiv, Available: https://arxiv.org/pdf/2206.###
###https://github.com/aorvieto/noise_injection_overparam ###########################
####################################################################################

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import time
import torchvision.models as models
from torchvision import datasets, transforms
from torchvision.datasets import CIFAR100
import copy
import math
import numpy as np
import random
import matplotlib.pyplot as plt
#from utils import flat_params, compute_jacobian, get_lr
#from models import CIFARCNN2, CIFARCNN3, MNISTNet1, MNISTNet2, MNISTNet3, CIFARCNN1, vgg11_bn, ResNet34, ResNet18
from datetime import datetime
from pyhessian import hessian
import torchvision.transforms as tt
from tqdm import tqdm

In [4]:
# Flatten all model parameters into a single 1-D tensor
def flat_params(m):
    """Return every parameter of ``m`` concatenated into one flat 1-D tensor.

    Parameters are visited in ``m.parameters()`` order and each one is
    reshaped with ``view(-1)`` before concatenation.
    """
    return torch.cat([weights.data.view(-1) for weights in m.parameters()])

# L2 norm of the gradient
def grad_norm(model):
    """Return the global L2 norm of all parameter gradients of ``model``.

    The squared L2 norms of every ``p.grad`` are summed and the square root
    of the total is returned as a 0-dim CPU tensor, so callers can use
    ``.numpy()`` / ``.item()`` on the result.

    Parameters whose ``grad`` is ``None`` (no backward pass yet, or not part
    of the graph) are skipped instead of raising.
    """
    squared_sum = 0.0
    for p in model.parameters():
        if p.grad is None:
            continue
        squared_sum += p.grad.detach().norm(2).item() ** 2
    # The accumulator is a Python float, so wrap it in a tensor explicitly;
    # calling .cpu() on a float (as the previous version did) raises.
    return torch.tensor(squared_sum ** 0.5)

# Zero the gradient
def zero_gradients(x):
    """Reset ``x.grad`` to zero in place; do nothing when ``x`` has no gradient yet."""
    grad = x.grad
    if grad is None:
        return
    grad.zero_()

# Get the current learning rate
def get_lr(optimizer):
    """Return the ``'lr'`` of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

# Compute the Jacobian matrix
def compute_jacobian(inputs, output):
    """
    Compute the Jacobian of ``output`` with respect to ``inputs`` using one
    backward pass per output class.

    :param inputs: Batch X Size (e.g. Depth X Width X Height); must have
        ``requires_grad=True`` and be the tensor ``output`` was computed from
    :param output: Batch X Classes
    :return: jacobian: Batch X Classes X Size

    Side effects: ``inputs.grad`` is overwritten (it holds the gradient of the
    last class on return), and every other leaf tensor in the graph has these
    backward passes accumulated into its ``.grad``.
    """
    assert inputs.requires_grad

    num_classes = output.size()[1]

    # Allocate the buffers directly on the tensors' own device/dtype.
    jacobian = torch.zeros(num_classes, *inputs.size(), dtype=inputs.dtype, device=inputs.device)
    grad_output = torch.zeros(*output.size(), dtype=output.dtype, device=output.device)

    for i in range(num_classes):
        # Clear the gradient left by the previous class so each backward pass
        # yields d output[:, i] / d inputs on its own.
        if inputs.grad is not None:
            inputs.grad.zero_()
        grad_output.zero_()
        grad_output[:, i] = 1
        # `retain_graph` (formerly `retain_variables`) keeps the graph alive
        # for the remaining classes.
        output.backward(grad_output, retain_graph=True)
        jacobian[i] = inputs.grad.data

    return torch.transpose(jacobian, dim0=0, dim1=1)

In [5]:
#### Network architectures
#### Fully connected nets, CNNs, VGG, and ResNet


class MNISTNet1(nn.Module):
    """MLP for 28x28 inputs: three hidden layers of 500 units, 10 outputs.

    ``forward`` flattens the input to ``(-1, 784)`` and returns
    log-probabilities (``log_softmax`` over dim 1).
    """

    def __init__(self):
        super().__init__()
        width = 500
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=width)
        self.fc2 = nn.Linear(in_features=width, out_features=width)
        self.fc3 = nn.Linear(in_features=width, out_features=width)
        self.fc4 = nn.Linear(in_features=width, out_features=10)

    def forward(self, x):
        hidden = x.view(-1, 28 * 28)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        logits = self.fc4(hidden)
        return F.log_softmax(logits, dim=1)

class MNISTNet2(nn.Module):
    """Wide MLP for 28x28 inputs: three hidden layers of 5000 units, 10 outputs.

    ``forward`` flattens the input to ``(-1, 784)`` and returns
    log-probabilities (``log_softmax`` over dim 1).
    """

    def __init__(self):
        super().__init__()
        width = 5000
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=width)
        self.fc2 = nn.Linear(in_features=width, out_features=width)
        self.fc3 = nn.Linear(in_features=width, out_features=width)
        self.fc4 = nn.Linear(in_features=width, out_features=10)

    def forward(self, x):
        hidden = x.view(-1, 28 * 28)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        logits = self.fc4(hidden)
        return F.log_softmax(logits, dim=1)

class MNISTNet3(nn.Module):
    """Deeper MLP for 28x28 inputs: five hidden layers of 1000 units, 10 outputs.

    ``forward`` flattens the input to ``(-1, 784)`` and returns
    log-probabilities (``log_softmax`` over dim 1).
    """

    def __init__(self):
        super().__init__()
        width = 1000
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=width)
        self.fc2 = nn.Linear(in_features=width, out_features=width)
        self.fc3 = nn.Linear(in_features=width, out_features=width)
        self.fc4 = nn.Linear(in_features=width, out_features=width)
        self.fc5 = nn.Linear(in_features=width, out_features=width)
        self.fc6 = nn.Linear(in_features=width, out_features=10)

    def forward(self, x):
        hidden = x.view(-1, 28 * 28)
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            hidden = F.relu(layer(hidden))
        logits = self.fc6(hidden)
        return F.log_softmax(logits, dim=1)

class CIFARCNN1(nn.Module):
    """Small LeNet-style CNN: two 5x5 conv + max-pool stages, then three linear layers.

    ``fc1`` expects 16*5*5 flattened features, which is what the two
    conv/pool stages produce for 3x32x32 inputs. Returns raw logits for
    10 classes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        # Convolutional feature extractor: conv -> ReLU -> 2x2 max-pool, twice.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Flatten everything except the batch dimension.
        features = torch.flatten(x, start_dim=1)
        features = F.relu(self.fc1(features))
        features = F.relu(self.fc2(features))
        return self.fc3(features)

class CIFARCNN2(nn.Module):
    """Four 3x3 conv + max-pool stages, global average pooling, one linear classifier.

    Returns raw logits for 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        # Each stage: conv -> ReLU -> 2x2 max-pool (halves the spatial size).
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = self.pool(F.relu(conv(x)))
        # Global average over the two spatial dimensions -> (batch, 128).
        pooled = x.mean(dim=(2, 3))
        return self.fc1(pooled)

class CIFARCNN3(nn.Module):
    """Four 3x3 conv + max-pool stages, global average pooling, and a wide
    (5000-unit) hidden linear layer before the 10-way classifier.

    Returns raw logits for 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1 = nn.Conv2d(3, 32, 3, padding = 1)
        self.conv2 = nn.Conv2d(32, 128, 3, padding = 1)
        self.conv3 = nn.Conv2d(128, 128, 3, padding = 1)
        self.conv4 = nn.Conv2d(128, 128, 3, padding = 1)
        self.fc1 = nn.Linear(128, 5000)
        self.fc2 = nn.Linear(5000, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.pool(F.relu(self.conv4(x)))
        # Global average over the two spatial dimensions -> (batch, 128).
        x = x.mean(dim=[2,3])
        x = F.relu(self.fc1(x))
        # Return raw logits: a ReLU here would clamp the class scores to be
        # non-negative (and zero their gradients) before the cross-entropy
        # loss. The sibling CIFARCNN1/CIFARCNN2 also return unactivated logits.
        return self.fc2(x)

### CIFAR 100 stuff ###
# VGG layer plans keyed by variant letter. Every plan has five stages with
# widths 64, 128, 256, 512, 512; a stage is `n` conv widths followed by 'M'
# (max-pool). Only the number of convs per stage differs between variants,
# e.g. 'A' -> [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'].
cfg = {
    variant: [
        entry
        for width, n_convs in zip((64, 128, 256, 512, 512), convs_per_stage)
        for entry in [width] * n_convs + ['M']
    ]
    for variant, convs_per_stage in (
        ('A', (1, 1, 2, 2, 2)),
        ('B', (2, 2, 2, 2, 2)),
        ('D', (2, 2, 3, 3, 3)),
        ('E', (2, 2, 4, 4, 4)),
    )
}

class VGG(nn.Module):
    """VGG-style classifier: a caller-supplied feature extractor followed by a
    three-layer fully connected head (512 -> 4096 -> 4096 -> ``num_class``)
    with ReLU and dropout between the linear layers.

    The head's first layer takes 512 inputs, so ``features`` must produce
    512 values per sample once flattened.
    """

    def __init__(self, features, num_class=100):
        super().__init__()
        self.features = features

        head = [
            nn.Linear(512, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_class),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        feature_maps = self.features(x)
        # Collapse everything but the batch dimension before the linear head.
        flattened = feature_maps.view(feature_maps.size(0), -1)
        return self.classifier(flattened)

def make_layers(cfg, batch_norm=False):
    """Build the convolutional feature extractor described by a layer plan.

    :param cfg: sequence whose entries are either ``'M'`` (append a 2x2,
        stride-2 max-pool) or an int (append a 3x3, padding-1 conv with that
        many output channels, followed by ReLU)
    :param batch_norm: when true, insert ``BatchNorm2d`` between each conv
        and its ReLU
    :return: ``nn.Sequential`` of the generated layers; the first conv takes
        3 input channels
    """
    modules = []
    in_channels = 3

    for spec in cfg:
        if spec == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        else:
            modules.append(nn.Conv2d(in_channels, spec, kernel_size=3, padding=1))
            if batch_norm:
                modules.append(nn.BatchNorm2d(spec))
            modules.append(nn.ReLU(inplace=True))
            # The next conv consumes what this one produced.
            in_channels = spec

    return nn.Sequential(*modules)

def vgg11_bn():
    """VGG with batch norm built from layer plan ``cfg['A']``."""
    feature_extractor = make_layers(cfg['A'], batch_norm=True)
    return VGG(feature_extractor)

def vgg13_bn():
    """VGG with batch norm built from layer plan ``cfg['B']``."""
    feature_extractor = make_layers(cfg['B'], batch_norm=True)
    return VGG(feature_extractor)

def vgg16_bn():
    """VGG with batch norm built from layer plan ``cfg['D']``."""
    feature_extractor = make_layers(cfg['D'], batch_norm=True)
    return VGG(feature_extractor)

def vgg19_bn():
    """VGG with batch norm built from layer plan ``cfg['E']``."""
    feature_extractor = make_layers(cfg['E'], batch_norm=True)
    return VGG(feature_extractor)

##### ResNet for CIFAR

class BasicBlock(nn.Module):
    """Residual block with two 3x3 conv + batch-norm layers.

    The skip connection is the identity unless the block changes the spatial
    stride or the channel count, in which case it is a 1x1 conv + batch norm
    that matches the main path's output shape.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            # Empty Sequential acts as the identity.
            self.shortcut = nn.Sequential()

    def forward(self, x):
        main = F.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        main += self.shortcut(x)
        return F.relu(main)


class Bottleneck(nn.Module):
    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convolutions, each followed
    by batch norm; the last conv expands to ``expansion * planes`` channels.

    The skip connection is the identity unless the block changes the spatial
    stride or the channel count, in which case it is a 1x1 conv + batch norm.

    :param in_planes: number of input channels
    :param planes: width of the two inner convolutions
    :param stride: stride of the 3x3 conv (and of the projection shortcut)
    :param he_init: when true, re-initialise every conv weight in the block
        from N(0, 2 / (k_h * k_w * out_channels)). Defaults to ``False``,
        which keeps PyTorch's default initialisation (the previous behaviour,
        where this code path was disabled with ``if False:``).
    """
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, he_init=False):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        # Empty Sequential acts as the identity shortcut.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )
        if he_init:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                    m.weight.data.normal_(0, math.sqrt(2. / n))

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    """ResNet with a 3x3 stem and four residual stages (64/128/256/512 planes).

    The forward pass ends with a fixed 4x4 average pool followed by a linear
    layer expecting ``512 * block.expansion`` features, which matches 32x32
    inputs (stages 2-4 each halve the resolution).

    :param block: residual block class; it must expose an ``expansion`` class
        attribute and accept ``(in_planes, planes, stride)``
    :param num_blocks: four ints, the number of blocks in each stage
    :param num_classes: size of the output layer
    :param he_init: when true, re-initialise every conv weight in the network
        from N(0, 2 / (k_h * k_w * out_channels)). Defaults to ``False``,
        which keeps PyTorch's default initialisation (the previous behaviour,
        where this code path was disabled with ``if False:``).
    """

    def __init__(self, block, num_blocks, num_classes=10, he_init=False):
        super(ResNet, self).__init__()
        # Running channel count; _make_layer advances it as blocks are added.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

        if he_init:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                    m.weight.data.normal_(0, math.sqrt(2. / n))

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``, the rest use 1."""
        block_strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for block_stride in block_strides:
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out_final = self.linear(out)
        return out_final
    
def ResNet18():
    """ResNet of BasicBlocks with stage depths [2, 2, 2, 2]."""
    return ResNet(block=BasicBlock, num_blocks=[2, 2, 2, 2])

def ResNet34():
    """ResNet of BasicBlocks with stage depths [3, 4, 6, 3]."""
    return ResNet(block=BasicBlock, num_blocks=[3, 4, 6, 3])

def ResNet50():
    """ResNet of Bottleneck blocks with stage depths [3, 4, 6, 3]."""
    return ResNet(block=Bottleneck, num_blocks=[3, 4, 6, 3])

def ResNet101():
    """ResNet of Bottleneck blocks with stage depths [3, 4, 23, 3]."""
    return ResNet(block=Bottleneck, num_blocks=[3, 4, 23, 3])

def ResNet152():
    """ResNet of Bottleneck blocks with stage depths [3, 8, 36, 3]."""
    return ResNet(block=Bottleneck, num_blocks=[3, 8, 36, 3])

def ResNet200():
    """ResNet of Bottleneck blocks with stage depths [3, 24, 36, 3]."""
    return ResNet(block=Bottleneck, num_blocks=[3, 24, 36, 3])

def ResNet270():
    """ResNet of Bottleneck blocks with stage depths [3, 36, 48, 3]."""
    return ResNet(block=Bottleneck, num_blocks=[3, 36, 48, 3])

def ResNet336():
    """ResNet of Bottleneck blocks with stage depths [3, 44, 62, 3]."""
    return ResNet(block=Bottleneck, num_blocks=[3, 44, 62, 3])

def ResNet500():
    """ResNet of Bottleneck blocks with stage depths [3, 70, 90, 3]."""
    return ResNet(block=Bottleneck, num_blocks=[3, 70, 90, 3])

In [6]:
def _make_loader(dataset, batch_size, shuffle=False, sampler=None, num_workers=8):
    """Create a DataLoader with the pinned-memory / persistent-worker options used by train_net."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=True,
        persistent_workers=True,
    )


def _make_subset_loader(dataset, subset_size, batch_size):
    """Create a randomly-sampled loader over ``subset_size`` random examples of ``dataset``."""
    indices = random.sample(range(dataset.data.shape[0]), subset_size)
    subset = torch.utils.data.Subset(dataset, indices)
    return _make_loader(subset, batch_size, sampler=torch.utils.data.RandomSampler(subset))


def _wrap_model(net_name, factories, dataset_name, device):
    """Instantiate the network called ``net_name`` and wrap it in DataParallel on ``device``."""
    if net_name not in factories:
        raise ValueError(
            "model not defined: net '{}' is not available for dataset '{}' (choose from {})".format(
                net_name, dataset_name, sorted(factories)))
    return torch.nn.DataParallel(factories[net_name]()).to(device)


def _setup_data_and_model(settings, device):
    """Build (train_loader, validation_loader, hess_loader, model) for the requested dataset/net."""
    dataset_name = settings["dataset"]
    bs = settings["bs"]

    if dataset_name == "MNIST":
        train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
        validation_dataset = datasets.MNIST('./data', train=False, download=True, transform=transforms.ToTensor())
        train_loader = _make_loader(train_dataset, bs, shuffle=True)
        validation_loader = _make_loader(validation_dataset, bs)
        # Previously undefined for MNIST; reuse the training loader as FMNIST does.
        hess_loader = train_loader
        factories = {"MLP1": MNISTNet1, "MLP2": MNISTNet2, "MLP3": MNISTNet3}

    elif dataset_name == "FMNIST":
        train_dataset = datasets.FashionMNIST('./data', download=True, train=True, transform=transforms.ToTensor())
        validation_dataset = datasets.FashionMNIST('./data', download=True, train=False, transform=transforms.ToTensor())
        # Train on a random subset of settings["subset"] examples.
        train_loader = _make_subset_loader(train_dataset, settings["subset"], bs)
        validation_loader = _make_loader(validation_dataset, bs)
        hess_loader = train_loader
        factories = {"MLP1": MNISTNet1, "MLP2": MNISTNet2, "MLP3": MNISTNet3}

    elif dataset_name == "CIFAR":
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])
        train_dataset = torchvision.datasets.CIFAR10(root='./data', download=True, train=True, transform=transform_train)
        validation_dataset = torchvision.datasets.CIFAR10(root='./data', download=True, train=False, transform=transform_test)
        # Training uses the full training set; only the Hessian loader is subsampled.
        train_loader = _make_loader(train_dataset, bs, shuffle=True, num_workers=2)
        hess_loader = _make_subset_loader(train_dataset, int(settings["subset"]/10), bs)
        validation_loader = _make_loader(validation_dataset, bs)
        factories = {
            "CNN1": CIFARCNN1,
            "CNN2": CIFARCNN2,
            "CNN3": CIFARCNN3,
            "CIFAR10Res18": ResNet18,
            "CIFAR10Res34": ResNet34,
        }

    elif dataset_name == "CIFAR100":
        stats = ((0.5074,0.4867,0.4411),(0.2011,0.1987,0.2025))
        train_transform = tt.Compose([tt.RandomHorizontalFlip(),tt.RandomCrop(32,padding=4,padding_mode="reflect"),tt.ToTensor(), tt.Normalize(*stats)])
        test_transform = tt.Compose([tt.ToTensor(),tt.Normalize(*stats)])
        train_dataset = CIFAR100(download=True,root="./data",transform=train_transform)
        test_data = CIFAR100(root="./data",train=False,transform=test_transform)
        train_loader = _make_loader(train_dataset, bs, shuffle=True)
        hess_loader = _make_subset_loader(train_dataset, int(settings["subset"]/25), bs)
        validation_loader = _make_loader(test_data, bs)
        factories = {"CIFAR100vgg": vgg11_bn}

    else:
        raise ValueError("dataset not defined: '{}'".format(dataset_name))

    model = _wrap_model(settings["net"], factories, dataset_name, device)
    return train_loader, validation_loader, hess_loader, model


def _evaluate(model, loader, criterion, device):
    """Return (mean per-batch loss, accuracy in percent) of ``model`` over ``loader``.

    The loss is a Python float (each batch loss weighted by 1/len(loader));
    the accuracy is a 0-dim float32 tensor, as stored by earlier versions.
    """
    total_loss, correct = 0, 0
    # No gradients are needed for statistics; skipping the graph saves memory.
    with torch.no_grad():
        for batch in loader:
            data = batch[0].to(device, non_blocking=True)
            target = batch[1].to(device, non_blocking=True)
            output = model(data)
            total_loss += criterion(output, target).item()/len(loader)
            pred = output.max(1)[1] # index of the largest class score
            correct += pred.eq(target).cpu().sum()
    accuracy = 100. * torch.as_tensor(correct).to(torch.float32) / len(loader.dataset)
    return total_loss, accuracy


def train_net(settings):
    """Train a network with optional Gaussian noise injection into the weights.

    At every training step (while noise is active) the weights are perturbed,
    the gradient is computed at the perturbed point, the original weights are
    restored, and the optimizer step is applied to the original weights.

    Required ``settings`` keys:
        "dataset":   "MNIST", "FMNIST", "CIFAR" or "CIFAR100"
        "net":       network name valid for the dataset (e.g. "MLP1", "CNN2",
                     "CIFAR10Res18", "CIFAR100vgg")
        "subset":    size used for random subsets (FMNIST training set;
                     divided by 10 / 25 for the CIFAR / CIFAR100 Hessian loader)
        "bs":        batch size
        "optimizer": "SGD" (momentum 0.9, weight decay 5e-4) or "Adam"
        "lr":        learning rate
        "scheduler": truthy to use cosine annealing over "epochs"
        "noise":     "no", "all" (every tensor perturbed with std
                     sigma/sqrt(n_groups)) or "layer" (one tensor per step,
                     chosen round-robin, perturbed with std sigma)
        "sigma":     noise scale
        "epochs":    number of epochs
        "rec_step":  statistics are recorded every "rec_step" epochs
    Optional keys:
        "noise_epochs":     noise is injected only while epoch < this value
                            (default 135)
        "compute_hessian":  when truthy, estimate the Hessian trace with
                            PyHessian at the last epoch (default False)

    :return: dict of lists with keys "rec_steps", "train_loss", "test_loss",
        "hess", "test_acc", "train_acc", "hess_trace", "l1_norm", "l2_norm",
        "grad_norm" ("hess_trace" and "grad_norm" are kept but never populated)
    :raises ValueError: for an unknown dataset, net or optimizer
    """

    ########### Setting Up Device ###########
    # Falls back to the CPU when no GPU is visible.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    ########### Setup Data and Model ###########
    train_loader, validation_loader, hess_loader, model = _setup_data_and_model(settings, device)
    criterion = nn.CrossEntropyLoss()

    ########### Setup Optimizer ###########
    if settings["optimizer"]=="SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=settings["lr"],momentum=0.9, weight_decay=5e-4)
    elif settings["optimizer"]=="Adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=settings["lr"])
    else:
        raise ValueError("method not defined!! optimizer '{}'".format(settings["optimizer"]))

    if settings["scheduler"]:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, int(settings["epochs"]))

    sigma_curr = settings["sigma"]
    noise_mode = settings["noise"]
    noise_epochs = settings.get("noise_epochs", 135)
    compute_hessian = settings.get("compute_hessian", False)

    ########### Setup Writer Variables ###########
    results = {"rec_steps":[], "train_loss":[], "test_loss":[],"hess":[], "test_acc":[], "train_acc":[], "hess_trace":[], "l1_norm":[], "l2_norm":[], "grad_norm":[]}

    ########### Getting number of parameter tensors ###########
    n_groups = 0
    dim_model = 0
    with torch.no_grad():
        for param in model.parameters():
            n_groups = n_groups + 1
            dim_model = dim_model + torch.numel(param)
    print('Model dimension: ' + str(dim_model))
    print('Number of groups: ' + str(n_groups))

    ##### global training-step counter (selects the tensor perturbed in "layer" mode)
    step_count = 0

    ########### Training ###########
    for epoch in range(settings["epochs"]):
        start_time = time.time()
        print('epoch #'+str(epoch)+', lr = '+str(get_lr(optimizer)))

        ########### Computing Hessian trace if last epoch (opt-in) #####
        hess_train = 0
        if compute_hessian and epoch==(settings["epochs"]-1):
            model.eval()
            optimizer.zero_grad(set_to_none=True)
            for d in hess_loader:
                data, target = d[0].to(device, non_blocking=True),d[1].to(device, non_blocking=True)
                # NOTE(review): the trace samples are summed here, as in the
                # original code; PyHessian's own examples average them with
                # np.mean -- confirm which is intended before enabling.
                hess_train = hess_train+np.sum(hessian(model, criterion, data=(data, target), cuda=(device == 'cuda')).trace())/len(hess_loader)
            optimizer.zero_grad(set_to_none=True)

        ########### Saving stats every few epochs ###########
        if (epoch%settings["rec_step"])==0:
            results["rec_steps"].append(epoch)
            model.eval()

            train_loss, accuracy_train = _evaluate(model, train_loader, criterion, device)
            test_loss, accuracy_test = _evaluate(model, validation_loader, criterion, device)

            results["train_loss"].append(train_loss)
            results["train_acc"].append(accuracy_train)
            results["test_loss"].append(test_loss)
            results["test_acc"].append(accuracy_test)
            results["hess"].append(hess_train)
            results["l1_norm"].append(torch.norm(flat_params(model),1).cpu().detach().numpy())
            results["l2_norm"].append(torch.norm(flat_params(model),2).cpu().detach().numpy())

            print('Epoch {}: Train L: {:.4f}, TrainAcc: {:.2f}, Test L: {:.4f}, TestAcc: {:.2f} \n'.format(epoch, train_loss, accuracy_train, test_loss, accuracy_test))

        ########### One epoch of (noisy) training ###########
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            ### model perturbation
            # `saved_weights` stays None when nothing was perturbed, so the
            # recovery below never touches the weights in that case.
            saved_weights = None
            if noise_mode != "no" and epoch < noise_epochs:
                saved_weights = []
                with torch.no_grad():
                    for idx, param in enumerate(model.parameters()):
                        # Keep a handle on the clean tensor; the assignments
                        # below rebind param.data to a new noisy tensor.
                        saved_weights.append(param.data)
                        if noise_mode == "all":
                            param.data = param.data + (sigma_curr/math.sqrt(n_groups))*torch.normal(0, 1, size=param.size(),device=device)
                        elif noise_mode == "layer":
                            if idx==(step_count%n_groups):
                                param.data = param.data + sigma_curr*torch.normal(0, 1, size=param.size(),device=device)

            ### backprop (gradient evaluated at the perturbed weights)
            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()

            ### model recovery: restore the clean weights before the update
            if saved_weights is not None:
                with torch.no_grad():
                    for param, clean in zip(model.parameters(), saved_weights):
                        param.data = clean

            optimizer.step()
            step_count = step_count + 1

        if settings["scheduler"]:
            scheduler.step()
        print(time.time()-start_time)

    return results

def settings_to_str(settings):
    """Build a run identifier from the current time (HH_MM_SS) and the main settings.

    Reads the keys "dataset", "subset", "net", "noise", "bs", "optimizer",
    "lr", "sigma" and "rec_step" from ``settings``.
    """
    timestamp = datetime.now().strftime("%H_%M_%S")
    pieces = [
        timestamp,
        '_', settings["dataset"],
        '_subset', str(settings["subset"]),
        "_", settings["net"],
        "_noise_", settings["noise"],
        '_bs', str(settings["bs"]),
        '_', settings["optimizer"],
        '_lr', str(settings["lr"]),
        '_sigma', str(settings["sigma"]),
        '_rec', str(settings["rec_step"]),
    ]
    return "".join(pieces)

if __name__ == "__main__":

    # Example run. To compare noise injections against vanilla SGD, rerun
    # with `noise` set to "no", "layer" or "all".

    # Data / architecture
    dataset, subset, net = "CIFAR", 50000, "CIFAR10Res18"

    # Training schedule
    nep, bs, rec_step = 50, 128, 1

    # Noise injection and step size
    noise = "layer" #no,layer,all
    sigma = 0.03
    lr1 = 0.01

    settings = dict(
        dataset=dataset,
        subset=subset,
        net=net,
        optimizer="SGD",
        scheduler=True,
        noise=noise,
        bs=bs,
        lr=lr1,
        sigma=sigma,
        epochs=nep,
        rec_step=rec_step,
    )
    results = train_net(settings)
    torch.save(results, f"results{settings_to_str(settings)}.pt")

Files already downloaded and verified
Files already downloaded and verified
Model dimension: 11173962
Number of groups: 62
epoch #0, lr = 0.01
Epoch 0: Train L: 2.3023, TrainAcc: 9.72, Test L: 2.3018, TestAcc: 9.41 

89.8532280921936
epoch #1, lr = 0.009990133642141357
Epoch 1: Train L: 1.1323, TrainAcc: 59.62, Test L: 1.1449, TestAcc: 58.97 

53.822288036346436
epoch #2, lr = 0.009960573506572389
Epoch 2: Train L: 0.8713, TrainAcc: 69.37, Test L: 0.9118, TestAcc: 68.65 

57.694769859313965
epoch #3, lr = 0.009911436253643442
Epoch 3: Train L: 0.7879, TrainAcc: 72.45, Test L: 0.8116, TestAcc: 71.55 

66.92317152023315
epoch #4, lr = 0.009842915805643154
Epoch 4: Train L: 0.6334, TrainAcc: 78.14, Test L: 0.6977, TestAcc: 76.77 

76.02175617218018
epoch #5, lr = 0.009755282581475767
Epoch 5: Train L: 0.5502, TrainAcc: 80.60, Test L: 0.6188, TestAcc: 79.07 

85.50759744644165
epoch #6, lr = 0.009648882429441256
Epoch 6: Train L: 0.4803, TrainAcc: 83.66, Test L: 0.5395, TestAcc: 82.24 

91

In [12]:
 # Save `results` again; settings_to_str embeds the current time, so this
 # writes a new file rather than overwriting the one saved by the training cell.
 torch.save(results, 'results'+settings_to_str(settings)+'.pt') 