In [1]:
import torch
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

In [3]:
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * images.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_loss /= len(train_loader.dataset)
    accuracy = 100. * correct / total
    return train_loss, accuracy

In [4]:
import torch

def evaluate(device, model, dataloader):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    return accuracy

In [146]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity


def extract_activations_layers(layers):
    """ Extract for each layer the activations

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (layer_activation, batch_size, number_of_activations)
    """

    return np.array([np.array([np.array(h) for h in l]) for l in layers])

def extract_activations_per_sample(layers, mask = False):
    """ Extract for each sample the activations 
    for each layer and store them in a list.

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    if mask == True:
        # mask the activations to remove zeros
        mask = layers != 0
        layers = [[np.array(h[m]) for h, m in zip(l,sm)] 
                for l, sm in zip(layers, mask)]
        
    return np.array([layers[:,i,:].flatten().reshape(-1, 1) for i in range(layers.shape[1])])


def get_sampled_activations(activations, bandwidth = 0.2):
    """ Sample the activations using KDE

    Args:
        activations (np.array): shape (batch_size, number_of_activations)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    return torch.from_numpy(np.array([KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(a).sample([64]) for a in activations], dtype="float32")).squeeze(2)

def wd(layers: list()):
    """ Compute the weight decay for each layer

    Args:
        layers (list): list of layers

    Returns:
        torch.tensor: weight decay

    """
    return get_sampled_activations(
                list(
                    extract_activations_per_sample(
                            extract_activations_layers(layers), 
                            mask=False
                        )
                ), 
                bandwidth=0.2
            )

class MLPWD(nn.Module):
    def __init__(self):
        super(MLPWD, self).__init__()
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, 64)
        self.l2 = LinW(in_features=64, out_features=64, depth=0)
        self.l3 = LinW(in_features=64, out_features=64, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(64, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        repr = []
        x = self.flatten(x)
        x = self.gelu(self.l1(x))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l2(x, repr))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l3(x, repr))
        x = self.l4(x)
        return x
    
    def __getitem__(self, idx):
        return self.layers[idx]
    
    def __len__(self):
        return len(self.layers)
    

class LinW(nn.Linear):
    def __init__(self, in_features, out_features, depth, layers=[]):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        self.layers = layers[:self.depth] if len(layers)>0 else layers

    def forward(self, input, prev=[]):
        # weight_decay = wd(prev)
        # weight = self.weight * weight_decay.to('cuda:0')
        # return F.linear(input, weight, self.bias)
        # print(wd(prev).shape)
        return F.linear(input* wd(prev).to('cuda:0'), self.weight, self.bias)

EPOCHS = 10
BATCH_SIZE = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print("LinW layers:", "\n".join([f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))]), sep="\n\n")

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=64, out_features=64, bias=True)
Depth 1: LinW(in_features=64, out_features=64, bias=True)
Epoch 1/10, Training Loss: 1.6687, Training Accuracy: 39.01%, Test accuracy: 49.06%
Epoch 2/10, Training Loss: 1.3445, Training Accuracy: 51.48%, Test accuracy: 53.64%
Epoch 3/10, Training Loss: 1.2649, Training Accuracy: 54.51%, Test accuracy: 56.10%
Epoch 4/10, Training Loss: 1.2258, Training Accuracy: 55.48%, Test accuracy: 57.01%
Epoch 5/10, Training Loss: 1.1836, Training Accuracy: 56.70%, Test accuracy: 56.09%
Epoch 6/10, Training Loss: 1.1564, Training Accuracy: 57.52%, Test accuracy: 57.70%
Epoch 7/10, Training Loss: 1.1321, Training Accuracy: 58.13%, Test accuracy: 57.77%
Epoch 8/10, Training Loss: 1.1127, Training Accuracy: 59.04%, Test accuracy: 59.86%
Epoch 9/10, Training Loss: 1.0984, Training Accuracy: 59.62%, Test accuracy: 59.93%
Epoch 10/10, Training Loss: 1.0898, Training Accuracy: 59.99%, Test accuracy: 61.62%


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity


def extract_activations_layers(layers):
    """ Extract for each layer the activations

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (layer_activation, batch_size, number_of_activations)
    """

    return np.array([np.array([np.array(h) for h in l]) for l in layers])

def extract_activations_per_sample(layers, mask = False):
    """ Extract for each sample the activations 
    for each layer and store them in a list.

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    if mask == True:
        # mask the activations to remove zeros
        mask = layers != 0
        layers = [[np.array(h[m]) for h, m in zip(l,sm)] 
                for l, sm in zip(layers, mask)]
        
    return np.array([layers[:,i,:].flatten().reshape(-1, 1) for i in range(layers.shape[1])])


def get_sampled_activations(activations, bandwidth = 0.2):
    """ Sample the activations using KDE

    Args:
        activations (np.array): shape (batch_size, number_of_activations)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    return torch.from_numpy(np.array([KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(a).sample([64]) for a in activations], dtype="float32")).squeeze(2)

def wd(layers: list()):
    """ Compute the weight decay for each layer

    Args:
        layers (list): list of layers

    Returns:
        torch.tensor: weight decay

    """
    return get_sampled_activations(
                list(
                    extract_activations_per_sample(
                            extract_activations_layers(layers), 
                            mask=False
                        )
                ), 
                bandwidth=0.2
            )

class MLPWD(nn.Module):
    def __init__(self):
        super(MLPWD, self).__init__()
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, 64)
        self.l2 = LinW(in_features=64, out_features=64, depth=0)
        self.l3 = LinW(in_features=64, out_features=64, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(64, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        repr = []
        x = self.flatten(x)
        x = self.gelu(self.l1(x))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l2(x, repr))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l3(x, repr))
        x = self.l4(x)
        return x
    
    def __getitem__(self, idx):
        return self.layers[idx]
    
    def __len__(self):
        return len(self.layers)
    

class LinW(nn.Linear):
    def __init__(self, in_features, out_features, depth, layers=[]):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        self.layers = layers[:self.depth] if len(layers)>0 else layers

    def forward(self, input, prev=[]):
        # weight_decay = wd(prev)
        # weight = self.weight * weight_decay.to('cuda:0')
        # return F.linear(input, weight, self.bias)
        # print(wd(prev).shape)
        return F.linear(wd(prev).to('cuda:0'), self.weight, self.bias)

EPOCHS = 10
BATCH_SIZE = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print("LinW layers:", "\n".join([f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))]), sep="\n\n")

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=64, out_features=64, bias=True)
Depth 1: LinW(in_features=64, out_features=64, bias=True)
Epoch 1/10, Training Loss: 2.3016, Training Accuracy: 11.37%, Test accuracy: 11.24%
Epoch 2/10, Training Loss: 2.2997, Training Accuracy: 11.81%, Test accuracy: 11.93%
Epoch 3/10, Training Loss: 2.2986, Training Accuracy: 12.07%, Test accuracy: 12.17%
Epoch 4/10, Training Loss: 2.2990, Training Accuracy: 11.91%, Test accuracy: 11.67%
Epoch 5/10, Training Loss: 2.2993, Training Accuracy: 11.83%, Test accuracy: 12.25%
Epoch 6/10, Training Loss: 2.2988, Training Accuracy: 12.08%, Test accuracy: 12.16%


KeyboardInterrupt: 

In [34]:
""" In this experiment we introduce a mutation magnitude over the 
activations in the intermediate layers. Since the activations mutate
we still introduce an additional dynamic weight that play an important
role in the training process to update the weights of the network.

"""


import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

import random


def extract_activations_layers(layers):
    """ Extract for each layer the activations

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (layer_activation, batch_size, number_of_activations)
    """

    return np.array([np.array([np.array(h) for h in l]) for l in layers])

def extract_activations_per_sample(layers, mask = False):
    """ Extract for each sample the activations 
    for each layer and store them in a list.

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    if mask == True:
        # mask the activations to remove zeros
        mask = layers != 0
        layers = [[np.array(h[m]) for h, m in zip(l,sm)] 
                for l, sm in zip(layers, mask)]
        
    return np.array([layers[:,i,:].flatten().reshape(-1, 1) for i in range(layers.shape[1])])


def get_sampled_activations(activations, bandwidth = 0.2):
    """ Sample the activations using KDE

    Args:
        activations (np.array): shape (batch_size, number_of_activations)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    return torch.from_numpy(np.array([KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(a).sample([64]) for a in activations], dtype="float32")).squeeze(2)

def wd(layers: list()):
    """ Compute the weight decay for each layer

    Args:
        layers (list): list of layers

    Returns:
        torch.tensor: weight decay

    """
    return get_sampled_activations(
                list(
                    extract_activations_per_sample(
                            extract_activations_layers(layers), 
                            mask=False
                        )
                ), 
                bandwidth=0.2
            )

class MLPWD(nn.Module):
    def __init__(self):
        super(MLPWD, self).__init__()
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, 64)
        self.l2 = LinW(in_features=64, out_features=64, depth=0)
        self.l3 = LinW(in_features=64, out_features=64, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(64, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        repr = []
        x = self.flatten(x)
        x = self.gelu(self.l1(x))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l2(x, repr))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l3(x, repr))
        x = self.l4(x)
        return x
    
    def __getitem__(self, idx):
        return self.layers[idx]
    
    def __len__(self):
        return len(self.layers)
    

class LinW(nn.Linear):
    def __init__(self, in_features, out_features, depth, layers=[]):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        self.layers = layers[:self.depth] if len(layers)>0 else layers

    def forward(self, input, prev=[]):
        # weight_decay = wd(prev)
        # weight = self.weight * weight_decay.to('cuda:0')
        # return F.linear(input, weight, self.bias)
        # print(wd(prev).shape)
        return F.linear(input*random.uniform(0.99,1.09), self.weight, self.bias)

EPOCHS = 10
BATCH_SIZE = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print("LinW layers:", "\n".join([f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))]), sep="\n\n")

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=64, out_features=64, bias=True)
Depth 1: LinW(in_features=64, out_features=64, bias=True)
Epoch 1/10, Training Loss: 0.3766, Training Accuracy: 88.75%, Test accuracy: 93.53%
Epoch 2/10, Training Loss: 0.1644, Training Accuracy: 95.06%, Test accuracy: 95.93%
Epoch 3/10, Training Loss: 0.1139, Training Accuracy: 96.59%, Test accuracy: 96.23%
Epoch 4/10, Training Loss: 0.0916, Training Accuracy: 97.22%, Test accuracy: 96.74%
Epoch 5/10, Training Loss: 0.0743, Training Accuracy: 97.71%, Test accuracy: 97.07%
Epoch 6/10, Training Loss: 0.0606, Training Accuracy: 98.08%, Test accuracy: 96.70%
Epoch 7/10, Training Loss: 0.0522, Training Accuracy: 98.39%, Test accuracy: 96.80%
Epoch 8/10, Training Loss: 0.0446, Training Accuracy: 98.58%, Test accuracy: 97.38%
Epoch 9/10, Training Loss: 0.0388, Training Accuracy: 98.82%, Test accuracy: 97.44%
Epoch 10/10, Training Loss: 0.0337, Training Accuracy: 98.93%, Test accuracy: 97.44%


In [38]:
""" Here insted of multiply I sum the sampled activations
of the previous layers on the inputs.



"""


import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity


def extract_activations_layers(layers):
    """ Extract for each layer the activations

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (layer_activation, batch_size, number_of_activations)
    """

    return np.array([np.array([np.array(h) for h in l]) for l in layers])

def extract_activations_per_sample(layers, mask = False):
    """ Extract for each sample the activations 
    for each layer and store them in a list.

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    if mask == True:
        # mask the activations to remove zeros
        mask = layers != 0
        layers = [[np.array(h[m]) for h, m in zip(l,sm)] 
                for l, sm in zip(layers, mask)]
        
    return np.array([layers[:,i,:].flatten().reshape(-1, 1) for i in range(layers.shape[1])])


def get_sampled_activations(activations, bandwidth = 0.2):
    """ Sample the activations using KDE

    Args:
        activations (np.array): shape (batch_size, number_of_activations)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    return torch.from_numpy(np.array([KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(a).sample([64]) for a in activations], dtype="float32")).squeeze(2)

def wd(layers: list()):
    """ Compute the weight decay for each layer

    Args:
        layers (list): list of layers

    Returns:
        torch.tensor: weight decay

    """
    return get_sampled_activations(
                list(
                    extract_activations_per_sample(
                            extract_activations_layers(layers), 
                            mask=False
                        )
                ), 
                bandwidth=0.2
            )

class MLPWD(nn.Module):
    def __init__(self):
        super(MLPWD, self).__init__()
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, 64)
        self.l2 = LinW(in_features=64, out_features=64, depth=0)
        self.l3 = LinW(in_features=64, out_features=64, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(64, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        repr = []
        x = self.flatten(x)
        x = self.gelu(self.l1(x))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l2(x, repr))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l3(x, repr))
        x = self.l4(x)
        return x
    
    def __getitem__(self, idx):
        return self.layers[idx]
    
    def __len__(self):
        return len(self.layers)
    

class LinW(nn.Linear):
    def __init__(self, in_features, out_features, depth, layers=[]):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        self.layers = layers[:self.depth] if len(layers)>0 else layers

    def forward(self, input, prev=[]):
        # weight_decay = wd(prev)
        # weight = self.weight * weight_decay.to('cuda:0')
        # return F.linear(input, weight, self.bias)
        return F.linear(input+wd(prev).to('cuda:0'), self.weight, self.bias)

EPOCHS = 10
BATCH_SIZE = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print("LinW layers:", "\n".join([f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))]), sep="\n\n")

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=64, out_features=64, bias=True)
Depth 1: LinW(in_features=64, out_features=64, bias=True)
Epoch 1/10, Training Loss: 0.7291, Training Accuracy: 75.94%, Test accuracy: 82.01%
Epoch 2/10, Training Loss: 0.5565, Training Accuracy: 82.27%, Test accuracy: 83.00%
Epoch 3/10, Training Loss: 0.5347, Training Accuracy: 82.81%, Test accuracy: 83.62%
Epoch 4/10, Training Loss: 0.5326, Training Accuracy: 83.13%, Test accuracy: 83.76%
Epoch 5/10, Training Loss: 0.5273, Training Accuracy: 83.41%, Test accuracy: 82.78%
Epoch 6/10, Training Loss: 0.5334, Training Accuracy: 83.11%, Test accuracy: 82.91%
Epoch 7/10, Training Loss: 0.5336, Training Accuracy: 83.21%, Test accuracy: 84.01%
Epoch 8/10, Training Loss: 0.5358, Training Accuracy: 83.08%, Test accuracy: 83.01%
Epoch 9/10, Training Loss: 0.5353, Training Accuracy: 83.09%, Test accuracy: 82.71%
Epoch 10/10, Training Loss: 0.5376, Training Accuracy: 83.33%, Test accuracy: 83.13%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity


def extract_activations_layers(layers):
    """ Extract for each layer the activations

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (layer_activation, batch_size, number_of_activations)
    """

    return np.array([np.array([np.array(h) for h in l]) for l in layers])

def extract_activations_per_sample(layers, mask = False):
    """ Extract for each sample the activations 
    for each layer and store them in a list.

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    if mask == True:
        # mask the activations to remove zeros
        mask = layers != 0
        layers = [[np.array(h[m]) for h, m in zip(l,sm)] 
                for l, sm in zip(layers, mask)]
        
    return np.array([layers[:,i,:].flatten().reshape(-1, 1) for i in range(layers.shape[1])])


def get_sampled_activations(activations, bandwidth = 0.2):
    """ Sample the activations using KDE

    Args:
        activations (np.array): shape (batch_size, number_of_activations)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    return torch.from_numpy(np.array([KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(a).sample([64]) for a in activations], dtype="float32")).squeeze(2)

def wd(layers: list()):
    """ Compute the weight decay for each layer

    Args:
        layers (list): list of layers

    Returns:
        torch.tensor: weight decay

    """
    return get_sampled_activations(
                list(
                    extract_activations_per_sample(
                            extract_activations_layers(layers), 
                            mask=False
                        )
                ), 
                bandwidth=0.2
            )

class MLPWD(nn.Module):
    def __init__(self):
        super(MLPWD, self).__init__()
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(3072, 64)
        self.l2 = LinW(in_features=64, out_features=64, depth=0)
        self.l3 = LinW(in_features=64, out_features=64, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(64, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        repr = []
        x = self.flatten(x)
        x = self.gelu(self.l1(x))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l2(x, repr))
        repr.append(x.detach().cpu().numpy())
        x = self.gelu(self.l3(x, repr))
        x = self.l4(x)
        return x
    
    def __getitem__(self, idx):
        return self.layers[idx]
    
    def __len__(self):
        return len(self.layers)
    

class LinW(nn.Linear):
    def __init__(self, in_features, out_features, depth, layers=[]):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        self.layers = layers[:self.depth] if len(layers)>0 else layers

    def forward(self, input, prev=[]):
        # weight_decay = wd(prev)
        # weight = self.weight * weight_decay.to('cuda:0')
        # return F.linear(input, weight, self.bias)
        print(wd(prev).shape)
        return F.linear(input, self.weight* wd(prev).to('cuda:0'), self.bias)

EPOCHS = 10
BATCH_SIZE = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print("LinW layers:", "\n".join([f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))]), sep="\n\n")

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


In [37]:
layers = [torch.randn(16,256) for i in range(5)]

layers = np.array([np.array(i.flatten()) for i in layers]).flatten()
mask = layers > 0
layers = layers[mask]

res = layers[np.random.randint(0, high = len(layers), size = 256)].reshape(-1, 1)

array([[0.5981148 ],
       [0.9634854 ],
       [1.1180352 ],
       [0.25245157],
       [2.484147  ],
       [1.161348  ],
       [0.87380683],
       [0.44336656],
       [0.9320248 ],
       [1.1274574 ],
       [0.12749925],
       [1.7562745 ],
       [0.7457105 ],
       [0.08640575],
       [1.2357011 ],
       [1.8209292 ],
       [1.2359884 ],
       [0.30162323],
       [0.12449323],
       [0.05077058],
       [0.42775792],
       [0.5930919 ],
       [0.6066395 ],
       [0.26709583],
       [1.0321921 ],
       [0.07010241],
       [0.12023831],
       [0.1930366 ],
       [1.2029365 ],
       [0.21384852],
       [0.667683  ],
       [0.22509019],
       [0.97664076],
       [0.33046725],
       [1.3761361 ],
       [0.6719827 ],
       [0.3749767 ],
       [0.51912063],
       [2.6440165 ],
       [0.3438414 ],
       [0.44473463],
       [0.35270986],
       [0.8643006 ],
       [0.46024805],
       [0.7238416 ],
       [1.5828304 ],
       [1.4176172 ],
       [0.280

In [18]:

def extract_activations_layers(layers):
    """ Extract for each layer the activations

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (layer_activation, batch_size, number_of_activations)
    """

    return np.array([np.array([np.array(h) for h in l]) for l in layers])

def extract_activations_per_sample(layers, mask = False):
    """ Extract for each sample the activations 
    for each layer and store them in a list.

    Args:
        layers (np.array): shape (layer_activation, batch_size, number_of_neurons)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    if mask == True:
        # mask the activations to remove zeros
        mask = layers != 0
        layers = [[np.array(h[m]) for h, m in zip(l,sm)] 
                for l, sm in zip(layers, mask)]
        
    return np.array([layers[:,i,:].flatten().reshape(-1, 1) for i in range(layers.shape[1])])


def get_sampled_activations(activations, bandwidth = 0.2):
    """ Sample the activations using KDE

    Args:
        activations (np.array): shape (batch_size, number_of_activations)

    Returns:
        np.array: shape (batch_size, number_of_activations)
    """

    return torch.from_numpy(np.array([KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(a).sample([64]) for a in activations], dtype="float32")).squeeze(2)

def wd(layers: list()):
    """ Compute the weight decay for each layer

    Args:
        layers (list): list of layers

    Returns:
        torch.tensor: weight decay

    """
    return get_sampled_activations(
                list(
                    extract_activations_per_sample(
                            extract_activations_layers(layers), 
                            mask=False
                        )
                ), 
                bandwidth=0.2
            )


import random

layers = [torch.randn(64,5) for i in range(5)]

layers = extract_activations_per_sample(extract_activations_layers(layers))
layers = [l[l < 0] for l in layers]


[random.choice(l) for l in layers]

# res = layers[np.random.randint(0, high = layers.shape[0]-1, size = 256)].reshape(-1, 1)
# return torch.from_numpy(np.array(res, dtype="float32"))


# wd(layers).shape

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(3072, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 256)
        self.l4 = nn.Linear(256, 10)
        self.gelu = nn.GELU()

    def forward(self, x):
        repr = []
        x = self.flatten(x)
        x = self.gelu(self.l1(x))
        x = self.gelu(self.l2(x))
        x = self.gelu(self.l3(x))
        x = self.l4(x)
        return x
    

EPOCHS = 10
BATCH_SIZE = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLP().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/10, Training Loss: 1.8164, Training Accuracy: 33.93%, Test accuracy: 39.72%
Epoch 2/10, Training Loss: 1.6439, Training Accuracy: 40.74%, Test accuracy: 41.99%
Epoch 3/10, Training Loss: 1.5454, Training Accuracy: 44.20%, Test accuracy: 42.73%
Epoch 4/10, Training Loss: 1.4934, Training Accuracy: 45.90%, Test accuracy: 44.51%
Epoch 5/10, Training Loss: 1.4409, Training Accuracy: 48.04%, Test accuracy: 49.23%
Epoch 6/10, Training Loss: 1.4050, Training Accuracy: 49.34%, Test accuracy: 47.79%
Epoch 7/10, Training Loss: 1.3646, Training Accuracy: 50.74%, Test accuracy: 48.76%
Epoch 8/10, Training Loss: 1.3404, Training Accuracy: 51.69%, Test accuracy: 48.74%
Epoch 9/10, Training Loss: 1.3010, Training Accuracy: 53.03%, Test accuracy: 49.75%
Epoch 10/10, Training Loss: 1.2674, Training Accuracy: 54.31%, Test accuracy: 49.79%
