In [1]:
import torch

In [4]:
def train(model, train_loader, criterion, optimizer, device):
    """Run one training epoch and report the mean loss and accuracy.

    Args:
        model: Network to optimise; it is switched to training mode.
        train_loader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss callable applied to ``(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` run once per batch.
        device: Device every batch is moved to before the forward pass.

    Returns:
        Tuple ``(train_loss, accuracy)``: the per-sample mean loss and the
        percentage of correct predictions, both computed over the samples
        actually iterated. ``(0.0, 0.0)`` when the loader yields nothing.
    """
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by its size so the final value is a per-sample
        # mean (assumes criterion returns a batch mean - TODO confirm).
        train_loss += loss.item() * images.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    if total == 0:
        # Nothing was iterated; avoid dividing by zero.
        return 0.0, 0.0

    # Normalise by the samples seen rather than len(loader.dataset): the two
    # differ with drop_last / samplers, and plain iterables have no .dataset.
    train_loss /= total
    accuracy = 100. * correct / total
    return train_loss, accuracy

In [5]:
import torch

def evaluate(device, model, dataloader):
    """Compute the classification accuracy of ``model`` over ``dataloader``.

    Args:
        device: Device every batch is moved to before the forward pass.
        model: Network to evaluate; it is switched to evaluation mode.
        dataloader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Accuracy as a percentage in ``[0, 100]``; ``0.0`` when the loader
        yields no samples.
    """
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # Autograd is already disabled here, so the legacy `.data`
            # accessor is unnecessary.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        # Nothing was iterated; avoid dividing by zero.
        return 0.0
    accuracy = 100 * correct / total
    return accuracy

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

def wd(layers: list, n_samples: int = 256):
    """Draw scaling factors from a Gaussian KDE fitted on non-zero activations.

    All arrays in ``layers`` are flattened and pooled, exact zeros are
    discarded, a Gaussian kernel density estimate (bandwidth 0.2) is fitted on
    the remaining values and ``n_samples`` values are drawn from it.

    Args:
        layers: List of array-like activations recorded from earlier layers.
        n_samples: Number of values to draw; defaults to 256.

    Returns:
        A float32 tensor holding the drawn values, one per row.

    Raises:
        ValueError: If ``layers`` is empty or holds no non-zero value.
    """
    if len(layers) == 0:
        raise ValueError("wd() needs at least one activation array")
    values = np.concatenate([np.asarray(layer).ravel() for layer in layers])
    # KernelDensity expects a 2-D (n_points, n_features) array.
    values = values[values != 0].reshape(-1, 1)
    if values.shape[0] == 0:
        raise ValueError("wd() found no non-zero activations to fit the density on")
    density = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(values)
    samples = density.sample(n_samples)
    return torch.from_numpy(np.array(samples, dtype="float32"))

class MLPWD(nn.Module):
    """MLP for flattened 784-feature inputs with two ``LinW`` hidden layers.

    Layout: Flatten -> Linear(784, 256) -> LinW(256, 256) -> LinW(256, 256)
    -> Linear(256, 10), with GELU after each of the first three linear layers.
    Indexing and ``len`` expose the two ``LinW`` layers.
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, width)
        self.l2 = LinW(in_features=width, out_features=width, depth=0)
        self.l3 = LinW(in_features=width, out_features=width, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(width, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        """Return the 10-way logits for a batch ``x``."""
        # Detached NumPy copies of the hidden activations seen so far; each
        # LinW layer receives the history accumulated before it runs.
        history = []
        hidden = self.gelu(self.l1(self.flatten(x)))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l2(hidden, history))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l3(hidden, history))
        return self.l4(hidden)

    def __getitem__(self, idx):
        """Return the ``LinW`` layer stored at position ``idx``."""
        return self.layers[idx]

    def __len__(self):
        """Return how many ``LinW`` layers the model holds."""
        return len(self.layers)
    

class LinW(nn.Linear):
    """Linear layer whose input is rescaled by values produced by ``wd``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        depth: Position of the layer; also caps how many entries of
            ``layers`` are kept.
        layers: Optional list of earlier layers; the first ``depth`` entries
            are stored on ``self.layers``.
    """

    def __init__(self, in_features, out_features, depth, layers=None):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        # A fresh list per instance: a mutable default would be shared by
        # every LinW constructed without `layers`.
        layers = [] if layers is None else list(layers)
        self.layers = layers[:self.depth]

    def forward(self, input, prev=None):
        """Apply the linear map to ``input`` scaled feature-wise by ``wd(prev)``.

        Args:
            input: Batch of inputs for the linear map.
            prev: List of earlier activations handed to ``wd``. When omitted
                or empty no scaling is applied (plain linear layer).

        Returns:
            The output of the linear transformation.
        """
        if prev is None or len(prev) == 0:
            # wd() cannot sample from nothing; behave as an ordinary Linear.
            return F.linear(input, self.weight, self.bias)
        # The scale must broadcast over the last axis of `input`, i.e. wd is
        # expected to yield in_features values - TODO confirm against wd.
        # Follow the input's device instead of assuming 'cuda:0', so the CPU
        # fallback selected by the script keeps working.
        scale = wd(prev).squeeze(1).to(input.device)
        return F.linear(input * scale, self.weight, self.bias)

# Training configuration.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MNIST train/test splits, downloaded to ./data on first use.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Halves the learning rate after every 10 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print(
    "LinW layers:",
    "\n".join(f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))),
    sep="\n\n",
)

# One training pass and one evaluation pass per epoch.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        model, train_loader, criterion, optimizer, device
    )
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=256, out_features=256, bias=True)
Depth 1: LinW(in_features=256, out_features=256, bias=True)
Epoch 1/10, Training Loss: 1.3389, Training Accuracy: 53.04%, Test accuracy: 65.71%
Epoch 2/10, Training Loss: 1.0873, Training Accuracy: 63.03%, Test accuracy: 63.16%
Epoch 3/10, Training Loss: 1.0985, Training Accuracy: 62.67%, Test accuracy: 62.11%
Epoch 4/10, Training Loss: 1.1364, Training Accuracy: 61.78%, Test accuracy: 61.58%
Epoch 5/10, Training Loss: 1.1638, Training Accuracy: 61.03%, Test accuracy: 59.98%
Epoch 6/10, Training Loss: 1.1696, Training Accuracy: 60.92%, Test accuracy: 60.02%
Epoch 7/10, Training Loss: 1.1096, Training Accuracy: 63.38%, Test accuracy: 63.92%
Epoch 8/10, Training Loss: 1.0814, Training Accuracy: 64.90%, Test accuracy: 65.40%
Epoch 9/10, Training Loss: 1.0776, Training Accuracy: 64.94%, Test accuracy: 65.47%
Epoch 10/10, Training Loss: 1.0470, Training Accuracy: 66.40%, Test accuracy: 63.66%


In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

def wd(layers: list, n_samples: int = 256):
    """Draw scaling factors from a Gaussian KDE fitted on positive activations.

    All arrays in ``layers`` are flattened and pooled, values that are not
    strictly positive are discarded, a Gaussian kernel density estimate
    (bandwidth 0.2) is fitted on the rest and ``n_samples`` values are drawn.

    Args:
        layers: List of array-like activations recorded from earlier layers.
        n_samples: Number of values to draw; defaults to 256.

    Returns:
        A float32 tensor holding the drawn values, one per row.

    Raises:
        ValueError: If ``layers`` is empty or holds no positive value.
    """
    if len(layers) == 0:
        raise ValueError("wd() needs at least one activation array")
    values = np.concatenate([np.asarray(layer).ravel() for layer in layers])
    # KernelDensity expects a 2-D (n_points, n_features) array.
    values = values[values > 0].reshape(-1, 1)
    if values.shape[0] == 0:
        raise ValueError("wd() found no positive activations to fit the density on")
    density = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(values)
    samples = density.sample(n_samples)
    return torch.from_numpy(np.array(samples, dtype="float32"))

class MLPWD(nn.Module):
    """MLP for flattened 784-feature inputs with two ``LinW`` hidden layers.

    Layout: Flatten -> Linear(784, 256) -> LinW(256, 256) -> LinW(256, 256)
    -> Linear(256, 10), with GELU after each of the first three linear layers.
    Indexing and ``len`` expose the two ``LinW`` layers.
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, width)
        self.l2 = LinW(in_features=width, out_features=width, depth=0)
        self.l3 = LinW(in_features=width, out_features=width, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(width, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        """Return the 10-way logits for a batch ``x``."""
        # Detached NumPy copies of the hidden activations seen so far; each
        # LinW layer receives the history accumulated before it runs.
        history = []
        hidden = self.gelu(self.l1(self.flatten(x)))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l2(hidden, history))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l3(hidden, history))
        return self.l4(hidden)

    def __getitem__(self, idx):
        """Return the ``LinW`` layer stored at position ``idx``."""
        return self.layers[idx]

    def __len__(self):
        """Return how many ``LinW`` layers the model holds."""
        return len(self.layers)
    

class LinW(nn.Linear):
    """Linear layer whose input is rescaled by values produced by ``wd``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        depth: Position of the layer; also caps how many entries of
            ``layers`` are kept.
        layers: Optional list of earlier layers; the first ``depth`` entries
            are stored on ``self.layers``.
    """

    def __init__(self, in_features, out_features, depth, layers=None):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        # A fresh list per instance: a mutable default would be shared by
        # every LinW constructed without `layers`.
        layers = [] if layers is None else list(layers)
        self.layers = layers[:self.depth]

    def forward(self, input, prev=None):
        """Apply the linear map to ``input`` scaled feature-wise by ``wd(prev)``.

        Args:
            input: Batch of inputs for the linear map.
            prev: List of earlier activations handed to ``wd``. When omitted
                or empty no scaling is applied (plain linear layer).

        Returns:
            The output of the linear transformation.
        """
        if prev is None or len(prev) == 0:
            # wd() cannot sample from nothing; behave as an ordinary Linear.
            return F.linear(input, self.weight, self.bias)
        # The scale must broadcast over the last axis of `input`, i.e. wd is
        # expected to yield in_features values - TODO confirm against wd.
        # Follow the input's device instead of assuming 'cuda:0', so the CPU
        # fallback selected by the script keeps working.
        scale = wd(prev).squeeze(1).to(input.device)
        return F.linear(input * scale, self.weight, self.bias)

# Training configuration.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MNIST train/test splits, downloaded to ./data on first use.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Halves the learning rate after every 10 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print(
    "LinW layers:",
    "\n".join(f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))),
    sep="\n\n",
)

# One training pass and one evaluation pass per epoch.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        model, train_loader, criterion, optimizer, device
    )
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=256, out_features=256, bias=True)
Depth 1: LinW(in_features=256, out_features=256, bias=True)
Epoch 1/10, Training Loss: 0.5012, Training Accuracy: 84.04%, Test accuracy: 93.12%
Epoch 2/10, Training Loss: 0.2055, Training Accuracy: 93.80%, Test accuracy: 94.64%
Epoch 3/10, Training Loss: 0.1557, Training Accuracy: 95.39%, Test accuracy: 95.81%
Epoch 4/10, Training Loss: 0.1290, Training Accuracy: 96.19%, Test accuracy: 95.94%
Epoch 5/10, Training Loss: 0.1170, Training Accuracy: 96.42%, Test accuracy: 96.36%
Epoch 6/10, Training Loss: 0.1069, Training Accuracy: 96.82%, Test accuracy: 96.30%
Epoch 7/10, Training Loss: 0.0939, Training Accuracy: 97.19%, Test accuracy: 96.57%
Epoch 8/10, Training Loss: 0.0876, Training Accuracy: 97.41%, Test accuracy: 96.86%
Epoch 9/10, Training Loss: 0.0845, Training Accuracy: 97.49%, Test accuracy: 96.58%
Epoch 10/10, Training Loss: 0.0782, Training Accuracy: 97.69%, Test accuracy: 96.66%


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

def wd(layers: list, n_samples: int = 256):
    """Draw scaling factors from a Gaussian KDE fitted on positive activations.

    All arrays in ``layers`` are flattened and pooled, values that are not
    strictly positive are discarded, a Gaussian kernel density estimate
    (bandwidth 0.2) is fitted on the rest and ``n_samples`` values are drawn.

    Args:
        layers: List of array-like activations recorded from earlier layers.
        n_samples: Number of values to draw; defaults to 256.

    Returns:
        A float32 tensor holding the drawn values, one per row.

    Raises:
        ValueError: If ``layers`` is empty or holds no positive value.
    """
    if len(layers) == 0:
        raise ValueError("wd() needs at least one activation array")
    values = np.concatenate([np.asarray(layer).ravel() for layer in layers])
    # KernelDensity expects a 2-D (n_points, n_features) array.
    values = values[values > 0].reshape(-1, 1)
    if values.shape[0] == 0:
        raise ValueError("wd() found no positive activations to fit the density on")
    density = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(values)
    samples = density.sample(n_samples)
    return torch.from_numpy(np.array(samples, dtype="float32"))

class MLPWD(nn.Module):
    """MLP for flattened 784-feature inputs with two ``LinW`` hidden layers.

    Layout: Flatten -> Linear(784, 256) -> LinW(256, 256) -> LinW(256, 256)
    -> Linear(256, 10), with GELU after each of the first three linear layers.
    Indexing and ``len`` expose the two ``LinW`` layers.
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, width)
        self.l2 = LinW(in_features=width, out_features=width, depth=0)
        self.l3 = LinW(in_features=width, out_features=width, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(width, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        """Return the 10-way logits for a batch ``x``."""
        # Detached NumPy copies of the hidden activations seen so far; each
        # LinW layer receives the history accumulated before it runs.
        history = []
        hidden = self.gelu(self.l1(self.flatten(x)))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l2(hidden, history))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l3(hidden, history))
        return self.l4(hidden)

    def __getitem__(self, idx):
        """Return the ``LinW`` layer stored at position ``idx``."""
        return self.layers[idx]

    def __len__(self):
        """Return how many ``LinW`` layers the model holds."""
        return len(self.layers)
    

class LinW(nn.Linear):
    """Linear layer whose weight is rescaled by values produced by ``wd``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        depth: Position of the layer; also caps how many entries of
            ``layers`` are kept.
        layers: Optional list of earlier layers; the first ``depth`` entries
            are stored on ``self.layers``.
    """

    def __init__(self, in_features, out_features, depth, layers=None):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        # A fresh list per instance: a mutable default would be shared by
        # every LinW constructed without `layers`.
        layers = [] if layers is None else list(layers)
        self.layers = layers[:self.depth]

    def forward(self, input, prev=None):
        """Apply the linear map using the weight scaled by ``wd(prev)``.

        Args:
            input: Batch of inputs for the linear map.
            prev: List of earlier activations handed to ``wd``. When omitted
                or empty no scaling is applied (plain linear layer).

        Returns:
            The output of the linear transformation.
        """
        if prev is None or len(prev) == 0:
            # wd() cannot sample from nothing; behave as an ordinary Linear.
            return F.linear(input, self.weight, self.bias)
        # The scale broadcasts over the last axis of the weight, i.e. wd is
        # expected to yield in_features values - TODO confirm against wd.
        # Follow the weight's device instead of assuming 'cuda:0', so the CPU
        # fallback selected by the script keeps working.
        scale = wd(prev).squeeze(1).to(self.weight.device)
        return F.linear(input, self.weight * scale, self.bias)

# Training configuration.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MNIST train/test splits, downloaded to ./data on first use.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Halves the learning rate after every 10 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print(
    "LinW layers:",
    "\n".join(f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))),
    sep="\n\n",
)

# One training pass and one evaluation pass per epoch.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        model, train_loader, criterion, optimizer, device
    )
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=256, out_features=256, bias=True)
Depth 1: LinW(in_features=256, out_features=256, bias=True)
Epoch 1/10, Training Loss: 0.5038, Training Accuracy: 83.66%, Test accuracy: 92.97%
Epoch 2/10, Training Loss: 0.1993, Training Accuracy: 94.02%, Test accuracy: 94.85%
Epoch 3/10, Training Loss: 0.1565, Training Accuracy: 95.28%, Test accuracy: 95.44%
Epoch 4/10, Training Loss: 0.1311, Training Accuracy: 96.10%, Test accuracy: 95.69%
Epoch 5/10, Training Loss: 0.1141, Training Accuracy: 96.61%, Test accuracy: 96.53%
Epoch 6/10, Training Loss: 0.1022, Training Accuracy: 96.98%, Test accuracy: 96.21%
Epoch 7/10, Training Loss: 0.0972, Training Accuracy: 97.12%, Test accuracy: 96.91%
Epoch 8/10, Training Loss: 0.0899, Training Accuracy: 97.29%, Test accuracy: 96.68%
Epoch 9/10, Training Loss: 0.0782, Training Accuracy: 97.64%, Test accuracy: 96.65%
Epoch 10/10, Training Loss: 0.0819, Training Accuracy: 97.50%, Test accuracy: 96.86%


In [48]:
""" Selecting randomly from the layers activations 
    Filter out the positive values

    The network converges but still reaches better
    accuracy compared to the one with
    kernel density estimation


"""


import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

def wd(layers: list, n_samples: int = 256):
    """Pick scaling factors uniformly at random from the negative activations.

    All arrays in ``layers`` are flattened and pooled, only strictly negative
    values are kept, and ``n_samples`` of them are drawn with replacement.

    Args:
        layers: List of array-like activations recorded from earlier layers.
        n_samples: Number of values to draw; defaults to 256.

    Returns:
        A float32 tensor of shape ``(n_samples, 1)``.

    Raises:
        ValueError: If ``layers`` is empty or holds no negative value.
    """
    if len(layers) == 0:
        raise ValueError("wd() needs at least one activation array")
    values = np.concatenate([np.asarray(layer).ravel() for layer in layers])
    values = values[values < 0]
    if values.shape[0] == 0:
        raise ValueError("wd() found no negative activations to sample from")
    # randint's `high` is exclusive, so pass the full length: `length - 1`
    # would never pick the last value and fails for a single-element pool.
    picks = np.random.randint(0, high=values.shape[0], size=n_samples)
    res = values[picks].reshape(-1, 1)
    return torch.from_numpy(np.array(res, dtype="float32"))

class MLPWD(nn.Module):
    """MLP for flattened 784-feature inputs with two ``LinW`` hidden layers.

    Layout: Flatten -> Linear(784, 256) -> LinW(256, 256) -> LinW(256, 256)
    -> Linear(256, 10), with GELU after each of the first three linear layers.
    Indexing and ``len`` expose the two ``LinW`` layers.
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, width)
        self.l2 = LinW(in_features=width, out_features=width, depth=0)
        self.l3 = LinW(in_features=width, out_features=width, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(width, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        """Return the 10-way logits for a batch ``x``."""
        # Detached NumPy copies of the hidden activations seen so far; each
        # LinW layer receives the history accumulated before it runs.
        history = []
        hidden = self.gelu(self.l1(self.flatten(x)))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l2(hidden, history))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l3(hidden, history))
        return self.l4(hidden)

    def __getitem__(self, idx):
        """Return the ``LinW`` layer stored at position ``idx``."""
        return self.layers[idx]

    def __len__(self):
        """Return how many ``LinW`` layers the model holds."""
        return len(self.layers)
    

class LinW(nn.Linear):
    """Linear layer whose weight is rescaled by values produced by ``wd``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        depth: Position of the layer; also caps how many entries of
            ``layers`` are kept.
        layers: Optional list of earlier layers; the first ``depth`` entries
            are stored on ``self.layers``.
    """

    def __init__(self, in_features, out_features, depth, layers=None):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        # A fresh list per instance: a mutable default would be shared by
        # every LinW constructed without `layers`.
        layers = [] if layers is None else list(layers)
        self.layers = layers[:self.depth]

    def forward(self, input, prev=None):
        """Apply the linear map using the weight scaled by ``wd(prev)``.

        Args:
            input: Batch of inputs for the linear map.
            prev: List of earlier activations handed to ``wd``. When omitted
                or empty no scaling is applied (plain linear layer).

        Returns:
            The output of the linear transformation.
        """
        if prev is None or len(prev) == 0:
            # wd() cannot sample from nothing; behave as an ordinary Linear.
            return F.linear(input, self.weight, self.bias)
        # The scale broadcasts over the last axis of the weight, i.e. wd is
        # expected to yield in_features values - TODO confirm against wd.
        # Follow the weight's device instead of assuming 'cuda:0', so the CPU
        # fallback selected by the script keeps working.
        scale = wd(prev).squeeze(1).to(self.weight.device)
        return F.linear(input, self.weight * scale, self.bias)

# Training configuration.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MNIST train/test splits, downloaded to ./data on first use.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Halves the learning rate after every 10 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print(
    "LinW layers:",
    "\n".join(f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))),
    sep="\n\n",
)

# One training pass and one evaluation pass per epoch.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        model, train_loader, criterion, optimizer, device
    )
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=256, out_features=256, bias=True)
Depth 1: LinW(in_features=256, out_features=256, bias=True)
Epoch 1/10, Training Loss: 0.6480, Training Accuracy: 78.64%, Test accuracy: 89.76%
Epoch 2/10, Training Loss: 0.2932, Training Accuracy: 91.19%, Test accuracy: 92.56%
Epoch 3/10, Training Loss: 0.2182, Training Accuracy: 93.32%, Test accuracy: 94.18%
Epoch 4/10, Training Loss: 0.1826, Training Accuracy: 94.45%, Test accuracy: 95.05%
Epoch 5/10, Training Loss: 0.1542, Training Accuracy: 95.28%, Test accuracy: 95.39%
Epoch 6/10, Training Loss: 0.1376, Training Accuracy: 95.93%, Test accuracy: 95.44%
Epoch 7/10, Training Loss: 0.1241, Training Accuracy: 96.13%, Test accuracy: 95.70%
Epoch 8/10, Training Loss: 0.1157, Training Accuracy: 96.50%, Test accuracy: 96.21%
Epoch 9/10, Training Loss: 0.1034, Training Accuracy: 96.84%, Test accuracy: 95.83%
Epoch 10/10, Training Loss: 0.0970, Training Accuracy: 96.93%, Test accuracy: 96.42%


In [49]:
""" Selecting randomly from the layers activations 
    Filter out the negative values

    The network converges; however, the accuracy is
    lower than the one with kernel density estimation


"""


import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

def wd(layers: list, n_samples: int = 256):
    """Pick scaling factors uniformly at random from the positive activations.

    All arrays in ``layers`` are flattened and pooled, only strictly positive
    values are kept, and ``n_samples`` of them are drawn with replacement.

    Args:
        layers: List of array-like activations recorded from earlier layers.
        n_samples: Number of values to draw; defaults to 256.

    Returns:
        A float32 tensor of shape ``(n_samples, 1)``.

    Raises:
        ValueError: If ``layers`` is empty or holds no positive value.
    """
    if len(layers) == 0:
        raise ValueError("wd() needs at least one activation array")
    values = np.concatenate([np.asarray(layer).ravel() for layer in layers])
    values = values[values > 0]
    if values.shape[0] == 0:
        raise ValueError("wd() found no positive activations to sample from")
    # randint's `high` is exclusive, so pass the full length: `length - 1`
    # would never pick the last value and fails for a single-element pool.
    picks = np.random.randint(0, high=values.shape[0], size=n_samples)
    res = values[picks].reshape(-1, 1)
    return torch.from_numpy(np.array(res, dtype="float32"))

class MLPWD(nn.Module):
    """MLP for flattened 784-feature inputs with two ``LinW`` hidden layers.

    Layout: Flatten -> Linear(784, 256) -> LinW(256, 256) -> LinW(256, 256)
    -> Linear(256, 10), with GELU after each of the first three linear layers.
    Indexing and ``len`` expose the two ``LinW`` layers.
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(784, width)
        self.l2 = LinW(in_features=width, out_features=width, depth=0)
        self.l3 = LinW(in_features=width, out_features=width, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(width, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        """Return the 10-way logits for a batch ``x``."""
        # Detached NumPy copies of the hidden activations seen so far; each
        # LinW layer receives the history accumulated before it runs.
        history = []
        hidden = self.gelu(self.l1(self.flatten(x)))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l2(hidden, history))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l3(hidden, history))
        return self.l4(hidden)

    def __getitem__(self, idx):
        """Return the ``LinW`` layer stored at position ``idx``."""
        return self.layers[idx]

    def __len__(self):
        """Return how many ``LinW`` layers the model holds."""
        return len(self.layers)
    

class LinW(nn.Linear):
    """Linear layer whose weight is rescaled by values produced by ``wd``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        depth: Position of the layer; also caps how many entries of
            ``layers`` are kept.
        layers: Optional list of earlier layers; the first ``depth`` entries
            are stored on ``self.layers``.
    """

    def __init__(self, in_features, out_features, depth, layers=None):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        # A fresh list per instance: a mutable default would be shared by
        # every LinW constructed without `layers`.
        layers = [] if layers is None else list(layers)
        self.layers = layers[:self.depth]

    def forward(self, input, prev=None):
        """Apply the linear map using the weight scaled by ``wd(prev)``.

        Args:
            input: Batch of inputs for the linear map.
            prev: List of earlier activations handed to ``wd``. When omitted
                or empty no scaling is applied (plain linear layer).

        Returns:
            The output of the linear transformation.
        """
        if prev is None or len(prev) == 0:
            # wd() cannot sample from nothing; behave as an ordinary Linear.
            return F.linear(input, self.weight, self.bias)
        # The scale broadcasts over the last axis of the weight, i.e. wd is
        # expected to yield in_features values - TODO confirm against wd.
        # Follow the weight's device instead of assuming 'cuda:0', so the CPU
        # fallback selected by the script keeps working.
        scale = wd(prev).squeeze(1).to(self.weight.device)
        return F.linear(input, self.weight * scale, self.bias)

# Training configuration.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MNIST train/test splits, downloaded to ./data on first use.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Halves the learning rate after every 10 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print(
    "LinW layers:",
    "\n".join(f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))),
    sep="\n\n",
)

# One training pass and one evaluation pass per epoch.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        model, train_loader, criterion, optimizer, device
    )
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


LinW layers:

Depth 0: LinW(in_features=256, out_features=256, bias=True)
Depth 1: LinW(in_features=256, out_features=256, bias=True)
Epoch 1/10, Training Loss: 0.4526, Training Accuracy: 85.59%, Test accuracy: 92.92%
Epoch 2/10, Training Loss: 0.1994, Training Accuracy: 94.16%, Test accuracy: 95.12%
Epoch 3/10, Training Loss: 0.1511, Training Accuracy: 95.48%, Test accuracy: 96.00%
Epoch 4/10, Training Loss: 0.1255, Training Accuracy: 96.24%, Test accuracy: 96.37%
Epoch 5/10, Training Loss: 0.1119, Training Accuracy: 96.67%, Test accuracy: 96.21%
Epoch 6/10, Training Loss: 0.0980, Training Accuracy: 97.03%, Test accuracy: 96.57%
Epoch 7/10, Training Loss: 0.0951, Training Accuracy: 97.18%, Test accuracy: 96.40%
Epoch 8/10, Training Loss: 0.0854, Training Accuracy: 97.45%, Test accuracy: 96.67%
Epoch 9/10, Training Loss: 0.0796, Training Accuracy: 97.63%, Test accuracy: 96.71%
Epoch 10/10, Training Loss: 0.0776, Training Accuracy: 97.64%, Test accuracy: 96.83%


In [46]:
""" Selecting randomly from the layers activations 
    without any activation filtering approach

    The evidence shows that the model is not learning
    and the accuracy is not improving


"""

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

def wd(layers: list, n_samples: int = 256):
    """Pick scaling factors uniformly at random from all pooled activations.

    All arrays in ``layers`` are flattened and pooled (no sign filtering is
    applied in this variant) and ``n_samples`` values are drawn with
    replacement.

    Args:
        layers: List of array-like activations recorded from earlier layers.
        n_samples: Number of values to draw; defaults to 256.

    Returns:
        A float32 tensor of shape ``(n_samples, 1)``.

    Raises:
        ValueError: If ``layers`` is empty or holds no value at all.
    """
    if len(layers) == 0:
        raise ValueError("wd() needs at least one activation array")
    values = np.concatenate([np.asarray(layer).ravel() for layer in layers])
    if values.shape[0] == 0:
        raise ValueError("wd() found no activations to sample from")
    # randint's `high` is exclusive, so pass the full length: `length - 2`
    # silently excluded the last two values and failed for tiny pools.
    picks = np.random.randint(0, high=values.shape[0], size=n_samples)
    res = values[picks].reshape(-1, 1)
    return torch.from_numpy(np.array(res, dtype="float32"))

class MLPWD(nn.Module):
    """MLP for flattened 3072-feature inputs with two ``LinW`` hidden layers.

    Layout: Flatten -> Linear(3072, 256) -> LinW(256, 256) -> LinW(256, 256)
    -> Linear(256, 10), with GELU after each of the first three linear layers.
    Indexing and ``len`` expose the two ``LinW`` layers.
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(3072, width)
        self.l2 = LinW(in_features=width, out_features=width, depth=0)
        self.l3 = LinW(in_features=width, out_features=width, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(width, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        """Return the 10-way logits for a batch ``x``."""
        # Detached NumPy copies of the hidden activations seen so far; each
        # LinW layer receives the history accumulated before it runs.
        history = []
        hidden = self.gelu(self.l1(self.flatten(x)))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l2(hidden, history))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l3(hidden, history))
        return self.l4(hidden)

    def __getitem__(self, idx):
        """Return the ``LinW`` layer stored at position ``idx``."""
        return self.layers[idx]

    def __len__(self):
        """Return how many ``LinW`` layers the model holds."""
        return len(self.layers)
    

class LinW(nn.Linear):
    """Linear layer whose input is rescaled by values produced by ``wd``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        depth: Position of the layer; also caps how many entries of
            ``layers`` are kept.
        layers: Optional list of earlier layers; the first ``depth`` entries
            are stored on ``self.layers``.
    """

    def __init__(self, in_features, out_features, depth, layers=None):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        # A fresh list per instance: a mutable default would be shared by
        # every LinW constructed without `layers`.
        layers = [] if layers is None else list(layers)
        self.layers = layers[:self.depth]

    def forward(self, input, prev=None):
        """Apply the linear map to ``input`` scaled feature-wise by ``wd(prev)``.

        Args:
            input: Batch of inputs for the linear map.
            prev: List of earlier activations handed to ``wd``. When omitted
                or empty no scaling is applied (plain linear layer).

        Returns:
            The output of the linear transformation.
        """
        if prev is None or len(prev) == 0:
            # wd() cannot sample from nothing; behave as an ordinary Linear.
            return F.linear(input, self.weight, self.bias)
        # The scale must broadcast over the last axis of `input`, i.e. wd is
        # expected to yield in_features values - TODO confirm against wd.
        # Follow the input's device instead of assuming 'cuda:0', so the CPU
        # fallback selected by the script keeps working.
        scale = wd(prev).squeeze(1).to(input.device)
        return F.linear(input * scale, self.weight, self.bias)

# Training configuration.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CIFAR-10 train/test splits, downloaded to ./data on first use.
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Halves the learning rate after every 10 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print(
    "LinW layers:",
    "\n".join(f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))),
    sep="\n\n",
)

# One training pass and one evaluation pass per epoch.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        model, train_loader, criterion, optimizer, device
    )
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


Files already downloaded and verified
Files already downloaded and verified
LinW layers:

Depth 0: LinW(in_features=256, out_features=256, bias=True)
Depth 1: LinW(in_features=256, out_features=256, bias=True)
Epoch 1/10, Training Loss: 2.3640, Training Accuracy: 10.02%, Test accuracy: 10.00%
Epoch 2/10, Training Loss: 2.3029, Training Accuracy: 10.03%, Test accuracy: 10.00%
Epoch 3/10, Training Loss: 2.3029, Training Accuracy: 9.88%, Test accuracy: 9.99%
Epoch 4/10, Training Loss: 2.3028, Training Accuracy: 9.97%, Test accuracy: 10.00%
Epoch 5/10, Training Loss: 2.3029, Training Accuracy: 9.78%, Test accuracy: 9.94%
Epoch 6/10, Training Loss: 2.3029, Training Accuracy: 9.79%, Test accuracy: 10.00%
Epoch 7/10, Training Loss: 2.3028, Training Accuracy: 10.04%, Test accuracy: 10.00%
Epoch 8/10, Training Loss: 2.3029, Training Accuracy: 9.94%, Test accuracy: 10.00%
Epoch 9/10, Training Loss: 2.3029, Training Accuracy: 9.92%, Test accuracy: 10.00%
Epoch 10/10, Training Loss: 2.3029, Traini

In [47]:
""" Selecting randomly from the layers activations 
    filtering the negative activations

    It seems that the network still converges but the
    accuracy isn't better than the backprop approach
"""

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

def wd(layers: list, n_samples: int = 256):
    """Pick scaling factors uniformly at random from the negative activations.

    All arrays in ``layers`` are flattened and pooled, only strictly negative
    values are kept, and ``n_samples`` of them are drawn with replacement.

    Args:
        layers: List of array-like activations recorded from earlier layers.
        n_samples: Number of values to draw; defaults to 256.

    Returns:
        A float32 tensor of shape ``(n_samples, 1)``.

    Raises:
        ValueError: If ``layers`` is empty or holds no negative value.
    """
    if len(layers) == 0:
        raise ValueError("wd() needs at least one activation array")
    values = np.concatenate([np.asarray(layer).ravel() for layer in layers])
    values = values[values < 0]
    if values.shape[0] == 0:
        raise ValueError("wd() found no negative activations to sample from")
    # randint's `high` is exclusive, so pass the full length: `length - 1`
    # would never pick the last value and fails for a single-element pool.
    picks = np.random.randint(0, high=values.shape[0], size=n_samples)
    res = values[picks].reshape(-1, 1)
    return torch.from_numpy(np.array(res, dtype="float32"))

class MLPWD(nn.Module):
    """MLP for flattened 3072-feature inputs with two ``LinW`` hidden layers.

    Layout: Flatten -> Linear(3072, 256) -> LinW(256, 256) -> LinW(256, 256)
    -> Linear(256, 10), with GELU after each of the first three linear layers.
    Indexing and ``len`` expose the two ``LinW`` layers.
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(3072, width)
        self.l2 = LinW(in_features=width, out_features=width, depth=0)
        self.l3 = LinW(in_features=width, out_features=width, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(width, 10)
        self.gelu = nn.GELU()
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        """Return the 10-way logits for a batch ``x``."""
        # Detached NumPy copies of the hidden activations seen so far; each
        # LinW layer receives the history accumulated before it runs.
        history = []
        hidden = self.gelu(self.l1(self.flatten(x)))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l2(hidden, history))
        history.append(hidden.detach().cpu().numpy())
        hidden = self.gelu(self.l3(hidden, history))
        return self.l4(hidden)

    def __getitem__(self, idx):
        """Return the ``LinW`` layer stored at position ``idx``."""
        return self.layers[idx]

    def __len__(self):
        """Return how many ``LinW`` layers the model holds."""
        return len(self.layers)
    

class LinW(nn.Linear):
    """Linear layer whose input is rescaled by values produced by ``wd``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        depth: Position of the layer; also caps how many entries of
            ``layers`` are kept.
        layers: Optional list of earlier layers; the first ``depth`` entries
            are stored on ``self.layers``.
    """

    def __init__(self, in_features, out_features, depth, layers=None):
        super(LinW, self).__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        # A fresh list per instance: a mutable default would be shared by
        # every LinW constructed without `layers`.
        layers = [] if layers is None else list(layers)
        self.layers = layers[:self.depth]

    def forward(self, input, prev=None):
        """Apply the linear map to ``input`` scaled feature-wise by ``wd(prev)``.

        Args:
            input: Batch of inputs for the linear map.
            prev: List of earlier activations handed to ``wd``. When omitted
                or empty no scaling is applied (plain linear layer).

        Returns:
            The output of the linear transformation.
        """
        if prev is None or len(prev) == 0:
            # wd() cannot sample from nothing; behave as an ordinary Linear.
            return F.linear(input, self.weight, self.bias)
        # The scale must broadcast over the last axis of `input`, i.e. wd is
        # expected to yield in_features values - TODO confirm against wd.
        # Follow the input's device instead of assuming 'cuda:0', so the CPU
        # fallback selected by the script keeps working.
        scale = wd(prev).squeeze(1).to(input.device)
        return F.linear(input * scale, self.weight, self.bias)

# Training configuration.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CIFAR-10 train/test splits, downloaded to ./data on first use.
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = MLPWD().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Halves the learning rate after every 10 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print(
    "LinW layers:",
    "\n".join(f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))),
    sep="\n\n",
)

# One training pass and one evaluation pass per epoch.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        model, train_loader, criterion, optimizer, device
    )
    test_accuracy = evaluate(device, model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%')
    lr_scheduler.step()


Files already downloaded and verified
Files already downloaded and verified
LinW layers:

Depth 0: LinW(in_features=256, out_features=256, bias=True)
Depth 1: LinW(in_features=256, out_features=256, bias=True)
Epoch 1/10, Training Loss: 2.1189, Training Accuracy: 19.49%, Test accuracy: 24.11%
Epoch 2/10, Training Loss: 1.9967, Training Accuracy: 24.88%, Test accuracy: 26.07%
Epoch 3/10, Training Loss: 1.9454, Training Accuracy: 27.92%, Test accuracy: 29.07%
Epoch 4/10, Training Loss: 1.9000, Training Accuracy: 30.44%, Test accuracy: 31.48%
Epoch 5/10, Training Loss: 1.8667, Training Accuracy: 31.60%, Test accuracy: 32.62%
Epoch 6/10, Training Loss: 1.8423, Training Accuracy: 32.65%, Test accuracy: 34.03%
Epoch 7/10, Training Loss: 1.8235, Training Accuracy: 33.38%, Test accuracy: 33.37%
Epoch 8/10, Training Loss: 1.8031, Training Accuracy: 34.34%, Test accuracy: 35.02%
Epoch 9/10, Training Loss: 1.7879, Training Accuracy: 34.94%, Test accuracy: 35.54%
Epoch 10/10, Training Loss: 1.7748

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import numpy as np
from sklearn.neighbors import KernelDensity

def wd(layers: list, n_samples: int = 256):
    """Draw scaling factors from a Gaussian KDE fitted on positive activations.

    All arrays in ``layers`` are flattened and pooled, values that are not
    strictly positive are discarded, a Gaussian kernel density estimate
    (bandwidth 0.2) is fitted on the rest and ``n_samples`` values are drawn.

    Args:
        layers: List of array-like activations recorded from earlier layers.
        n_samples: Number of values to draw; defaults to 256.

    Returns:
        A float32 tensor holding the drawn values, one per row.

    Raises:
        ValueError: If ``layers`` is empty or holds no positive value.
    """
    if len(layers) == 0:
        raise ValueError("wd() needs at least one activation array")
    values = np.concatenate([np.asarray(layer).ravel() for layer in layers])
    # KernelDensity expects a 2-D (n_points, n_features) array.
    values = values[values > 0].reshape(-1, 1)
    if values.shape[0] == 0:
        raise ValueError("wd() found no positive activations to fit the density on")
    density = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(values)
    samples = density.sample(n_samples)
    return torch.from_numpy(np.array(samples, dtype="float32"))

class MLPWD(nn.Module):
    """Four-layer GELU perceptron whose two hidden layers are ``LinW`` layers.

    Inputs are flattened to 3072 features and mapped to 10 logits. Each
    ``LinW`` layer is additionally handed the activations produced so far in
    the current forward pass (as detached NumPy arrays).
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(3072, width)
        self.l2 = LinW(in_features=width, out_features=width, depth=0)
        self.l3 = LinW(in_features=width, out_features=width, depth=1, layers=[self.l2])
        self.l4 = nn.Linear(width, 10)
        self.gelu = nn.GELU()
        # Plain Python list backing __getitem__ / __len__ below.
        self.layers = [self.l2, self.l3]

    def forward(self, x):
        """Compute logits, feeding earlier activations to each LinW layer."""
        history = []  # detached CPU copies of the activations seen so far

        hidden = self.gelu(self.l1(self.flatten(x)))
        history.append(hidden.detach().cpu().numpy())

        hidden = self.gelu(self.l2(hidden, history))
        history.append(hidden.detach().cpu().numpy())

        hidden = self.gelu(self.l3(hidden, history))
        return self.l4(hidden)

    def __getitem__(self, idx):
        """Return the ``idx``-th LinW layer."""
        return self.layers[idx]

    def __len__(self):
        """Return the number of LinW layers."""
        return len(self.layers)
    

class LinW(nn.Linear):
    """Linear layer whose weight is rescaled on every forward pass.

    The scaling factors come from ``wd``, which is given the activations
    collected earlier in the forward pass.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        depth: Position of this layer among the LinW layers; also the number
            of entries of ``layers`` that are kept.
        layers: Optional list of earlier LinW layers. Only the first
            ``depth`` entries are stored.
    """

    def __init__(self, in_features, out_features, depth, layers=None):
        super().__init__(in_features=in_features, out_features=out_features)
        self.depth = depth
        # Always store a fresh list so instances never share (or mutate) a
        # default argument object.
        self.layers = list(layers[:depth]) if layers else []

    def forward(self, input, prev=None):
        """Apply the linear map with a ``wd``-scaled weight.

        Args:
            input: Input tensor for ``F.linear``.
            prev: Earlier activations passed on to ``wd``; ``None`` is
                treated as an empty list.

        Returns:
            ``F.linear(input, weight * scale, bias)``. ``scale`` is the
            squeezed ``wd`` output, so it broadcasts along the weight's last
            (``in_features``) axis and its length must match that axis.
        """
        activations = prev if prev is not None else []
        # Follow the layer's own device instead of assuming 'cuda:0', so the
        # layer also works on CPU-only machines and on other GPUs.
        scale = wd(activations).squeeze(1).to(self.weight.device)
        return F.linear(input, self.weight * scale, self.bias)

# Training configuration for the MLPWD run on CIFAR-10.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CIFAR-10 splits stored under ./data; ToTensor is the only transform applied.
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training batches are reshuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

model = MLPWD().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
# NOTE(review): step_size equals EPOCHS, so the halving only takes effect
# after the last epoch has already been trained.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=10)

# List the LinW layers (the model is indexable over them) before training.
layer_lines = [f"Depth {model[i].depth}: {model[i]}" for i in range(len(model))]
print("LinW layers:", "\n".join(layer_lines), sep="\n\n")

for epoch in range(EPOCHS):
    # One pass over the training set, then a full evaluation on the test set.
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    epoch_report = f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%'
    print(epoch_report)
    lr_scheduler.step()


In [19]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Plain four-layer perceptron with GELU activations.

    Args:
        in_features: Number of features per sample after flattening.
            Defaults to 784, the size that used to be hard-coded.
        hidden_features: Width of the three hidden layers. Defaults to 256.
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, in_features=784, hidden_features=256, num_classes=10):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(in_features, hidden_features)
        self.l2 = nn.Linear(hidden_features, hidden_features)
        self.l3 = nn.Linear(hidden_features, hidden_features)
        self.l4 = nn.Linear(hidden_features, num_classes)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Return raw (unnormalised) class logits for the batch ``x``.

        All dimensions after the first are flattened before the first linear
        layer; GELU follows every layer except the last.
        """
        x = self.flatten(x)
        x = self.gelu(self.l1(x))
        x = self.gelu(self.l2(x))
        x = self.gelu(self.l3(x))
        return self.l4(x)
    

# Training configuration for the baseline MLP on MNIST.
# This cell imports only torch and torch.nn itself; `datasets`, `transforms`,
# `DataLoader`, `optim`, `train` and `evaluate` must already be in the kernel.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MNIST splits stored under ./data; ToTensor is the only transform applied.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training batches are reshuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

model = MLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
# NOTE(review): step_size equals EPOCHS, so the halving only takes effect
# after the last epoch has already been trained.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=10)

for epoch in range(EPOCHS):
    # One pass over the training set, then a full evaluation on the test set.
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    epoch_report = f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%'
    print(epoch_report)
    lr_scheduler.step()

Epoch 1/10, Training Loss: 0.2664, Training Accuracy: 91.96%, Test accuracy: 95.56%
Epoch 2/10, Training Loss: 0.1017, Training Accuracy: 96.92%, Test accuracy: 97.26%
Epoch 3/10, Training Loss: 0.0686, Training Accuracy: 97.84%, Test accuracy: 97.71%
Epoch 4/10, Training Loss: 0.0496, Training Accuracy: 98.42%, Test accuracy: 97.56%
Epoch 5/10, Training Loss: 0.0399, Training Accuracy: 98.70%, Test accuracy: 97.83%
Epoch 6/10, Training Loss: 0.0308, Training Accuracy: 98.99%, Test accuracy: 97.61%
Epoch 7/10, Training Loss: 0.0244, Training Accuracy: 99.17%, Test accuracy: 97.47%
Epoch 8/10, Training Loss: 0.0233, Training Accuracy: 99.24%, Test accuracy: 97.71%
Epoch 9/10, Training Loss: 0.0194, Training Accuracy: 99.39%, Test accuracy: 97.82%
Epoch 10/10, Training Loss: 0.0171, Training Accuracy: 99.43%, Test accuracy: 98.06%


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

class MLP(nn.Module):
    """Plain four-layer perceptron with GELU activations.

    Args:
        in_features: Number of features per sample after flattening.
            Defaults to 3072, the size that used to be hard-coded.
        hidden_features: Width of the three hidden layers. Defaults to 256.
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, in_features=3072, hidden_features=256, num_classes=10):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(in_features, hidden_features)
        self.l2 = nn.Linear(hidden_features, hidden_features)
        self.l3 = nn.Linear(hidden_features, hidden_features)
        self.l4 = nn.Linear(hidden_features, num_classes)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Return raw (unnormalised) class logits for the batch ``x``.

        All dimensions after the first are flattened before the first linear
        layer; GELU follows every layer except the last.
        """
        x = self.flatten(x)
        x = self.gelu(self.l1(x))
        x = self.gelu(self.l2(x))
        x = self.gelu(self.l3(x))
        return self.l4(x)
    

# Training configuration for the baseline MLP on CIFAR-10.
EPOCHS = 10
BATCH_SIZE = 64

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CIFAR-10 splits stored under ./data; ToTensor is the only transform applied.
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training batches are reshuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

model = MLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
# NOTE(review): step_size equals EPOCHS, so the halving only takes effect
# after the last epoch has already been trained.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=10)

for epoch in range(EPOCHS):
    # One pass over the training set, then a full evaluation on the test set.
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_accuracy = evaluate(device, model, test_loader)
    epoch_report = f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test accuracy: {test_accuracy:.2f}%'
    print(epoch_report)
    lr_scheduler.step()

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/10, Training Loss: 1.8242, Training Accuracy: 33.66%, Test accuracy: 38.77%
Epoch 2/10, Training Loss: 1.6521, Training Accuracy: 40.46%, Test accuracy: 42.90%
Epoch 3/10, Training Loss: 1.5589, Training Accuracy: 43.71%, Test accuracy: 43.64%
Epoch 4/10, Training Loss: 1.4902, Training Accuracy: 46.38%, Test accuracy: 46.61%
Epoch 5/10, Training Loss: 1.4421, Training Accuracy: 48.10%, Test accuracy: 47.41%
Epoch 6/10, Training Loss: 1.4022, Training Accuracy: 49.61%, Test accuracy: 48.92%
Epoch 7/10, Training Loss: 1.3659, Training Accuracy: 50.90%, Test accuracy: 50.17%
Epoch 8/10, Training Loss: 1.3299, Training Accuracy: 52.06%, Test accuracy: 49.90%
Epoch 9/10, Training Loss: 1.2966, Training Accuracy: 53.28%, Test accuracy: 49.69%
Epoch 10/10, Training Loss: 1.2629, Training Accuracy: 54.69%, Test accuracy: 50.50%
