# SPARSE AUTOENCODERS

A sparse autoencoder is a type of neural network architecture that is used for unsupervised learning. It is an extension of a basic autoencoder and involves adding a sparsity constraint to the learning process. The idea is to encourage the encoder to learn a compact representation of the input data, where most of the activations in the hidden layer are zero (i.e., sparse). This results in a more interpretable and computationally efficient representation of the data. The sparse constraint is typically achieved by adding a sparsity regularization term to the loss function, which penalizes the network if the average activation degree of the hidden layer neurons exceeds a certain threshold. The sparse autoencoder can be used for various tasks such as image compression, feature learning, and dimensionality reduction.

Yes, the average activation degree of hidden layer neurons is commonly used in the training process of sparse autoencoders. Sparse autoencoders aim to encourage the hidden layer neurons to have low activation values for most of the inputs, only a small percentage of neurons would activate for a given input. The sparsity constraint can be achieved by adding a sparsity loss term to the autoencoder's loss function which encourages a target average activation degree for the hidden layer neurons. The average activation degree of the hidden layer neurons is usually calculated as the mean activation across all neurons and all inputs in a batch.

## AVERAGE ACTIVATION DEGREE OF A HIDDEN LAYER
The average activation degree of a hidden layer neuron in a neural network is the **average output of the activation function applied to the weighted sum of inputs from the previous layer**. It measures the level of activation of the neuron and its contribution to the overall output of the network. The activation degree is calculated for each neuron and can vary between 0 and 1 for binary activation functions, such as the sigmoid function, or between negative and positive values for other activation functions.

In [1]:
# import the necessary libraries
from tqdm.auto import tqdm
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Load the digits dataset from scikit-learn
digits = load_digits()
X = digits.images
y = digits.target

# Reshape the images to be in the form (number of samples, number of features)
X = X.reshape((len(X), -1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Convert numpy arrays to torch tensors
X_train = torch.tensor(X_train, dtype=torch.float)
X_test = torch.tensor(X_test, dtype=torch.float)

# Create custom activation functions
class Satlin(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        # return torch.where(x > 0, x, x/(1 + torch.abs(x)))
        return torch.clamp(x, 0, 1)
    
class Purelin(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        return x
    
# MAPE Metric for evaluation
class Mape(nn.Module):
    def forward(self, true, pred):
        num = torch.sum(torch.abs(true - pred))
        den = torch.sum(torch.abs(true)) * true.shape[0]
        mape = 100 * (num/den)
        return mape
    

class SparseAutoencoder(nn.Module):
    def __init__(self, input_size, hidden_size, sparsity_param, beta):
        super(SparseAutoencoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.sparsity_param = sparsity_param
        self.beta = beta
        self.satlin = Satlin()
        self.purelin = Purelin()
        
        self.encoder = nn.Linear(input_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, input_size)
        
    def forward(self, x):
        x = self.encoder(x)
        x = self.satlin(x)
        x = self.decoder(x)
        x = self.purelin(x)
        
        avg_act = torch.mean(x, dim=0)
        
        # sparsity_loss = self.sparsity_param * torch.log(self.sparsity_param / avg_act) + \
        #     (1 - self.sparsity_param) * torch.log((1 - self.sparsity_param) / (1 - avg_act))
        sparsity_loss = self.sparsity_param * torch.log(self.sparsity_param / avg_act) + \
                        torch.log((1-self.sparsity_param)/(1-avg_act))
        
        sparsity_loss = self.beta * torch.sum(sparsity_loss)
        
        return x, sparsity_loss
    
def train(model, train_loader, criterion, optimizer, beta):
    model.train()
    train_loss = 0
    for data in train_loader:
        optimizer.zero_grad()
        outputs, sparsity_loss = model(data)
        loss = criterion(outputs, data) + sparsity_loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
        m = mape(data, outputs)
    return train_loss / len(train_loader), m

def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    for data in test_loader:
        with torch.no_grad():
            outputs, _ = model(data)
            loss = criterion(data, outputs)
            m = mape(data, outputs)
            test_loss += loss
    # print(f"Test loss: {test_loss/len(test_loader):.4f}, Test MAPE: {m:.4f}")
    return test_loss/len(test_loader), m

input_size = 64
hidden_size = 100
sparsity_param = 0.05
beta = 3

model = SparseAutoencoder(input_size, hidden_size, sparsity_param, beta)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
mape = Mape()

# Replace this with your own data loading code
train_loader = X_train
test_loader = X_test

n_epochs = 10
for epoch, _ in enumerate(tqdm(range(n_epochs), desc='Training model', total=n_epochs)):
    train_loss, m = train(model, train_loader, criterion, optimizer, beta)
    test_loss, test_m = test(model, test_loader, criterion)
    
    acc = 100 - m
    test_acc = 100 - test_m
    print(f"Epoch: [{epoch}/{n_epochs}] Train Loss: {train_loss:.4f}, Train MAPE: {m:.4f}, Train Acc: {acc:.4f}")
    print(f"Epoch: [{epoch}/{n_epochs}] Test Loss: {test_loss:.4f}, Test MAPE: {test_m:.4f}, Test Acc: {test_acc:.4f}\n")

Training model:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch: [0/10] Train Loss: 36.2877, Train MAPE: 1.5438, Train Acc: 98.4562
Epoch: [0/10] Test Loss: 30.1308, Test MAPE: 1.3893, Test Acc: 98.6107

Epoch: [1/10] Train Loss: 31.0789, Train MAPE: 1.4962, Train Acc: 98.5038
Epoch: [1/10] Test Loss: 27.4427, Test MAPE: 1.3795, Test Acc: 98.6205

Epoch: [2/10] Train Loss: 29.2663, Train MAPE: 1.5112, Train Acc: 98.4888
Epoch: [2/10] Test Loss: 25.9166, Test MAPE: 1.3777, Test Acc: 98.6223

Epoch: [3/10] Train Loss: 28.3035, Train MAPE: 1.4835, Train Acc: 98.5165
Epoch: [3/10] Test Loss: 25.1895, Test MAPE: 1.3791, Test Acc: 98.6209

Epoch: [4/10] Train Loss: 27.6367, Train MAPE: 1.4792, Train Acc: 98.5208
Epoch: [4/10] Test Loss: 24.9073, Test MAPE: 1.3751, Test Acc: 98.6249

Epoch: [5/10] Train Loss: 27.3457, Train MAPE: 1.3975, Train Acc: 98.6025
Epoch: [5/10] Test Loss: 24.3469, Test MAPE: 1.3739, Test Acc: 98.6261

Epoch: [6/10] Train Loss: 27.0100, Train MAPE: 1.4349, Train Acc: 98.5651
Epoch: [6/10] Test Loss: 24.2027, Test MAPE: 1.379

# Variational Autoencoder

Variational Autoencoders (VAE) are a type of generative model that can be used for various applications including:

* Data generation: VAEs can be used to generate new data points that are similar to the training data by sampling from the encoded latent space.

* Data compression: VAEs can be used as an unsupervised learning method for compressing high-dimensional data into a lower dimensional latent space, where the data can be reconstructed with a lower error.

* Data visualization: By projecting high-dimensional data into a 2D latent space, VAEs can be used to visualize complex data structures and relationships.

* Anomaly detection: VAEs can be used to identify anomalies in data by monitoring the reconstruction error and identifying cases where the difference between the input and reconstructed data is large.

* Recommendation systems: By learning to encode the underlying patterns of a dataset, VAEs can be used to make recommendations based on similar patterns in the data.

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, latent_dim * 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        mean, log_var = x[:, :latent_dim], x[:, latent_dim:]
        return mean, log_var

class Decoder(nn.Module):
    def __init__(self, latent_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(latent_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        self.encoder = Encoder(input_dim, hidden_dim, latent_dim)
        self.decoder = Decoder(latent_dim, hidden_dim, input_dim)

    def forward(self, x):
        mean, log_var = self.encoder(x)
        std = torch.exp(0.5 * log_var)
        epsilon = torch.randn_like(std)
        z = mean + std * epsilon
        reconstructed_x = self.decoder(z)
        return reconstructed_x, mean, log_var

input_dim = 784 # 28x28 images
hidden_dim = 128
latent_dim = 32

model = VAE(input_dim, hidden_dim, latent_dim)

### Reconstruction and KL-divergence loss for VAE 

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim, hidden_dims=(512, 256)):
        super(VAE, self).__init__()
        
        # Encoder network
        self.encoder_hidden = nn.ModuleList()
        in_dims = [input_dim] + list(hidden_dims)
        for i in range(len(hidden_dims)):
            self.encoder_hidden.append(nn.Linear(in_dims[i], in_dims[i+1]))
        
        self.mean = nn.Linear(hidden_dims[-1], latent_dim)
        self.logvar = nn.Linear(hidden_dims[-1], latent_dim)
        
        # Decoder network
        self.decoder_hidden = nn.ModuleList()
        out_dims = list(reversed(hidden_dims)) + [input_dim]
        for i in range(len(hidden_dims)):
            self.decoder_hidden.append(nn.Linear(out_dims[i], out_dims[i+1]))
        
    def encode(self, x):
        h = x
        for layer in self.encoder_hidden:
            h = F.relu(layer(h))
        mean = self.mean(h)
        logvar = self.logvar(h)
        return mean, logvar
    
    def decode(self, z):
        h = z
        for layer in self.decoder_hidden:
            h = F.relu(layer(h))
        return torch.sigmoid(h)
    
    def reparameterize(self, mean, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std
    
    def forward(self, x):
        mean, logvar = self.encode(x)
        z = self.reparameterize(mean, logvar)
        recon_x = self.decode(z)
        return recon_x, mean, logvar
    
def vae_loss(recon_x, x, mean, logvar):
    recon_loss = F.binary_cross_entropy(recon_x, x, reduction='sum')
    kl_divergence = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
    return recon_loss + kl_divergence

# DENOISING AUTOENCODER

A denoising autoencoder is a type of autoencoder that is trained to reconstruct the original input from a corrupted version of it. The purpose is to learn a robust representation that is able to recover the original signal from noisy or incomplete data.

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DenoisingAutoencoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(DenoisingAutoencoder, self).__init__()
        self.encoder = nn.Linear(input_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        x = x + torch.randn_like(x) * 0.1
        x = F.relu(self.encoder(x))
        x = self.decoder(x)
        return x

# Define the model, criterion, optimizer
model = DenoisingAutoencoder(input_size, hidden_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
for epoch in range(10):
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, data)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch {} loss: {:.4f}'.format(epoch + 1, running_loss / len(train_loader)))

Epoch 1 loss: 6.0891
Epoch 2 loss: 1.0729
Epoch 3 loss: 0.4547
Epoch 4 loss: 0.2394
Epoch 5 loss: 0.1770
Epoch 6 loss: 0.1567
Epoch 7 loss: 0.1547
Epoch 8 loss: 0.1453
Epoch 9 loss: 0.1415
Epoch 10 loss: 0.1492
