# Evaluations on MNIST, Fashion-MNIST, CIFAR-10, CIFAR-100

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, f1_score
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from transformers import BertTokenizer, BertModel
import time

class GaussianEnergyWellAttention(nn.Module):
    """Distance-based attention with a Gaussian (squared-distance) energy.

    For every query/key pair the Euclidean distance ``d`` is mapped to
    ``exp(-alpha * d**2)``; those values are then passed through a softmax
    over the key axis to obtain the attention weights.  ``alpha`` is a single
    learnable scalar initialised to 0.5.

    Note that the softmax is applied to the already-exponentiated energies
    (which lie in ``(0, 1]`` while ``alpha`` is non-negative), not to the raw
    negative distances.
    """

    def __init__(self, embedding_dim):
        # ``embedding_dim`` is accepted but not used by this module.
        super().__init__()
        self.alpha = nn.Parameter(torch.tensor(0.5), requires_grad=True)

    def forward(self, query, key, value):
        """Return ``(attended_values, attention_weights)``.

        ``query`` and ``key`` are handed to ``torch.cdist``; the resulting
        weights are matrix-multiplied with ``value``.
        """
        pairwise_l2 = torch.cdist(query, key, p=2)
        squared = pairwise_l2 ** 2
        energy = torch.exp(-self.alpha * squared)
        attn = F.softmax(energy, dim=-1)
        context = torch.matmul(attn, value)
        return context, attn


class SoftmaxExponentialEnergyWellAttention(nn.Module):
    """Distance-based attention with an exponential (linear-distance) energy.

    For every query/key pair the Euclidean distance ``d`` is mapped to
    ``exp(-alpha * d)``; a softmax over the key axis then turns those values
    into attention weights.  ``alpha`` is a single learnable scalar
    initialised to 0.5.

    The softmax is applied to the already-exponentiated energies, not to the
    raw negative distances.
    """

    def __init__(self, embedding_dim):
        # ``embedding_dim`` is accepted but not used by this module.
        super().__init__()
        self.alpha = nn.Parameter(torch.tensor(0.5), requires_grad=True)

    def forward(self, query, key, value):
        """Return ``(attended_values, attention_weights)``.

        ``query`` and ``key`` are handed to ``torch.cdist``; the resulting
        weights are matrix-multiplied with ``value``.
        """
        pairwise_l2 = torch.cdist(query, key, p=2)
        energy = torch.exp(-self.alpha * pairwise_l2)
        attn = F.softmax(energy, dim=-1)
        context = torch.matmul(attn, value)
        return context, attn


class AttentionComparisonModel(nn.Module):
    """Single attention layer + mean pooling + linear classifier.

    Args:
        embedding_dim: Size of each token vector.  The input batch is
            reshaped to ``(batch, -1, embedding_dim)``, so when
            ``embedding_dim`` equals the full flattened sample size the
            sequence has length 1.
        output_dim: Number of output logits.
        method: ``"gaussian"`` or ``"softmax_exponential"`` select the
            corresponding energy-well attention; any other value selects
            ``nn.MultiheadAttention`` with a single head.
    """

    def __init__(self, embedding_dim, output_dim, method="gaussian"):
        super(AttentionComparisonModel, self).__init__()
        self.method = method
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim

        if method == "gaussian":
            self.attention = GaussianEnergyWellAttention(embedding_dim)
        elif method == "softmax_exponential":
            self.attention = SoftmaxExponentialEnergyWellAttention(embedding_dim)
        else:
            self.attention = nn.MultiheadAttention(embedding_dim, num_heads=1)

        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for the batch ``x``."""
        tokens = x.view(x.size(0), -1, self.embedding_dim)

        if isinstance(self.attention, nn.MultiheadAttention):
            # nn.MultiheadAttention is built without ``batch_first``, so it
            # expects (seq, batch, embed).  Feeding it the batch-major tensor
            # directly would make samples of the same mini-batch attend to
            # each other, so swap the first two axes around the call.
            seq_first = tokens.transpose(0, 1)
            attended, _ = self.attention(seq_first, seq_first, seq_first)
            attended = attended.transpose(0, 1)
        else:
            attended, _ = self.attention(tokens, tokens, tokens)

        # Average over the sequence axis, then classify.
        pooled_output = attended.mean(dim=1)
        return self.fc(pooled_output)



def train_and_evaluate(model, train_loader, test_loader, optimizer, criterion, device, num_epochs=10):
    """Train ``model`` and score it on ``test_loader``.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable yielding ``(inputs, labels)`` training batches.
        test_loader: Iterable yielding ``(inputs, labels)`` evaluation batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader`` (default 10).

    Returns:
        ``(accuracy, precision, f1)`` where accuracy comes from
        ``accuracy_score`` and precision / F1 are weighted averages.
    """
    model.to(device)
    for _ in range(num_epochs):
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()

    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            predicted = model(images.to(device)).argmax(dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    # Classes that are never predicted have undefined precision.  Passing
    # zero_division=0 scores them as 0, which is the value the default
    # "warn" setting uses as well, but without emitting a warning per call.
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    return accuracy, precision, f1


def get_data_loaders(dataset_name):
    """Build train/held-out loaders for one of the supported datasets.

    For the image datasets the torchvision *training* split is downloaded
    into ``./data`` and randomly divided 80/20; the 20% part is returned as
    the "test" loader.  Batches have size 64.

    Args:
        dataset_name: One of ``"MNIST"``, ``"Fashion-MNIST"``, ``"CIFAR-10"``,
            ``"CIFAR-100"`` or ``"20 Newsgroups"``.

    Returns:
        ``(train_loader, test_loader, embedding_dim, output_dim)`` for the
        image datasets.

        NOTE(review): for ``"20 Newsgroups"`` the tuple is
        ``(embeddings, labels, 768, 20)`` instead - a list of per-document
        BERT token embeddings and the label array, not data loaders.  Callers
        that expect loaders cannot use this branch as-is.

    Raises:
        ValueError: If ``dataset_name`` is not one of the names above.
    """
    supported = ("MNIST", "Fashion-MNIST", "CIFAR-10", "CIFAR-100", "20 Newsgroups")
    if dataset_name not in supported:
        raise ValueError(
            f"Unsupported dataset {dataset_name!r}; expected one of {supported}"
        )

    if dataset_name == "20 Newsgroups":
        # Imported here because the file only imports sklearn.metrics.
        from sklearn.datasets import fetch_20newsgroups

        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        bert_model = BertModel.from_pretrained("bert-base-uncased")
        newsgroups = fetch_20newsgroups(subset='all')  # fetch once, reuse
        documents, labels = newsgroups.data, newsgroups.target
        embeddings = []
        # Inference only: no autograd graph should be kept per document.
        with torch.no_grad():
            for doc in documents:
                # bert-base-uncased accepts at most 512 positions, so longer
                # posts are truncated rather than failing inside the model.
                encoded = tokenizer(doc, return_tensors='pt', truncation=True, max_length=512)
                embeddings.append(bert_model(**encoded).last_hidden_state.squeeze(0))
        return embeddings, labels, 768, 20

    # Single-channel statistics for the grayscale sets, per-channel otherwise.
    if dataset_name in ("MNIST", "Fashion-MNIST"):
        normalize = transforms.Normalize((0.5,), (0.5,))
    else:
        normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    if dataset_name == "MNIST":
        dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
        output_dim, embedding_dim = 10, 28*28
    elif dataset_name == "Fashion-MNIST":
        dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
        output_dim, embedding_dim = 10, 28*28
    elif dataset_name == "CIFAR-10":
        dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
        output_dim, embedding_dim = 10, 32*32*3
    else:  # "CIFAR-100" - the only remaining supported name
        dataset = datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
        output_dim, embedding_dim = 100, 32*32*3

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    return train_loader, test_loader, embedding_dim, output_dim


def run_experiment_on_datasets(dataset_list):
    """Train and score the three attention variants on each named dataset.

    For every name in ``dataset_list`` the loaders are built once and a fresh
    model per variant is trained with Adam (lr=0.01) and cross-entropy loss,
    on CUDA when available and on CPU otherwise.

    Returns:
        A list with one dict per dataset.  Each dict holds the dataset name
        under ``"Dataset"`` and, under ``"Gaussian"``,
        ``"Softmax Exponential"`` and ``"Self-Attention"``, a dict with the
        keys ``"Accuracy"``, ``"Precision"`` and ``"F1 Score"``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    # (key in the result dict, ``method`` passed to AttentionComparisonModel),
    # listed in the order the variants are trained.
    variants = (
        ("Gaussian", "gaussian"),
        ("Softmax Exponential", "softmax_exponential"),
        ("Self-Attention", "self_attention"),
    )

    results = []
    for dataset_name in dataset_list:
        print(f"\nRunning experiment on {dataset_name}...")

        train_loader, test_loader, embedding_dim, output_dim = get_data_loaders(dataset_name)

        entry = {"Dataset": dataset_name}
        for label, method in variants:
            model = AttentionComparisonModel(embedding_dim, output_dim, method=method).to(device)
            optimizer = optim.Adam(model.parameters(), lr=0.01)
            accuracy, precision, f1 = train_and_evaluate(
                model, train_loader, test_loader, optimizer, criterion, device
            )
            entry[label] = {"Accuracy": accuracy, "Precision": precision, "F1 Score": f1}

        results.append(entry)

    return results


# Benchmarks evaluated by this cell.
dataset_list = ["MNIST", "Fashion-MNIST", "CIFAR-10", "CIFAR-100"]

results = run_experiment_on_datasets(dataset_list)

# One header line per dataset, then one metrics line per attention variant.
for result in results:
    print(f"\nDataset: {result['Dataset']}")
    for method, metrics in result.items():
        if method == "Dataset":
            # Not a metrics entry; already printed as the header.
            continue
        print(f"{method} -> Accuracy: {metrics['Accuracy']:.4f}, Precision: {metrics['Precision']:.4f}, F1 Score: {metrics['F1 Score']:.4f}")



Running experiment on MNIST...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Running experiment on Fashion-MNIST...
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26.4M/26.4M [00:05<00:00, 4.49MB/s]


Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29.5k/29.5k [00:00<00:00, 269kB/s]


Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4.42M/4.42M [00:02<00:00, 2.08MB/s]


Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5.15k/5.15k [00:00<00:00, 9.18MB/s]


Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Running experiment on CIFAR-10...
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:01<00:00, 97.9MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data

Running experiment on CIFAR-100...
Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169M/169M [00:02<00:00, 56.5MB/s]


Extracting ./data/cifar-100-python.tar.gz to ./data

Dataset: MNIST
Gaussian -> Accuracy: 0.8801, Precision: 0.8870, F1 Score: 0.8752
Softmax Exponential -> Accuracy: 0.8939, Precision: 0.8986, F1 Score: 0.8938
Self-Attention -> Accuracy: 0.1070, Precision: 0.0900, F1 Score: 0.0731

Dataset: Fashion-MNIST
Gaussian -> Accuracy: 0.8117, Precision: 0.8287, F1 Score: 0.8127
Softmax Exponential -> Accuracy: 0.8328, Precision: 0.8381, F1 Score: 0.8340
Self-Attention -> Accuracy: 0.1625, Precision: 0.1307, F1 Score: 0.1308

Dataset: CIFAR-10
Gaussian -> Accuracy: 0.3222, Precision: 0.3286, F1 Score: 0.3201
Softmax Exponential -> Accuracy: 0.3182, Precision: 0.3212, F1 Score: 0.3060
Self-Attention -> Accuracy: 0.1030, Precision: 0.1043, F1 Score: 0.0969

Dataset: CIFAR-100
Gaussian -> Accuracy: 0.0848, Precision: 0.0990, F1 Score: 0.0787
Softmax Exponential -> Accuracy: 0.0866, Precision: 0.1012, F1 Score: 0.0836
Self-Attention -> Accuracy: 0.0116, Precision: 0.0119, F1 Score: 0.0078


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Evaluations on STL-10, SVHN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, f1_score
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from transformers import BertTokenizer, BertModel
import time
import numpy as np


class GaussianEnergyWellAttention(nn.Module):
    """Attention weighted by a Gaussian of the query/key Euclidean distance.

    Each pairwise distance ``d`` becomes ``exp(-alpha * d**2)`` and the
    resulting matrix is normalised with a softmax along the key axis.
    ``alpha`` is one learnable scalar, initialised to 0.5.

    The softmax operates on the exponentiated energies themselves rather
    than on the negative squared distances.
    """

    def __init__(self, embedding_dim):
        # ``embedding_dim`` is accepted but not used by this module.
        super().__init__()
        self.alpha = nn.Parameter(torch.tensor(0.5), requires_grad=True)

    def forward(self, query, key, value):
        """Return the attended values together with the attention weights."""
        dist = torch.cdist(query, key, p=2)
        well = torch.exp(-self.alpha * dist ** 2)
        probs = F.softmax(well, dim=-1)
        attended = torch.matmul(probs, value)
        return attended, probs


class SoftmaxExponentialEnergyWellAttention(nn.Module):
    """Attention weighted by an exponential of the query/key distance.

    Each pairwise Euclidean distance ``d`` becomes ``exp(-alpha * d)`` and
    the resulting matrix is normalised with a softmax along the key axis.
    ``alpha`` is one learnable scalar, initialised to 0.5.

    The softmax operates on the exponentiated energies themselves rather
    than on the negative distances.
    """

    def __init__(self, embedding_dim):
        # ``embedding_dim`` is accepted but not used by this module.
        super().__init__()
        self.alpha = nn.Parameter(torch.tensor(0.5), requires_grad=True)

    def forward(self, query, key, value):
        """Return the attended values together with the attention weights."""
        dist = torch.cdist(query, key, p=2)
        well = torch.exp(-self.alpha * dist)
        probs = F.softmax(well, dim=-1)
        attended = torch.matmul(probs, value)
        return attended, probs

class AttentionComparisonModel(nn.Module):
    """One attention layer, mean pooling over tokens, then a linear head.

    ``method`` chooses the attention: ``"gaussian"`` and
    ``"softmax_exponential"`` use the energy-well modules, anything else
    uses a single-head ``nn.MultiheadAttention``.  Inputs are reshaped to
    ``(batch, -1, embedding_dim)`` before attention is applied.
    """

    def __init__(self, embedding_dim, output_dim, method="gaussian"):
        super().__init__()
        self.method = method
        self.embedding_dim = embedding_dim
        if method == "gaussian":
            self.attention = GaussianEnergyWellAttention(embedding_dim)
        elif method == "softmax_exponential":
            self.attention = SoftmaxExponentialEnergyWellAttention(embedding_dim)
        else:
            self.attention = nn.MultiheadAttention(embedding_dim, num_heads=1)

        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        """Return the classifier logits for the batch ``x``."""
        tokens = x.view(x.size(0), -1, self.embedding_dim)

        # The energy-well modules consume the batch-major tensor as is; the
        # MultiheadAttention branch gets the first two axes swapped, so its
        # token axis ends up at position 0 instead of 1.
        batch_major = self.method in ("gaussian", "softmax_exponential")
        if not batch_major:
            tokens = tokens.permute(1, 0, 2)

        attended, _ = self.attention(tokens, tokens, tokens)
        token_axis = 1 if batch_major else 0
        pooled_output = attended.mean(dim=token_axis)

        return self.fc(pooled_output)


def get_data_loaders(dataset_name, batch_size=64):
    """Build train/test loaders for STL-10 or SVHN.

    The official ``train`` and ``test`` splits are downloaded into
    ``./data``; the training loader shuffles, the test loader does not.

    Args:
        dataset_name: ``"STL-10"`` or ``"SVHN"``.
        batch_size: Batch size used by both loaders.

    Returns:
        ``(train_loader, test_loader, embedding_dim, output_dim)`` where
        ``embedding_dim`` is the flattened image size (``96*96*3`` for
        STL-10, ``32*32*3`` for SVHN) and ``output_dim`` is 10.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    supported = ("STL-10", "SVHN")
    if dataset_name not in supported:
        # Fail early with a clear message instead of hitting an unbound
        # ``train_dataset`` further down.
        raise ValueError(
            f"Unsupported dataset {dataset_name!r}; expected one of {supported}"
        )

    # NOTE(review): a single mean/std value is given although both datasets
    # are flattened as 3-channel images below - presumably it is broadcast
    # across channels; confirm against the torchvision version in use.
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

    if dataset_name == "STL-10":
        train_dataset = datasets.STL10(root='./data', split='train', download=True, transform=transform)
        test_dataset = datasets.STL10(root='./data', split='test', download=True, transform=transform)
        output_dim, embedding_dim = 10, 96*96*3
    else:  # "SVHN"
        train_dataset = datasets.SVHN(root='./data', split='train', download=True, transform=transform)
        test_dataset = datasets.SVHN(root='./data', split='test', download=True, transform=transform)
        output_dim, embedding_dim = 10, 32*32*3

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader, embedding_dim, output_dim

def train_and_evaluate(model, train_loader, test_loader, optimizer, criterion, device, num_epochs=10):
    """Train ``model`` and score it on ``test_loader``.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable yielding ``(inputs, labels)`` training batches.
        test_loader: Iterable yielding ``(inputs, labels)`` evaluation batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader`` (default 10).

    Returns:
        ``(accuracy, precision, f1)``.  Accuracy is a percentage in
        ``[0, 100]``; precision and F1 are weighted averages in ``[0, 1]``.
    """
    model.to(device)
    for _ in range(num_epochs):
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()

    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            predicted = model(images.to(device)).argmax(dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    accuracy = 100 * (np.array(y_true) == np.array(y_pred)).sum() / len(y_true)
    # Classes that are never predicted have undefined precision.  Passing
    # zero_division=0 scores them as 0, which is the value the default
    # "warn" setting uses as well, but without emitting a warning per call.
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    return accuracy, precision, f1

def run_experiment_on_datasets(dataset_list):
    """Train and score the three attention variants on each named dataset.

    Loaders are built once per dataset; each variant gets a fresh model
    trained with Adam (lr=0.001) and cross-entropy loss, on CUDA when
    available and on CPU otherwise.

    Returns:
        A list with one dict per dataset: the name under ``"Dataset"`` and an
        ``(accuracy, precision, f1)`` tuple under each of
        ``"Gaussian (Acc, Prec, F1)"``,
        ``"Softmax Exponential (Acc, Prec, F1)"`` and
        ``"Self-Attention (Acc, Prec, F1)"``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    # (key in the result dict, ``method`` passed to AttentionComparisonModel),
    # listed in the order the variants are trained.
    variants = (
        ("Gaussian (Acc, Prec, F1)", "gaussian"),
        ("Softmax Exponential (Acc, Prec, F1)", "softmax_exponential"),
        ("Self-Attention (Acc, Prec, F1)", "self_attention"),
    )

    results = []
    for dataset_name in dataset_list:
        train_loader, test_loader, embedding_dim, output_dim = get_data_loaders(dataset_name)

        entry = {"Dataset": dataset_name}
        for result_key, method in variants:
            model = AttentionComparisonModel(embedding_dim, output_dim, method=method).to(device)
            optimizer = optim.Adam(model.parameters(), lr=0.001)
            accuracy, precision, f1 = train_and_evaluate(
                model, train_loader, test_loader, optimizer, criterion, device
            )
            entry[result_key] = (accuracy, precision, f1)

        results.append(entry)

    return results

# Benchmarks evaluated by this cell.
dataset_list = ["STL-10", "SVHN"]
results = run_experiment_on_datasets(dataset_list)

# (label shown in the report, key of the metrics tuple in each result dict)
report_rows = (
    ("Gaussian Energy Well Attention", "Gaussian (Acc, Prec, F1)"),
    ("Softmax Exponential Attention", "Softmax Exponential (Acc, Prec, F1)"),
    ("Self-Attention", "Self-Attention (Acc, Prec, F1)"),
)

for result in results:
    print(f"\nDataset: {result['Dataset']}")
    for title, key in report_rows:
        accuracy, precision, f1 = result[key]
        print(f"  {title} - Accuracy: {accuracy:.2f}%, Precision: {precision:.2f}, F1 Score: {f1:.2f}")