## Comparing three activation-function configurations

In [None]:
import torch as torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim

In [None]:
# Convert each PIL image to a float tensor and normalise the single channel
# with mean 0.5 and std 0.5.
transform = transforms.Compose([
  transforms.ToTensor(),
  transforms.Normalize((0.5,), (0.5,)),
])

# Training split: reshuffled every epoch so SGD sees batches in a new order.
mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=10, shuffle=True)

# Test split: accuracy does not depend on batch order, so shuffling is skipped
# to keep evaluation deterministic.
mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=10, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 104762713.04it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 34769142.89it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 18240611.57it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 5688423.04it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



Here we train and evaluate three models that share the same architecture and differ only in the activation function applied to the hidden layer.

#### 1. ReLU activation function

In [None]:
# ReLU activation function

class ReLU_model(nn.Module):
  """Two-layer perceptron with a ReLU hidden activation.

  The input is reshaped to rows of 784 features, passed through a
  784 -> 128 linear layer followed by ReLU, and then through a
  128 -> 10 linear layer whose raw outputs are returned.
  """

  def __init__(self):
    super().__init__()
    # Layers are created in this order so parameter initialisation
    # consumes the RNG in the same sequence as before.
    self.relu1 = nn.Linear(28 * 28, 128)
    self.relu2 = nn.Linear(128, 10)

  def forward(self, x):
    """Return a (N, 10) tensor of scores for ``x`` viewed as (N, 784)."""
    flat = x.view(-1, 28 * 28)
    hidden = nn.functional.relu(self.relu1(flat))
    return self.relu2(hidden)

#### 2. Sigmoid activation function

In [None]:
# Sigmoid activation function

class Sigmoid_model(nn.Module):
  """Two-layer perceptron with a sigmoid hidden activation.

  The input is reshaped to rows of 784 features, passed through a
  784 -> 128 linear layer followed by an element-wise sigmoid, and then
  through a 128 -> 10 linear layer whose raw outputs are returned.
  """

  def __init__(self):
    super().__init__()
    # Creation order is kept so parameter initialisation is unchanged.
    self.sigmoid1 = nn.Linear(28 * 28, 128)
    self.sigmoid2 = nn.Linear(128, 10)

  def forward(self, x):
    """Return a (N, 10) tensor of scores for ``x`` viewed as (N, 784)."""
    flat = x.view(-1, 28 * 28)
    hidden = self.sigmoid1(flat).sigmoid()
    return self.sigmoid2(hidden)

#### 3. Softmax activation function

In [None]:
# Softmax activation function

class Softmax_model(nn.Module):
  """Two-layer perceptron with a softmax hidden activation.

  The input is reshaped to rows of 784 features, passed through a
  784 -> 128 linear layer, normalised with softmax across the 128 hidden
  units of each row, and then passed through a 128 -> 10 linear layer
  whose raw outputs are returned.
  """

  def __init__(self):
    super().__init__()
    # Creation order is kept so parameter initialisation is unchanged.
    self.softmax1 = nn.Linear(28 * 28, 128)
    self.softmax2 = nn.Linear(128, 10)

  def forward(self, x):
    """Return a (N, 10) tensor of scores for ``x`` viewed as (N, 784)."""
    flat = x.view(-1, 28 * 28)
    # dim=1 normalises over the hidden units, not over the batch.
    hidden = nn.functional.softmax(self.softmax1(flat), dim=1)
    return self.softmax2(hidden)

#### Cross-entropy loss function.

In [None]:
# Loss applied to each model's raw outputs and the batch labels in the training loop.
cross_el = nn.CrossEntropyLoss()
# SGD settings; the same values are used for every model in the comparison.
learning_rate = 0.01
momentum = 0.9
# Number of full passes over train_loader performed for each model.
epochs = 5

In [None]:
def _train_one_epoch(model, loader, criterion, optimizer, epoch, log_every=100):
  """Run one optimisation pass over ``loader``, printing the running loss.

  Every ``log_every`` batches the mean loss of those batches is printed and
  the accumulator is reset. ``epoch`` is zero-based and is only used to
  label the log line.
  """
  # Explicit mode switch; it changes nothing for Linear-only models but keeps
  # the loop correct if dropout or batch-norm layers are added later.
  model.train()
  running_loss = 0.0
  for step, (inputs, labels) in enumerate(loader, start=1):
    optimizer.zero_grad()
    loss = criterion(model(inputs), labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if step % log_every == 0:
      print(f'[epoch: {epoch + 1}, {step}] loss: {running_loss / log_every:.3f}')
      running_loss = 0.0


def _evaluate(model, loader):
  """Return the percentage of samples in ``loader`` that ``model`` classifies correctly."""
  model.eval()
  correct = 0
  total = 0
  # Gradients are not needed for evaluation; under no_grad the outputs can be
  # used directly, without going through the discouraged ``.data`` attribute.
  with torch.no_grad():
    for images, labels in loader:
      _, predicted = torch.max(model(images), 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()
  return 100 * correct / total


# One model per activation function, trained with identical optimiser settings.
models = [ReLU_model(), Sigmoid_model(), Softmax_model()]
accurate = []  # test accuracy (in percent) per model, in the order of ``models``

for i, model in enumerate(models):
  print(f'Model {i+1}')
  # A fresh optimiser per model so momentum buffers are not shared.
  optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

  for epoch in range(epochs):
    _train_one_epoch(model, train_loader, cross_el, optimizer, epoch)

  accuracy = _evaluate(model, test_loader)
  print(f'Accuracy: {accuracy}%')
  accurate.append(accuracy)

print('Accurate:', accurate)

Model 1
[epoch: 1, 100] loss: 1.324
[epoch: 1, 200] loss: 0.784
[epoch: 1, 300] loss: 0.712
[epoch: 1, 400] loss: 0.628
[epoch: 1, 500] loss: 0.610
[epoch: 1, 600] loss: 0.610
[epoch: 1, 700] loss: 0.552
[epoch: 1, 800] loss: 0.514
[epoch: 1, 900] loss: 0.478
[epoch: 1, 1000] loss: 0.442
[epoch: 1, 1100] loss: 0.471
[epoch: 1, 1200] loss: 0.476
[epoch: 1, 1300] loss: 0.439
[epoch: 1, 1400] loss: 0.483
[epoch: 1, 1500] loss: 0.421
[epoch: 1, 1600] loss: 0.494
[epoch: 1, 1700] loss: 0.417
[epoch: 1, 1800] loss: 0.441
[epoch: 1, 1900] loss: 0.481
[epoch: 1, 2000] loss: 0.428
[epoch: 1, 2100] loss: 0.381
[epoch: 1, 2200] loss: 0.379
[epoch: 1, 2300] loss: 0.348
[epoch: 1, 2400] loss: 0.421
[epoch: 1, 2500] loss: 0.367
[epoch: 1, 2600] loss: 0.339
[epoch: 1, 2700] loss: 0.393
[epoch: 1, 2800] loss: 0.401
[epoch: 1, 2900] loss: 0.311
[epoch: 1, 3000] loss: 0.355
[epoch: 1, 3100] loss: 0.367
[epoch: 1, 3200] loss: 0.347
[epoch: 1, 3300] loss: 0.307
[epoch: 1, 3400] loss: 0.354
[epoch: 1, 3500

Training the three models above, each with a different activation function, gives the following test accuracies:

* ReLU activation function = 94.79%
* Sigmoid activation function = 97.11%
* Softmax activation function = 73.44%

From these evaluation results, the model with the sigmoid activation function performs best, with an accuracy of 97.11%. The model with the ReLU activation function follows with 94.79%, and the model with the softmax activation function performs worst, with 73.44%. This shows that the choice of activation function can noticeably influence model performance, at least for this architecture and training setup.

