In [None]:
pip install torch torchvision matplotlib

Unzip the MNIST training CSV archive (downloaded from Kaggle):

In [None]:
import zipfile
import os

# Unpack the zipped training CSV into the current working directory.
archive_path = "mnist_train.csv.zip"
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall()

# List the working directory so the extracted file can be confirmed by eye.
extracted = os.listdir()
print("Extracted files:", extracted)

MNIST CNN trained once per activation function (Tangma, ReLU, Swish, GELU) to compare them:

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
import pandas as pd
import time
import matplotlib.pyplot as plt


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


#defining Tangma
class Tangma(nn.Module):
    """Learnable activation computing ``x * tanh(x + alpha) + gamma * x``.

    ``alpha`` and ``gamma`` are scalar parameters, both initialised to 0.0,
    so they are updated by whatever optimizer receives this module's parameters.
    """

    def __init__(self):
        super().__init__()
        self.alpha = nn.Parameter(torch.tensor(0.0))
        self.gamma = nn.Parameter(torch.tensor(0.0))

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        gated = x * torch.tanh(x + self.alpha)  # tanh gate shifted by alpha
        skip = self.gamma * x                   # linear term scaled by gamma
        return gated + skip

#defining Swish 
class Swish(nn.Module):
    """Swish activation, ``x * sigmoid(x)``; holds no learnable parameters."""

    def forward(self, x):
        """Scale each element of ``x`` by its own sigmoid."""
        gate = torch.sigmoid(x)
        return x * gate


class MNIST_CNN(nn.Module):
    """Small CNN classifier (two conv layers, two linear layers) with a pluggable activation.

    The single ``activation`` module passed in is applied after ``conv1``,
    ``conv2`` and ``fc1``; any learnable parameters it holds are therefore
    shared between those three call sites.
    """

    def __init__(self, activation):
        super().__init__()

        # Feature extractor: two 3x3 convolutions with stride 1 and no padding.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)

        # Classifier head; 9216 = 64 channels * 12 * 12 after pooling a 28x28 input.
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

        self.activation = activation

    def forward(self, x):
        """Return raw class scores of shape (batch, 10); no softmax is applied.

        Per-sample shapes in the comments below assume a (1, 28, 28) input.
        """
        x = self.activation(self.conv1(x))      # -> (32, 26, 26)
        x = self.activation(self.conv2(x))      # -> (64, 24, 24)
        x = self.dropout1(F.max_pool2d(x, 2))   # -> (64, 12, 12)
        x = torch.flatten(x, 1)                 # -> (9216,)
        x = self.dropout2(self.activation(self.fc1(x)))  # -> (128,)
        return self.fc2(x)                      # -> (10,)


# Load the training CSV: column 0 is used as the label, every remaining column as pixel data.
df = pd.read_csv("mnist_train.csv")

# Scale pixels by 1/255 (raw values are expected in [0, 255]) and reshape each
# row into a single-channel 28x28 image.
X = torch.tensor(df.iloc[:, 1:].values / 255.0, dtype=torch.float32).view(-1, 1, 28, 28)
y = torch.tensor(df.iloc[:, 0].values, dtype=torch.long)  # integer class indices for CrossEntropyLoss

# Pair every image with its label.
full_dataset = TensorDataset(X, y)

# 80/20 split; the held-out 20% is what train_model evaluates on after each epoch.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size

# Seed the split with a dedicated generator so the same samples land in the
# held-out set on every run; otherwise results from different runs are
# measured on different validation data and cannot be compared.
SPLIT_SEED = 42
train_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Mini-batches of 64; only the training batches are reshuffled each epoch.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


#training
def train_model(model, name, epochs=10):
    """Train ``model`` and evaluate it after every epoch.

    Reads the module-level ``device``, ``train_loader`` and ``test_loader``.
    Optimisation uses Adam (lr=0.001) with cross-entropy loss.

    Args:
        model: network to train; it is moved to ``device`` first.
        name: label shown in the log lines. When it equals "Tangma", the
            current ``model.activation.alpha`` / ``gamma`` values are printed
            at batch indices 130 and 260 of every epoch.
        epochs: number of passes over ``train_loader``.

    Returns:
        ``(train_losses, val_losses, val_accuracies, times)`` - four lists
        with one entry per epoch: mean per-batch training loss, mean per-batch
        loss on ``test_loader``, fraction of ``test_loader.dataset`` classified
        correctly, and wall-clock seconds for the epoch (training plus
        evaluation).
    """
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    train_losses = []
    val_losses = []
    val_accuracies = []
    times = []

    report_tangma_params = name == "Tangma"
    probe_batches = (130, 260)

    for epoch in range(epochs):
        epoch_started = time.time()

        # ---- training pass ----
        model.train()
        loss_sum = 0.0
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            # Snapshot the learnable activation parameters at two fixed points in the epoch.
            if report_tangma_params and batch_idx in probe_batches:
                print(f"[Tangma] Epoch {epoch+1}, Batch {batch_idx} | alpha = {model.activation.alpha.item():.4f}, gamma = {model.activation.gamma.item():.4f}")

            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()  # drop gradients left over from the previous step
            batch_loss = criterion(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()
            loss_sum += batch_loss.item()

        # Mean of the per-batch losses over the epoch.
        avg_train_loss = loss_sum / len(train_loader)
        train_losses.append(avg_train_loss)

        # ---- evaluation pass (no gradients, dropout disabled) ----
        model.eval()
        eval_loss_sum = 0.0
        n_correct = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                scores = model(inputs)
                eval_loss_sum += criterion(scores, labels).item()
                n_correct += (scores.argmax(dim=1) == labels).sum().item()

        avg_val_loss = eval_loss_sum / len(test_loader)
        val_accuracy = n_correct / len(test_loader.dataset)
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)

        # Epoch time covers both the training and the evaluation pass.
        times.append(time.time() - epoch_started)

        print(f"[{name}] Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, Val Acc={val_accuracy:.4f}, Time={times[-1]:.2f}s")

    return train_losses, val_losses, val_accuracies, times

# Activation modules under comparison, keyed by the label used in logs and plot legends.
activations = dict(
    Tangma=Tangma(),
    ReLU=nn.ReLU(),
    Swish=Swish(),
    GELU=nn.GELU(),
)

# Train a newly constructed CNN for each activation. results[label] holds the
# (train_losses, val_losses, val_accuracies, times) tuple returned by train_model.
results = {}
for name in activations:
    act = activations[name]
    print(f"\nTraining with {name} activation:")
    model = MNIST_CNN(act)
    results[name] = train_model(model, name)



# Plot every per-epoch metric as its own figure, one curve per activation.
# The x-axis is numbered from 1 so the curves line up with the "Epoch N"
# lines printed during training (plotting the bare lists would start at 0).
# metric_idx selects the matching list from each train_model result tuple.
for metric_idx, metric_name in enumerate(["Train Loss", "Val Loss", "Val Accuracy", "Time"]):
    fig, ax = plt.subplots()
    for name in activations:
        history = results[name][metric_idx]
        epoch_numbers = range(1, len(history) + 1)
        ax.plot(epoch_numbers, history, label=name)
    ax.set_title(metric_name + " per Epoch")
    ax.set_xlabel("Epoch")
    ax.set_ylabel(metric_name)
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    # File name is derived from the metric label, e.g. "Val Loss" -> mnist_val_loss.png.
    fig.savefig(f"mnist_{metric_name.lower().replace(' ', '_')}.png")
    plt.show()




SyntaxError: expected 'else' after 'if' expression (1980119682.py, line 11)