In [72]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [73]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1) #input -? OUtput? RF
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv5 = nn.Conv2d(256, 512, 3)
        self.conv6 = nn.Conv2d(512, 1024, 3)
        self.conv7 = nn.Conv2d(1024, 10, 3)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = F.relu(self.conv7(x))
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [74]:
class OptimizedNet(nn.Module):

  def __init__(self):
        super(OptimizedNet, self).__init__()

        # Conv Block 1
        self.conv1 = nn.Conv2d(1, 8, 3, padding=1)   # 28x28 → 28x28
        self.bn1   = nn.BatchNorm2d(8)
        self.dropout1 = nn.Dropout(0.1)

        # Conv Block 2
        self.conv2 = nn.Conv2d(8, 16, 3, padding=1)  # 28x28 → 28x28
        self.bn2   = nn.BatchNorm2d(16)
        self.dropout2 = nn.Dropout(0.1)

        # Conv Block 3
        self.conv3 = nn.Conv2d(16, 16, 3, padding=1)  # 14x14 → 14x14
        self.bn3   = nn.BatchNorm2d(16)
        self.dropout3 = nn.Dropout(0.1)

        # Conv Block 4
        self.conv4 = nn.Conv2d(16, 16, 3, padding=1)  # 14x14 → 14x14
        self.bn4   = nn.BatchNorm2d(16)
        self.dropout4 = nn.Dropout(0.1)

        # Conv Block 5
        self.conv5 = nn.Conv2d(16, 16, 3, padding=1) # 7x7 → 7x7
        self.bn5 = nn.BatchNorm2d(16)
        self.dropout5 = nn.Dropout(0.1)

        # Conv Block 6
        self.conv6 = nn.Conv2d(16, 16, 3, padding=1) # 7x7 → 7x7
        self.bn6 = nn.BatchNorm2d(16)
        self.dropout6 = nn.Dropout(0.1)

        # Conv Block 7
        self.conv7 = nn.Conv2d(16, 16, 3, padding=1) # 7x7 -> 7x7
        self.bn7 = nn.BatchNorm2d(16)
        self.dropout7 = nn.Dropout(0.1)

        # Conv Block 8
        self.conv8 = nn.Conv2d(16, 16, 3, padding=1) # 7x7 -> 7x7
        self.bn8 = nn.BatchNorm2d(16)
        self.dropout8 = nn.Dropout(0.1)

        # Conv Block 9
        self.conv9 = nn.Conv2d(16, 16, 3, padding=1) # 7x7 -> 7x7
        self.bn9 = nn.BatchNorm2d(16)
        self.dropout9 = nn.Dropout(0.1)

        # Conv Block 10
        self.conv10 = nn.Conv2d(16, 16, 3, padding=1) # 7x7 -> 7x7
        self.bn10 = nn.BatchNorm2d(16)
        self.dropout10 = nn.Dropout(0.1)


        # Pooling
        self.pool = nn.MaxPool2d(2, 2)

        # Global Average Pool
        self.gap = nn.AdaptiveAvgPool2d((1, 1))

        # Final linear

        self.fc = nn.Linear(16, 10) # Updated input features to match the last conv layer's output channels


  def forward(self, x):
        x = self.pool(self.dropout1(F.relu(self.bn1(self.conv1(x)))))   # 28x28 → 14x14
        x = self.pool(self.dropout2(F.relu(self.bn2(self.conv2(x)))))   # 14x14 → 7x7
        x = self.dropout3(F.relu(self.bn3(self.conv3(x))))              # 7x7 → 7x7
        x = self.dropout4(F.relu(self.bn4(self.conv4(x))))              # 7x7 → 7x7
        x = self.dropout5(F.relu(self.bn5(self.conv5(x))))              # 7x7 → 7x7
        x = self.dropout6(F.relu(self.bn6(self.conv6(x))))              # 7x7 → 7x7
        x = self.dropout7(F.relu(self.bn7(self.conv7(x))))              # 7x7 → 7x7
        x = self.dropout8(F.relu(self.bn8(self.conv8(x))))              # 7x7 → 7x7
        x = self.dropout9(F.relu(self.bn9(self.conv9(x))))              # 7x7 → 7x7
        x = self.dropout10(F.relu(self.bn10(self.conv10(x))))            # 7x7 → 7x7
        x = self.gap(x)                                  # 7x7 → 1x1
        x = torch.flatten(x, 1)                          # 96
        x = self.fc(x)
        return F.log_softmax(x, dim=1)

In [75]:
!pip install torchsummary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = OptimizedNet().to(device)

from torchsummary import summary
summary(model, input_size=(1, 28, 28))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              80
       BatchNorm2d-2            [-1, 8, 28, 28]              16
           Dropout-3            [-1, 8, 28, 28]               0
         MaxPool2d-4            [-1, 8, 14, 14]               0
            Conv2d-5           [-1, 16, 14, 14]           1,168
       BatchNorm2d-6           [-1, 16, 14, 14]              32
           Dropout-7           [-1, 16, 14, 14]               0
         MaxPool2d-8             [-1, 16, 7, 7]               0
            Conv2d-9             [-1, 16, 7, 7]           2,320
      BatchNorm2d-10             [-1, 16, 7, 7]              32
          Dropout-11             [-1, 16, 7, 7]               0
           Conv2d-12             [-1, 16, 7, 7]           2,320
      BatchNorm2d-13             [-1, 16, 7, 7]              32
          Dropout-14             [-1, 1

In [76]:

torch.manual_seed(1)
batch_size = 64

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)
# -------------------------------------------------------------------
# 3) Data loaders with normalization & augmentation
# -------------------------------------------------------------------
"""
train_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

batch_size = 64
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}



# full training dataset (60k)
full_train = datasets.MNIST('../data', train=True, download=True, transform=train_transforms)
from torch.utils.data import random_split, DataLoader

# split into 50k train + 10k validation
train_set, val_set = random_split(full_train, [50000, 10000])

# official 10k test set
test_set = datasets.MNIST('../data', train=False, transform=test_transforms)

# DataLoaders
batch_size = 64
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, **kwargs)
val_loader   = DataLoader(val_set,   batch_size=batch_size, shuffle=False, **kwargs)
test_loader  = DataLoader(test_set,  batch_size=batch_size, shuffle=False, **kwargs)
"""

"\ntrain_transforms = transforms.Compose([\n    transforms.ToTensor(),\n    transforms.Normalize((0.1307,), (0.3081,))\n])\n\ntest_transforms = transforms.Compose([\n    transforms.ToTensor(),\n    transforms.Normalize((0.1307,), (0.3081,))\n])\n\nbatch_size = 64\nkwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}\n\n\n\n# full training dataset (60k)\nfull_train = datasets.MNIST('../data', train=True, download=True, transform=train_transforms)\nfrom torch.utils.data import random_split, DataLoader\n\n# split into 50k train + 10k validation\ntrain_set, val_set = random_split(full_train, [50000, 10000])\n\n# official 10k test set\ntest_set = datasets.MNIST('../data', train=False, transform=test_transforms)\n\n# DataLoaders\nbatch_size = 64\nkwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}\n\ntrain_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, **kwargs)\nval_loader   = DataLoader(val_set,   batch_size=batch_size, shuffle=False, **k

In [77]:
""" from tqdm import tqdm
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    pbar = tqdm(train_loader)
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset))) """
# -------------------------------------------------------------------
# 4) Training & testing functions with dropout, BN, and learning rate scheduling
# -------------------------------------------------------------------
from tqdm import tqdm

def train(epoch):
    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}")
    correct, processed = 0, 0
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)
        pbar.set_postfix({'loss': f"{loss.item():.4f}",
                          'acc': f"{100*correct/processed:.2f}%"})

def test():
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    acc = 100. * correct / len(test_loader.dataset)
    print(f"\nTest  Avg loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({acc:.2f}%)\n")
    return acc

In [78]:
"""
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 2):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader) """
# -------------------------------------------------------------------
# 5) Optimizer, scheduler, and training loop
# -------------------------------------------------------------------
#optimizer = optim.SGD(model.parameters(), lr=0.001)
#optimizer = optim.Adam(model.parameters(), lr=0.001)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

#optimizer = optim.Adam(model.parameters(), lr=0.001)

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5) # Keep the same scheduler or modify as needed

best_acc = 0.0
target_acc = 99.5
for epoch in range(1, 20):
    train(epoch)
    acc = test()
    scheduler.step()
    if acc > best_acc:
        best_acc = acc
    if acc >= target_acc:
        print(f"🎯 Target accuracy {target_acc}% reached at epoch {epoch}!")
        break

print(f"✅ Finished Training. Best accuracy: {best_acc:.2f}%")

Epoch 1: 100%|██████████| 938/938 [00:23<00:00, 39.98it/s, loss=0.0499, acc=81.56%]



Test  Avg loss: 0.0831, Accuracy: 9751/10000 (97.51%)



Epoch 2: 100%|██████████| 938/938 [00:24<00:00, 38.93it/s, loss=0.0440, acc=96.81%]



Test  Avg loss: 0.0446, Accuracy: 9880/10000 (98.80%)



Epoch 3: 100%|██████████| 938/938 [00:23<00:00, 40.38it/s, loss=0.1291, acc=97.48%]



Test  Avg loss: 0.0372, Accuracy: 9895/10000 (98.95%)



Epoch 4: 100%|██████████| 938/938 [00:22<00:00, 41.48it/s, loss=0.0113, acc=97.98%]



Test  Avg loss: 0.0455, Accuracy: 9866/10000 (98.66%)



Epoch 5: 100%|██████████| 938/938 [00:23<00:00, 39.88it/s, loss=0.1698, acc=98.08%]



Test  Avg loss: 0.0314, Accuracy: 9903/10000 (99.03%)



Epoch 6: 100%|██████████| 938/938 [00:23<00:00, 40.24it/s, loss=0.0745, acc=98.51%]



Test  Avg loss: 0.0281, Accuracy: 9911/10000 (99.11%)



Epoch 7: 100%|██████████| 938/938 [00:23<00:00, 39.72it/s, loss=0.0866, acc=98.61%]



Test  Avg loss: 0.0247, Accuracy: 9930/10000 (99.30%)



Epoch 8: 100%|██████████| 938/938 [00:23<00:00, 40.00it/s, loss=0.0311, acc=98.65%]



Test  Avg loss: 0.0223, Accuracy: 9931/10000 (99.31%)



Epoch 9: 100%|██████████| 938/938 [00:23<00:00, 40.31it/s, loss=0.1150, acc=98.72%]



Test  Avg loss: 0.0252, Accuracy: 9929/10000 (99.29%)



Epoch 10: 100%|██████████| 938/938 [00:23<00:00, 40.36it/s, loss=0.0041, acc=98.78%]



Test  Avg loss: 0.0267, Accuracy: 9920/10000 (99.20%)



Epoch 11: 100%|██████████| 938/938 [00:23<00:00, 39.40it/s, loss=0.0685, acc=98.93%]



Test  Avg loss: 0.0210, Accuracy: 9941/10000 (99.41%)



Epoch 12: 100%|██████████| 938/938 [00:23<00:00, 39.96it/s, loss=0.0034, acc=98.94%]



Test  Avg loss: 0.0205, Accuracy: 9938/10000 (99.38%)



Epoch 13: 100%|██████████| 938/938 [00:24<00:00, 38.59it/s, loss=0.0046, acc=98.87%]



Test  Avg loss: 0.0186, Accuracy: 9944/10000 (99.44%)



Epoch 14: 100%|██████████| 938/938 [00:23<00:00, 39.87it/s, loss=0.0064, acc=98.96%]



Test  Avg loss: 0.0202, Accuracy: 9934/10000 (99.34%)



Epoch 15: 100%|██████████| 938/938 [00:23<00:00, 39.59it/s, loss=0.0063, acc=99.01%]



Test  Avg loss: 0.0207, Accuracy: 9934/10000 (99.34%)



Epoch 16: 100%|██████████| 938/938 [00:23<00:00, 39.60it/s, loss=0.0136, acc=99.05%]



Test  Avg loss: 0.0198, Accuracy: 9937/10000 (99.37%)



Epoch 17: 100%|██████████| 938/938 [00:23<00:00, 39.23it/s, loss=0.0094, acc=99.06%]



Test  Avg loss: 0.0192, Accuracy: 9942/10000 (99.42%)



Epoch 18: 100%|██████████| 938/938 [00:23<00:00, 39.79it/s, loss=0.0449, acc=99.06%]



Test  Avg loss: 0.0177, Accuracy: 9942/10000 (99.42%)



Epoch 19: 100%|██████████| 938/938 [00:22<00:00, 40.89it/s, loss=0.2475, acc=99.05%]



Test  Avg loss: 0.0181, Accuracy: 9945/10000 (99.45%)

✅ Finished Training. Best accuracy: 99.45%


In [None]:
# Example 1: Change optimizer and learning rate
# optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5) # Keep the same scheduler or modify as needed

In [None]:
# Example 2: Modify learning rate scheduler
optimizer = optim.Adam(model.parameters(), lr=0.001) # Keep the same optimizer or change as needed
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) # Changed step_size and gamma

In [None]:
# Example 3: Change batch size in cell DqTWLaM5GHgH
# Make sure to execute cell DqTWLaM5GHgH after changing the batch size
# batch_size = 128
batch_size = 64