**Model 2:**

We switched to the AdamW optimizer with a learning rate of 0.001 (1e-3) and a weight decay of 0.0001 (1e-4). No separate momentum value is specified because AdamW has no SGD-style `momentum` argument; its momentum-like behavior is controlled by the `betas` coefficients, which we left at the PyTorch defaults (0.9, 0.999). AdamW is a variant of the Adam optimizer that applies weight decay directly to the weights (decoupled weight decay) instead of adding an L2 penalty to the gradients.

To summarize, our optimizer was AdamW with the following parameters:

Learning rate (lr): 0.001 & Weight decay: 0.0001



In [None]:
pip install torch torchvision


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


In [None]:
class BasicBlock(nn.Module):
    """Two-convolution residual block with an optional projection shortcut.

    The main path is conv3x3 -> BN -> ReLU -> conv3x3 -> BN. Its output is
    added to the shortcut and passed through a final ReLU. When the block
    changes the stride or the channel count, the shortcut is a 1x1
    convolution followed by batch norm so both tensors have matching shapes;
    otherwise the unmodified input is used.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the block.
        stride: Stride of the first 3x3 convolution (and of the projection
            shortcut, when one is created).
    """

    def __init__(self, in_channels, out_channels, stride):
        super().__init__()
        shape_preserved = stride == 1 and in_channels == out_channels

        # Main path. Only the first convolution may change resolution/width.
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut path: identity when shapes already agree, otherwise a
        # strided 1x1 projection.
        if shape_preserved:
            self.downsample = None
        else:
            self.downsample = nn.Sequential(
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)

class ModifiedResNet(nn.Module):
    """ResNet-style classifier assembled from four stages of ``BasicBlock``.

    Layout: a 3x3 stem convolution (3 -> 32 channels) with batch norm and
    ReLU, followed by four residual stages of 3, 4, 4 and 3 blocks producing
    32, 64, 128 and 256 channels. Stages 2-4 open with a stride-2 block.
    Adaptive average pooling to 1x1 and one linear layer produce the scores.

    Args:
        num_classes: Width of the final linear layer (defaults to 10).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Stem.
        self.conv1 = nn.Conv2d(
            3, 32, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(inplace=True)

        # Residual stages: (in_channels, out_channels, num_blocks, stride).
        self.layer1 = self._make_layer(32, 32, 3, stride=1)
        self.layer2 = self._make_layer(32, 64, 4, stride=2)
        self.layer3 = self._make_layer(64, 128, 4, stride=2)
        self.layer4 = self._make_layer(128, 256, 3, stride=2)

        # Classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

    def _make_layer(self, in_channels, out_channels, num_blocks, stride):
        """Build one stage as an ``nn.Sequential`` of ``BasicBlock``s.

        Only the first block receives ``stride`` and the channel change; the
        remaining ``num_blocks - 1`` blocks keep ``out_channels`` and use
        stride 1.
        """
        blocks = [BasicBlock(in_channels, out_channels, stride)]
        blocks.extend(
            BasicBlock(out_channels, out_channels, stride=1)
            for _ in range(num_blocks - 1)
        )
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Map an image batch to a ``(batch, num_classes)`` score tensor."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        pooled = self.avgpool(x)
        features = torch.flatten(pooled, 1)
        return self.fc(features)

# Instantiate the network with its default 10-class output layer.
# NOTE: `model` is rebound to a new instance that is moved to `device` in the
# optimizer-setup cell further down, so this object is not the one trained.
model = ModifiedResNet()


In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
# Per-channel mean / std handed to Normalize in both pipelines.
_NORM_MEAN = (0.4914, 0.4822, 0.4465)
_NORM_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random augmentations first, then tensor conversion and
# normalisation.
_train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(
        brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1
    ),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
transform_train = transforms.Compose(_train_steps)

# Test pipeline: no augmentation, only tensor conversion and normalisation.
_test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
transform_test = transforms.Compose(_test_steps)


In [None]:
from cutout import Cutout

# Add Cutout (one 16-pixel hole) as the last step of the training pipeline.
# Notebook cells are often re-executed, and an unguarded append would stack
# one additional Cutout per run, so only append when none is present yet.
if not any(isinstance(step, Cutout) for step in transform_train.transforms):
    transform_train.transforms.append(Cutout(n_holes=1, length=16))


In [None]:
# Both splits share the same root directory and are downloaded on demand.
# The training split uses the augmenting pipeline, the test split the plain one.
_cifar_common = {"root": './data', "download": True}
trainset = torchvision.datasets.CIFAR10(
    train=True, transform=transform_train, **_cifar_common
)
testset = torchvision.datasets.CIFAR10(
    train=False, transform=transform_test, **_cifar_common
)

# Training batches are reshuffled every epoch; test batches keep dataset order.
trainloader = DataLoader(
    dataset=trainset, batch_size=128, shuffle=True, num_workers=2
)
testloader = DataLoader(
    dataset=testset, batch_size=100, shuffle=False, num_workers=2
)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:01<00:00, 106026419.87it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


As mentioned above in the model description:

Learning rate (lr): 0.001 & Weight decay: 0.0001

In [None]:
# Fresh network, moved to the selected device before the optimizer captures
# its parameters.
model = ModifiedResNet()
model = model.to(device)

# Cross-entropy loss on the raw class scores.
criterion = nn.CrossEntropyLoss()

# AdamW with lr = 1e-3 and weight decay = 1e-4; all other settings are left
# at their defaults.
optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)


In [None]:
# Cosine-anneal the learning rate from its initial value down to eta_min.
# T_max must span the whole run: training lasts 150 epochs with one
# scheduler step per epoch, and with a shorter T_max the cosine curve turns
# around and raises the learning rate again after T_max steps.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=150, eta_min=1e-5)


In [None]:
num_epochs = 150

for epoch in range(num_epochs):
    # Training mode for mode-dependent layers such as BatchNorm.
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for batch_idx, (inputs, labels) in enumerate(trainloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate statistics for the per-epoch report.
        running_loss += loss.item()
        predicted = outputs.max(dim=1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Mean of the per-batch losses and accuracy (in percent) over the epoch.
    mean_loss = running_loss / (batch_idx + 1)
    train_accuracy = 100 * correct / total
    print(f"Epoch: {epoch+1}, Loss: {mean_loss}, Train accuracy: {train_accuracy}")

    # The learning-rate schedule advances once per epoch.
    scheduler.step()


Epoch: 1, Loss: 1.696396364885218, Train accuracy: 37.144
Epoch: 2, Loss: 1.3318028007931721, Train accuracy: 51.562
Epoch: 3, Loss: 1.158014152360999, Train accuracy: 58.414
Epoch: 4, Loss: 1.0519595659907213, Train accuracy: 62.406
Epoch: 5, Loss: 0.9651200972554629, Train accuracy: 65.498
Epoch: 6, Loss: 0.8982390179048718, Train accuracy: 68.108
Epoch: 7, Loss: 0.8393398878519492, Train accuracy: 70.44
Epoch: 8, Loss: 0.7930788335287967, Train accuracy: 72.034
Epoch: 9, Loss: 0.7572673431137944, Train accuracy: 73.454
Epoch: 10, Loss: 0.7176882324316313, Train accuracy: 74.694
Epoch: 11, Loss: 0.686328514000339, Train accuracy: 75.788
Epoch: 12, Loss: 0.6657198053949019, Train accuracy: 76.646
Epoch: 13, Loss: 0.6383969003282239, Train accuracy: 77.64
Epoch: 14, Loss: 0.6204934155239779, Train accuracy: 78.11
Epoch: 15, Loss: 0.5999529958533509, Train accuracy: 78.944
Epoch: 16, Loss: 0.583392344624795, Train accuracy: 79.486
Epoch: 17, Loss: 0.5656560574803511, Train accuracy: 80.

In [None]:
# Measure accuracy on the test loader: evaluation mode for mode-dependent
# layers, and no gradient tracking since nothing is being optimised here.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for inputs, labels in testloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        logits = model(inputs)
        predicted = logits.max(dim=1).indices  # index of the highest score

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = 100 * correct / total
print(f"Test accuracy: {test_accuracy}")


Test accuracy: 92.78


**Result:**

The final model architecture achieved a test accuracy of 92.78%, demonstrating the effectiveness of our design choices and optimization techniques for improving the performance on the CIFAR-10 dataset.
