# Importing the Libraries

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loading the Dataset

In [11]:
def get_train_valid_loader(data_dir,batch_size,augment,random_seed,valid_size=0.1,shuffle=True):
    normalize=transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
        
    )

    # define transforms
    valid_transform=transforms.Compose([
        transforms.Resize((227,227)),
        transforms.ToTensor(),
        normalize,
        
    ])
    if augment:
        train_transform=transforms.Compose([
            transforms.RandomCrop(32,padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:train_transform = transforms.Compose([
                transforms.Resize((227,227)),
                transforms.ToTensor(),
                normalize,
            ])

    # load the dataset
    train_dataset=datasets.CIFAR10(
        root=data_dir,train=True,
        download=True,transform=train_transform,
    )
    
    valid_dataset = datasets.CIFAR10(
            root=data_dir, train=True,
            download=True, transform=valid_transform,
        )
    num_train=len(train_dataset)
    indices=list(range(num_train))
    split=int(np.floor(valid_size*num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_idx,valid_idx=indices[split:],indices[:split]
    train_sampler=SubsetRandomSampler(train_idx)
    valid_sampler=SubsetRandomSampler(valid_idx)

    train_loader=torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,sampler=train_sampler)
    valid_loader=torch.utils.data.DataLoader(valid_dataset,batch_size=batch_size,sampler=valid_sampler)

    return(train_loader,valid_loader)



def get_test_loader(data_dir,
                        batch_size,
                        shuffle=True):
        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        )

        # define transform
        transform = transforms.Compose([
            transforms.Resize((227,227)),
            transforms.ToTensor(),
            normalize,
        ])

        dataset = datasets.CIFAR10(
            root=data_dir, train=False,
            download=True, transform=transform,
        )

        data_loader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle
        )

        return data_loader





# CIFAR10 dataset 
train_loader, valid_loader = get_train_valid_loader(data_dir = './data',batch_size = 64,augment = False,random_seed = 1)

test_loader = get_test_loader(data_dir = './data',batch_size = 64)

    
        

# AlexNet from scratch

| Block      | Layer & Params                                         | What the params do                                                                                      | Why AlexNet chose them                                                                                                                                               | Practical effect                                                             |
| ---------- | ------------------------------------------------------ | ------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
| **layer1** | `Conv2d(3, 96, kernel_size=11, stride=4, padding=0)`   | 3→96 channels; big **11×11** receptive field; **stride 4** downsampling; **no padding** shrinks the map | Early, aggressive spatial compression to make training/inference tractable on 2012 GPUs; large kernel to capture coarse edges/color blobs and global context quickly | 227→**55** (per side) after conv; strong low-level features with big context |
|            | `BatchNorm2d(96)`                                      | Normalizes each of 96 feature maps                                                                      | (Modern replacement for LRN) stabilizes/accelerates training, reduces covariate shift                                                                                | Smoother optimization; allows higher learning rates                          |
|            | `ReLU()`                                               | Half-wave nonlinearity                                                                                  | Cheap, effective nonlinearity; mitigates vanishing gradients                                                                                                         | Sparsifies activations, improves convergence                                 |
|            | `MaxPool2d(kernel_size=3, stride=2)`                   | 3×3 max + **stride 2**                                                                                  | Makes the network translation-tolerant and shrinks spatial size while keeping strong responses                                                                       | 55→**27** (per side); keeps strongest local features                         |
| **layer2** | `Conv2d(96, 256, kernel_size=5, stride=1, padding=2)`  | 96→256 channels; **5×5** receptive field; **same** spatial size due to padding=2                        | Increase channel capacity to model more complex motifs; 5×5 refines patterns discovered by large first conv                                                          | 27→**27** (same size), richer representations                                |
|            | `BatchNorm2d(256)`                                     | Normalize 256 maps                                                                                      | Stabilize deeper stack                                                                                                                                               | Better gradients                                                             |
|            | `ReLU()`                                               | Nonlinear mixing                                                                                        | Standard                                                                                                                                                             | –                                                                            |
|            | `MaxPool2d(3, 2)`                                      | Downsample by \~2                                                                                       | Control memory/compute; add invariance                                                                                                                               | 27→**13**                                                                    |
| **layer3** | `Conv2d(256, 384, kernel_size=3, stride=1, padding=1)` | 256→384; **3×3**, **same** size                                                                         | Move to smaller kernels; stacked 3×3s approximate larger receptive fields with fewer params; increases depth and capacity                                            | 13→**13**; more channels for mid/high-level parts                            |
|            | `BatchNorm2d(384)`                                     | Normalize                                                                                               | As above                                                                                                                                                             | As above                                                                     |
|            | `ReLU()`                                               | –                                                                                                       | –                                                                                                                                                                    | –                                                                            |
| **layer4** | `Conv2d(384, 384, kernel_size=3, stride=1, padding=1)` | 384→384; 3×3                                                                                            | Deepens nonlinearity at same resolution to learn complex part co-occurrences                                                                                         | 13→**13**; preserves spatial detail                                          |
|            | `BatchNorm2d(384)`                                     | Normalize                                                                                               | –                                                                                                                                                                    | –                                                                            |
|            | `ReLU()`                                               | –                                                                                                       | –                                                                                                                                                                    | –                                                                            |
| **layer5** | `Conv2d(384, 256, kernel_size=3, stride=1, padding=1)` | 384→256; 3×3                                                                                            | Funnel to fewer channels before final pooling/FC; reduces parameters downstream                                                                                      | 13→**13**                                                                    |
|            | `BatchNorm2d(256)`                                     | Normalize                                                                                               | –                                                                                                                                                                    | –                                                                            |
|            | `ReLU()`                                               | –                                                                                                       | –                                                                                                                                                                    | –                                                                            |
|            | `MaxPool2d(3, 2)`                                      | Final spatial downsample                                                                                | Prepares compact feature grid for FC                                                                                                                                 | 13→**6**                                                                     |


In [8]:
class AlexNet(nn.Module):
    def __init__(self,num_classes=10):
        super(AlexNet,self).__init__()
        self.layer1=nn.Sequential(
                nn.Conv2d(3,96,kernel_size=11,stride=4,padding=0),
                nn.BatchNorm2d(96),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=3,stride=2)
            )
        self.layer2 = nn.Sequential(
                nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer3 = nn.Sequential(
                nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(384),
                nn.ReLU())
        self.layer4 = nn.Sequential(
                nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(384),
                nn.ReLU())
        self.layer5 = nn.Sequential(
                nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(9216, 4096),
                nn.ReLU())
        self.fc1 = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(4096, 4096),
                nn.ReLU())
        self.fc2= nn.Sequential(
                nn.Linear(4096, num_classes))
    def forward(self, x):
            out = self.layer1(x)
            out = self.layer2(out)
            out = self.layer3(out)
            out = self.layer4(out)
            out = self.layer5(out)
            out = out.reshape(out.size(0), -1)
            out = self.fc(out)
            out = self.fc1(out)
            out = self.fc2(out)
            return out                          
        

# Setting Hyperparameters

In [12]:
num_classes = 10
num_epochs = 20
batch_size = 64
learning_rate = 0.005

model = AlexNet(num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)  
    
# Train the model
total_step = len(train_loader)

# Training

In [18]:
for epoch in range(num_epochs):
    for i,(images,labels) in enumerate(train_loader):
        # Move tensors to the configured device
        images=images.to(device)
        labels=labels.to(device)
        
        # Forward pass
        outputs=model(images)
        loss=criterion(outputs,labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}'.format(epoch+1,num_epochs,i+1,total_step,loss.item()))
    


    # Validation
    with torch.no_grad():
                correct = 0
                total = 0
                for images, labels in valid_loader:
                    images = images.to(device)
                    labels = labels.to(device)
                    outputs = model(images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
                    del images, labels, outputs
    
                print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))                      
    
    




The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Epoch [1/20], Step[704/704], Loss: 1.0224
Accuracy of the network on the 5000 validation images: 76.08 %
Epoch [2/20], Step[704/704], Loss: 1.3559
Accuracy of the network on the 5000 validation images: 77.92 %
Epoch [3/20], Step[704/704], Loss: 0.9896
Accuracy of the network on the 5000 validation images: 76.58 %
Epoch [4/20], Step[704/704], Loss: 2.1815
Accuracy of the network on the 5000 validation images: 76.26 %
Epoch [5/20], Step[704/704], Loss: 0.4569
Accuracy of the network on the 5000 validation images: 79.76 %
Epoch [6/20], Step[704/704], Loss: 1.0018
Accuracy of the network on the 5000 validation images: 80.64 %
Epoch [7/20], Step[704/704], Loss: 0.5591
Accuracy of the network on the 5000 validation images: 80.02 %
Epoch [8/20], Step[704/704], Loss: 0.9269
Accuracy of the network on the 5000 validation images: 80.98 %
Epoch 

# Testing

In [19]:
with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))

Accuracy of the network on the 10000 test images: 83.07 %
