In [8]:
import torch #we use the pytorch lib which is similar to numpy but we can run it on GPU for faster running and we can also use autograd to find the gradient.
import torch.nn as nn #this is the neural network module
'''
    The nn module is not strictly required: a model can be built without it, but then we would have to
    implement everything manually (our own forward pass, loss function, etc.).
'''
import torch.optim as optim #to update weights during training

#to load given dataset
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

# Report the installed PyTorch version alongside the results.
print("PyTorch version:", torch.__version__)

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


PyTorch version: 2.5.1
Using device: cpu


In [9]:
# CIFAR-10 images are 32x32 RGB (3 channels).
# ToTensor converts each image to a float tensor with values in [0, 1]
# (e.g. pixel values [255, 128, 0] -> [1.0, 0.5, 0.0]).
# Normalize then computes (x - mean) / std per channel. With the constants below
# the result is centred near 0 and spans roughly [-2.4, 2.8] (not [-1, 1]);
# inputs on a common, centred scale are easier to train on.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),   # mean per channel
                         (0.2023, 0.1994, 0.2010))   # std per channel
])

# Download/load CIFAR-10.
# The dataset consists of 60,000 images in 10 classes (50,000 train + 10,000 test),
# so this is a supervised multi-class classification problem: the model predicts
# a class/category rather than a number. (A classification problem with only two
# possible outputs is called binary classification and is classically solved
# with logistic regression.)
root_dir = "./data"  # change if you want
full_train_dataset = datasets.CIFAR10(root=root_dir, train=True,
                                      transform=transform, download=True)

test_dataset = datasets.CIFAR10(root=root_dir, train=False,
                                transform=transform, download=True)

# Split the training set into train + validation (90% / 10% -> 45k train, 5k val).
# The 45k images are used to fit the model; the remaining 5k are used to check
# whether the model is actually learning or just overfitting.
# In case the model is overfitting, we will have to add regularization (task 2).
train_size = int(0.9 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size

# Use a dedicated, seeded generator so the split is identical every time this
# cell runs. Without it, re-running the cell draws a different split, and images
# that were previously used for training can end up in the validation set.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(
    full_train_dataset, [train_size, val_size], generator=split_generator
)

# Segregate all the images into batches of size 64.
batch_size = 64

# Shuffle the training images each epoch because images of the same class may be
# stored next to each other in the dataset. Validation/test order does not
# affect the metrics, so those loaders are not shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, num_workers=2)

len(train_dataset), len(val_dataset), len(test_dataset)


Files already downloaded and verified
Files already downloaded and verified


(45000, 5000, 10000)

In [10]:
class SimpleCIFAR10CNN(nn.Module):
    """Small baseline CNN mapping 3 x 32 x 32 images to 10 class logits.

    Layers are created in the constructor and applied, in order, in forward():

        conv_block1: Conv(3 -> 32, 3x3, pad 1) -> ReLU -> MaxPool(2)    => 32 x 16 x 16
        conv_block2: Conv(32 -> 64, 3x3, pad 1) -> ReLU -> MaxPool(2)   => 64 x 8 x 8
        flatten                                                         => 4096
        fc1: Linear(4096 -> 256) -> ReLU
        fc2: Linear(256 -> 10)                                          => raw logits

    Each convolutional block consists of:
        1. Convolutional layer: extracts local features such as edges, colours, textures.
        2. ReLU activation: introduces non-linearity so the network can learn complex
           patterns (other activations such as Softplus could also be used).
        3. MaxPool: halves the spatial resolution (e.g. 32x32 -> 16x16).
    """

    def __init__(self):
        super().__init__()

        # Input: 3 x 32 x 32
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)   # -> 32 x 16 x 16
        )

        self.conv_block2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)   # -> 64 x 8 x 8
        )

        # Flatten from 64 x 8 x 8 = 4096 features
        self.fc1 = nn.Linear(64 * 8 * 8, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 10)    # 10 classes in CIFAR-10

    def forward(self, x):
        """Run the data through the model and return raw class scores (logits).

        No activation is applied after the last layer: the model is meant to be
        trained with nn.CrossEntropyLoss, which expects raw logits and applies
        (log-)softmax internally. Adding a softmax here would apply it twice and
        hurt training.

        Args:
            x: batch of images shaped (batch, 3, 32, 32).

        Returns:
            Tensor of shape (batch, 10) with one logit per class.
        """
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        # flatten() keeps the batch dimension and, unlike view(), also works
        # when the activations are not contiguous (e.g. channels_last inputs).
        x = x.flatten(start_dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)            # raw scores (logits)
        return x

# Build the network, then move its parameters to the selected device (CPU/GPU).
model = SimpleCIFAR10CNN()
model = model.to(device)
print(model)


SimpleCIFAR10CNN(
  (conv_block1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Linear(in_features=4096, out_features=256, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=256, out_features=10, bias=True)
)


In [11]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()                # good for multi-class classification
optimizer = optim.SGD(model.parameters(),
                      lr=0.01,
                      momentum=0.9)             # simple baseline optimizer

# The optimizer updates the weights during training. SGD stands for Stochastic
# Gradient Descent. Its arguments here are:
#   1. model.parameters() --> the learnable weights 'w' (and biases) of the model.
#   2. learning rate (lr) --> the step size used when updating the weights with
#                             gradient descent. Choosing it well is essential: a
#                             learning rate that is too high makes the updates
#                             overshoot the minimum of the cost function (training
#                             can diverge), while one that is too low makes
#                             training very slow.
#   3. momentum           --> accumulates past gradients to make the updates
#                             faster and smoother.
# (Written as comments rather than a bare string literal: a string that is the
# last expression of a cell is echoed by Jupyter as the cell's output.)


In [12]:
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Train `model` for one full pass over `loader` and return epoch metrics.

    Args:
        model: network to train; switched to training mode.
        loader: iterable yielding (images, labels) batches.
        criterion: loss function called as criterion(outputs, labels).
        optimizer: optimizer that updates the parameters of `model`.
        device: device the batches are moved to before the forward pass.

    Returns:
        (epoch_loss, epoch_acc): the per-batch loss averaged over all samples
        (each batch weighted by its size) and the fraction of correctly
        classified samples, both measured on the outputs computed before each
        weight update.

    Raises:
        ValueError: if `loader` yields no samples, since the averages would be
            undefined.
    """
    model.train()  # training mode (enables dropout/batchnorm if used)
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in loader:
        # The model lives on `device`, so the batch tensors must be moved there too.
        images = images.to(device)
        labels = labels.to(device)

        # 1. forward
        outputs = model(images)
        loss = criterion(outputs, labels)

        # 2. backward
        # .backward() accumulates into the existing gradients, so they must be
        # cleared on every iteration before the new gradients are computed.
        optimizer.zero_grad()   # clear old gradients
        loss.backward()         # compute new gradients
        optimizer.step()        # update weights

        # accumulate stats (loss is weighted by the batch size so that a smaller
        # last batch does not skew the epoch average)
        running_loss += loss.item() * images.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("train_one_epoch: loader yielded no samples")

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc


def evaluate(model, loader, criterion, device):
    """Compute the average loss and accuracy of `model` over `loader`.

    No weights are updated: the model is switched to evaluation mode and the
    forward passes run without gradient tracking.

    Args:
        model: network to evaluate; switched to evaluation mode.
        loader: iterable yielding (images, labels) batches.
        criterion: loss function called as criterion(outputs, labels).
        device: device the batches are moved to before the forward pass.

    Returns:
        (epoch_loss, epoch_acc): the per-batch loss averaged over all samples
        (each batch weighted by its size) and the fraction of correctly
        classified samples.

    Raises:
        ValueError: if `loader` yields no samples, since the averages would be
            undefined.
    """
    model.eval()   # evaluation mode
    running_loss = 0.0
    correct = 0
    total = 0

    # torch.no_grad() stops autograd from tracking operations; gradients are not
    # needed for validation/testing.
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # weight the batch loss by the batch size so that a smaller last
            # batch does not skew the average
            running_loss += loss.item() * images.size(0)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("evaluate: loader yielded no samples")

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc


In [6]:
# One epoch = one full pass of the model over the entire training set.
num_epochs = 10   # baseline setting; can be increased later

for epoch in range(num_epochs):
    # Fit on the training split, then score the held-out validation split.
    train_loss, train_acc = train_one_epoch(
        model=model,
        loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    val_loss, val_acc = evaluate(
        model=model,
        loader=val_loader,
        criterion=criterion,
        device=device,
    )

    print(
        f"Epoch [{epoch+1}/{num_epochs}] "
        f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% "
        f"|| Val Loss: {val_loss:.4f} | Val Acc: {val_acc*100:.2f}%"
    )


Epoch [1/10] Train Loss: 1.4093 | Train Acc: 49.22% || Val Loss: 1.1474 | Val Acc: 59.66%
Epoch [2/10] Train Loss: 0.9726 | Train Acc: 65.56% || Val Loss: 0.9744 | Val Acc: 65.14%
Epoch [3/10] Train Loss: 0.7701 | Train Acc: 72.64% || Val Loss: 0.8667 | Val Acc: 69.70%
Epoch [4/10] Train Loss: 0.6054 | Train Acc: 78.88% || Val Loss: 0.9015 | Val Acc: 69.70%
Epoch [5/10] Train Loss: 0.4623 | Train Acc: 83.64% || Val Loss: 0.9082 | Val Acc: 71.16%
Epoch [6/10] Train Loss: 0.3245 | Train Acc: 88.51% || Val Loss: 0.9806 | Val Acc: 72.70%
Epoch [7/10] Train Loss: 0.2302 | Train Acc: 91.86% || Val Loss: 1.1438 | Val Acc: 71.94%
Epoch [8/10] Train Loss: 0.1502 | Train Acc: 94.86% || Val Loss: 1.2857 | Val Acc: 71.62%
Epoch [9/10] Train Loss: 0.1110 | Train Acc: 96.23% || Val Loss: 1.3409 | Val Acc: 71.20%
Epoch [10/10] Train Loss: 0.0950 | Train Acc: 96.87% || Val Loss: 1.5409 | Val Acc: 71.22%


In [7]:
# Final check on the held-out test set (not used for training or validation).
test_loss, test_acc = evaluate(
    model=model,
    loader=test_loader,
    criterion=criterion,
    device=device,
)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc*100:.2f}%")


Test Loss: 1.5079 | Test Acc: 72.41%
