## 1. Importing the Dependencies

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data.sampler import SubsetRandomSampler

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

#### Note: 
1. When `transforms.Normalize` is applied to an image tensor, it first subtracts the mean value of each color channel from the corresponding pixel values, and then divides the result by the standard deviation of that color channel.

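2. In `get_train_valid_loader`, the `shuffle` parameter (True by default) shuffles the dataset indices, seeded by `random_seed`, before they are split into training and validation subsets, so the validation set is a random sample of the data rather than its first block. The images are returned in a random order for each epoch of training because the loaders draw from `SubsetRandomSampler`; this is useful for reducing overfitting and improving the generalization of the model. In `get_test_loader`, `shuffle` is passed directly to the `DataLoader`.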

3. `train_idx` contains the indices of the samples in the training subset, and `valid_idx` contains the indices of the samples in the validation subset. `SubsetRandomSampler` is a PyTorch sampler that draws elements in random order, without replacement, from a given list of indices. `train_sampler` and `valid_sampler` therefore yield the samples of the training and validation subsets, respectively, in a new random order on every pass.

## 2. Getting the Data

In [None]:
def get_train_valid_loader(data_dir,
                           batch_size,
                           augment,
                           random_seed,
                           valid_size=0.1,
                           shuffle=True):
    """Build the CIFAR-10 training and validation data loaders.

    Both loaders read the CIFAR-10 *training* split; the split is divided
    into two disjoint index sets so that validation images are never used
    for training. Every image is resized to 227x227 and normalised with
    the per-channel mean/std defined below.

    Args:
        data_dir: Directory where CIFAR-10 is stored (downloaded if missing).
        batch_size: Number of samples per batch for both loaders.
        augment: If True, apply a random padded crop and a random
            horizontal flip to the training images before resizing.
        random_seed: Seed for the NumPy RNG used to shuffle the indices.
        valid_size: Fraction of the training split held out for
            validation; must lie in [0, 1].
        shuffle: If True, shuffle the indices before splitting them.

    Returns:
        A ``(train_loader, valid_loader)`` tuple.

    Raises:
        ValueError: If ``valid_size`` is outside [0, 1].
    """
    if not 0 <= valid_size <= 1:
        raise ValueError(
            'valid_size should be in the range [0, 1], got {}'.format(valid_size))

    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # Validation images are never augmented: only resized and normalised.
    valid_transform = transforms.Compose([
        transforms.Resize((227, 227)),
        transforms.ToTensor(),
        normalize,
    ])

    if augment:
        train_transform = transforms.Compose([
            # Augment at the native 32x32 resolution to add variability ...
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            # ... then resize so training images have the same 227x227 shape
            # as the validation and non-augmented images.
            transforms.Resize((227, 227)),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
            transforms.Resize((227, 227)),
            transforms.ToTensor(),
            normalize,
        ])

    # Two views of the same training split, differing only in the transform.
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=train_transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=valid_transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    # Number of samples held out for validation.
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    # The first `split` indices form the validation set, the rest the training set.
    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler)

    return (train_loader, valid_loader)


def get_test_loader(data_dir,
                   batch_size,
                   shuffle=True):
    """Build the data loader for the CIFAR-10 test split.

    Images are resized to 227x227 and normalised with the same CIFAR-10
    per-channel statistics that the training/validation pipeline uses, so
    the model sees identically preprocessed inputs at test time.

    Args:
        data_dir: Directory where CIFAR-10 is stored (downloaded if missing).
        batch_size: Number of samples per batch.
        shuffle: Passed to the ``DataLoader``; whether to reshuffle the
            test set on every pass.

    Returns:
        A ``DataLoader`` over the CIFAR-10 test split.
    """
    # Must match the statistics used for the training data; normalising the
    # test set with different values would shift its input distribution.
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transform
    transform = transforms.Compose([
        transforms.Resize((227, 227)),
        transforms.ToTensor(),
        normalize,
    ])

    # loading the test data
    dataset = datasets.CIFAR10(
        root=data_dir, train=False,
        download=True, transform=transform,
    )

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle
    )

    return data_loader

# CIFAR10 dataset: training/validation loaders (no augmentation, fixed seed)
train_loader, valid_loader = get_train_valid_loader(
    data_dir='./data',
    batch_size=64,
    augment=False,
    random_seed=1,
)

# Loader for the held-out test split
test_loader = get_test_loader(data_dir='./data', batch_size=64)

## 3. Creating the AlexNet Class

In [None]:
class AlexNet(nn.Module):
    """AlexNet-style convolutional classifier with batch normalisation.

    Five convolutional stages (``layer1`` .. ``layer5``) are followed by
    three fully connected stages (``fc``, ``fc1``, ``fc2``). For a
    3x227x227 input the feature map after ``layer5`` is 256x6x6, i.e. the
    9216 features the first fully connected layer expects. An adaptive
    average pool pins the feature map to 6x6 so other input resolutions
    are accepted as well, provided they are large enough to pass through
    the strided convolution and pooling stages.

    Args:
        total_classes: Number of output classes (size of the final layer).
    """

    def __init__(self, total_classes=10):
        super().__init__()
        # 227 -> 55 (11x11 conv, stride 4) -> 27 (3x3 max-pool, stride 2)
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0),
            nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2))

        # 27 -> 27 (5x5 conv, padding 2) -> 13 (max-pool)
        self.layer2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2))

        # layer3 / layer4: 3x3 convs with padding 1 keep the spatial size
        self.layer3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU())

        self.layer4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU())

        # 13 -> 13 (3x3 conv, padding 1) -> 6 (max-pool)
        self.layer5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2))

        # Parameter-free; an identity when the feature map is already 6x6
        # (227x227 inputs), and otherwise resamples it to 6x6 so the
        # classifier below always receives 256 * 6 * 6 = 9216 features.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(9216, 4096),
            nn.ReLU())

        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU())

        # Final layer emits raw class scores (no softmax applied here).
        self.fc2 = nn.Sequential(
            nn.Linear(4096, total_classes))

    def forward(self, x):
        """Return class scores of shape ``(batch, total_classes)`` for ``x``.

        ``x`` is a batch of 3-channel images, ``(batch, 3, H, W)``.
        """
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.avgpool(out)
        # Flatten everything except the batch dimension.
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out
        

### Notes:
1. The `lr` parameter specifies the learning rate, which determines the step size of the optimizer in updating the parameters. The `weight_decay` parameter is a form of L2 regularization, which penalizes large weights to prevent overfitting. The `momentum` parameter (available on optimizers such as `torch.optim.SGD`) adds a fraction of the previous update to the current update, which helps to stabilize the optimization process and accelerate convergence. The `Adam` optimizer used below takes no `momentum` argument; it keeps its own running averages of the gradients instead.

2. An epoch is a complete iteration through the entire training set.

3. The `predicted == labels` comparison returns a boolean tensor with the same shape as the labels tensor, where `True` indicates a correct prediction and `False` indicates an incorrect prediction. The `sum()` method sums up the number of True values, and the `item()` method converts the resulting PyTorch scalar tensor to a Python integer.

## 4. Setting the Hyperparameters

In [None]:
# Training configuration
total_classes = 10
total_epochs = 20
batch_size = 64  # NOTE(review): defined here but not used elsewhere in this cell
learning_rate = 0.005

# Build the network, then move its parameters to the selected device
model = AlexNet(total_classes)
model = model.to(device)

# loss and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.005,
)


## 5. Training and Validation

In [None]:
# Train the model, validating after every epoch
total_step = len(train_loader)  # number of batches per epoch

for epoch in range(total_epochs):
    # Training mode: Dropout is active and BatchNorm uses batch statistics.
    # Must be restored every epoch because validation switches to eval mode.
    model.train()
    for i, (images, labels) in enumerate(train_loader):  # looping over the batches
        images = images.to(device)  # moving both the images and labels to the device
        labels = labels.to(device)

        # forward pass
        outputs = model(images)  # predicted class scores
        loss = loss_fn(outputs, labels)

        # backward and optimize
        optimizer.zero_grad()  # clear gradients left over from the previous batch
        loss.backward()
        optimizer.step()  # updating the model's parameters using the computed gradients

    # Reports the loss of the last batch of the epoch.
    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
          .format(epoch+1, total_epochs, i+1, total_step, loss.item()))

    # Validation. Evaluation mode disables Dropout and makes BatchNorm use its
    # running statistics, so the accuracy is deterministic and the validation
    # data does not alter the BatchNorm buffers.
    model.eval()
    with torch.no_grad():  # gradients are not needed, which saves memory and time
        correct = 0  # correctly classified images
        total = 0  # total number of images seen during validation
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # torch.max returns (values, indices); the index of the largest
            # score is the predicted class, the value itself is discarded.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)  # updating the total number of images seen

            # the number of correctly classified images
            correct += (predicted == labels).sum().item()

            # drop references to the batch tensors so their memory can be reused
            del images, labels, outputs

        # Report the actual number of validation images rather than a fixed count.
        print('Accuracy of the network on the {} validation images: {} %'.format(total, 100 * correct / total))
            

## 6. Testing

In [None]:
# Evaluation mode: disables Dropout and makes BatchNorm use its running
# statistics, so the reported test accuracy is deterministic.
model.eval()
with torch.no_grad():  # gradients are not needed for evaluation
    correct = 0  # correctly classified test images
    total = 0  # test images seen so far
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # index of the largest score along the class dimension = predicted class
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    # Report the actual number of test images rather than a fixed count.
    print('Accuracy of the network on the {} test images: {} %'.format(total, 100 * correct / total))