In [56]:
# Used to load class-to-label mappings from a JSON file.
import json
import os
from typing import Callable, Optional

import torch
import torch.nn as nn
import torch.optim as optim
# Used to handle image loading.
from PIL import Image
# Base class for custom PyTorch datasets, allowing integration with DataLoader.
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

In [61]:
# ---- Training configuration (everything a reader might tune) ----
epochs = 100               # number of passes the training loop makes over its loader
batch_size = 64            # images per batch
learning_rate = 0.001      # step size handed to the optimizer
imagenet_data_dir_train = "./ImageNet-Mini/train"
imagenet_data_dir_test = "./ImageNet-Mini/test"
validation_split = 0.2     # validation fraction
shuffle = True             # DataLoader.shuffle
num_workers = 4            # DataLoader.num_workers
n_classes = 1000           # number of target classes

# Accelerator preference: Apple-Silicon MPS first, then NVIDIA CUDA, otherwise CPU.
# The conditional expression short-circuits, so CUDA is only probed when MPS is absent.
device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

print(f"Using device: {device}")

Using device: mps


In [40]:
# Define the Dataset class; it is instantiated with our data further below.
class MiniImageNetDataset(Dataset):
    """Map-style dataset over an ImageNet-style directory tree.

    Expected layout under ``data_path``::

        imagenet_class_index.json      # {"<label>": ["<class dir>", "<name>"], ...}
        images/<class dir>/<image>     # one sub-directory per class

    Args:
        data_path: Root directory holding the JSON index and ``images/``.
        transform: Optional callable applied to each RGB PIL image in
            ``__getitem__``; when ``None`` the PIL image is returned as-is.

    Raises:
        KeyError: If a class directory containing images is missing from the
            JSON index.
    """

    def __init__(self, data_path: str, transform: Optional[Callable] = None):
        self.data_path = data_path
        # Parallel lists: image_paths[i] is the file whose integer label is labels[i].
        self.image_paths = []
        self.labels = []
        # Integer label -> readable class name, for classes that own at least one image.
        self.class_mapping = {}
        self.transform = transform

        # The JSON maps "<label>" -> [<class dir>, <readable name>]; invert it so
        # a directory name can be looked up directly while walking the tree.
        class_index_path = os.path.join(data_path, 'imagenet_class_index.json')
        with open(class_index_path, 'r') as f:
            raw_index = json.load(f)
        dir_to_class = {v[0]: [k, v[1]] for k, v in raw_index.items()}

        # Build the dataset in sorted order so indices are stable across runs.
        image_dir = os.path.join(data_path, 'images')
        for class_name in sorted(os.listdir(image_dir)):
            class_path = os.path.join(image_dir, class_name)
            # Stray entries (e.g. macOS ``.DS_Store``, notebook checkpoint
            # folders) are not classes; listing/looking them up would crash.
            if class_name.startswith('.') or not os.path.isdir(class_path):
                continue
            for image_name in sorted(os.listdir(class_path)):
                image_path = os.path.join(class_path, image_name)
                # Hidden files and sub-directories are not images.
                if image_name.startswith('.') or not os.path.isfile(image_path):
                    continue

                label_key, readable_name = dir_to_class[class_name]
                label = int(label_key)
                self.image_paths.append(image_path)
                self.labels.append(label)
                self.class_mapping[label] = readable_name

    def __len__(self) -> int:
        """Return the number of indexed images."""
        return len(self.image_paths)

    def __getitem__(self, idx: int):
        """Return ``(image, label)`` for sample ``idx``.

        ``image`` is the RGB PIL image, or whatever ``transform`` returns for
        it; ``label`` is a scalar integer tensor.
        """
        image_path = self.image_paths[idx]
        label = torch.tensor(self.labels[idx])

        # ``convert`` returns an independent, fully loaded copy, so the file
        # handle can be released as soon as the ``with`` block exits.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label

In [41]:
# Data preprocessing: the steps below run in sequence on every image.
# Per-channel mean / standard deviation used to normalise for ImageNet models.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

transform = transforms.Compose([
    # The network's first convolution is designed for a 224 x 224 x 3 input,
    # so every image is resized to 224 x 224.
    transforms.Resize(size=(224, 224)),
    # PIL image with integer values in [0, 255] -> float tensor in [0, 1].
    transforms.ToTensor(),
    # Normalise each colour channel independently.
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

In [42]:
# Index every image under ./ImageNet-Mini and attach the preprocessing pipeline.
dataset = MiniImageNetDataset('./ImageNet-Mini', transform=transform)

In [43]:
# Hold out `validation_split` of the samples for validation; the rest is used
# for training. The fraction comes from the configuration cell instead of a
# second hard-coded 0.2.
val_split = int(len(dataset) * validation_split)
train_split = len(dataset) - val_split

# A dedicated, seeded generator makes the partition identical on every run, so
# re-executing the notebook never moves samples between training and validation.
split_seed = 42
train_set, val_set = random_split(
    dataset,
    [train_split, val_split],
    generator=torch.Generator().manual_seed(split_seed),
)

In [44]:
# Training batches follow the `shuffle` setting from the configuration cell.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=shuffle)
# Sample order does not change the accuracy computed over the validation set,
# so keep the validation loader deterministic.
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)
# NOTE(review): `num_workers` is intentionally not wired in here — worker
# processes may be unable to load a dataset class defined inside a notebook on
# spawn-based platforms; confirm before enabling.

In [48]:
# Peek at the preprocessed image tensor of the first training sample.
sample_image, sample_label = train_set[0]
sample_image

tensor([[[ 2.2489,  2.2489,  2.2489,  ..., -1.5014, -1.5185, -1.6042],
         [ 2.2489,  2.2489,  2.2489,  ..., -1.5185, -1.6042, -1.6213],
         [ 2.2489,  2.2489,  2.2489,  ..., -1.6384, -1.6213, -1.5699],
         ...,
         [ 2.1975,  2.2147,  2.2147,  ..., -0.0801, -0.1486, -0.3369],
         [ 2.2318,  2.2318,  2.2318,  ..., -0.1143, -0.1486, -0.3712],
         [ 2.2147,  2.2318,  2.2318,  ..., -0.1657, -0.2856, -0.5767]],

        [[ 2.4286,  2.4286,  2.4286,  ..., -1.4580, -1.4580, -1.5630],
         [ 2.4286,  2.4286,  2.4286,  ..., -1.4230, -1.4580, -1.5455],
         [ 2.4286,  2.4286,  2.4286,  ..., -1.5105, -1.5105, -1.5105],
         ...,
         [ 2.4111,  2.4111,  2.4111,  ..., -0.8803, -0.9153, -1.0728],
         [ 2.4111,  2.4111,  2.4111,  ..., -0.9153, -0.9853, -1.1253],
         [ 2.4286,  2.4286,  2.4111,  ..., -0.8803, -1.0028, -1.3354]],

        [[ 2.6400,  2.6400,  2.6400,  ..., -1.3164, -1.2990, -1.3513],
         [ 2.6400,  2.6400,  2.6400,  ..., -1

In [49]:
# Shape of the first training image after preprocessing.
sample_image, sample_label = train_set[0]
sample_image.shape

torch.Size([3, 224, 224])

In [50]:
# Label tensor of the first training sample.
sample_image, sample_label = train_set[0]
sample_label

tensor(910)

In [55]:
class AlexNetClassifier(nn.Module):
    """AlexNet-style image classifier.

    Five convolutional layers (with ReLU and max-pooling) feed an adaptive
    average pool that fixes the feature map at 256 x 6 x 6, followed by three
    fully-connected layers ending in ``n_classes`` logits.

    Args:
        n_classes: Size of the output layer (1000 for ImageNet).
    """

    def __init__(self, n_classes: int=1000):
        super().__init__()
        self.n_classes = n_classes

        # The convolutions are kept as named attributes and are also the
        # entries of ``model_features`` below (same module objects).
        self.conv1 = nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=2)
        self.conv2 = nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2)
        self.conv3 = nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1)

        def relu() -> nn.ReLU:
            # In-place activation, one fresh module per use.
            return nn.ReLU(inplace=True)

        def pool() -> nn.MaxPool2d:
            # Overlapping pooling: 3 x 3 window moved with stride 2.
            return nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2))

        feature_layers = [
            self.conv1, relu(), pool(),
            self.conv2, relu(), pool(),
            self.conv3, relu(),
            self.conv4, relu(),
            self.conv5, relu(), pool(),
        ]
        self.model_features = nn.Sequential(*feature_layers)

        # Forces a 6 x 6 spatial size regardless of the incoming feature map.
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=(6, 6))

        # Width of the two hidden fully-connected layers, as in the paper.
        self.n_neurons = 4096
        flattened_size = 256 * 6 * 6
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(flattened_size, self.n_neurons),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(self.n_neurons, self.n_neurons),
            nn.ReLU(inplace=True),
            nn.Linear(self.n_neurons, self.n_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits of shape ``(batch, n_classes)`` for image batch ``x``."""
        pooled = self.avg_pool(self.model_features(x))
        # Keep the batch dimension, flatten channels and spatial positions.
        return self.classifier(pooled.flatten(start_dim=1))


In [57]:
# Size the output layer from the shared `n_classes` setting so the model and
# the configuration cell cannot drift apart.
model = AlexNetClassifier(n_classes=n_classes).to(device)
criterion = nn.CrossEntropyLoss()
# NOTE(review): the training log recorded in this notebook plateaus at a loss
# of about 6.83, close to ln(1000) ≈ 6.91 (uniform guessing over 1000 classes);
# the optimizer settings probably need tuning (e.g. a smaller learning rate) —
# confirm experimentally.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [58]:
def train(model, loader, criterion, optimizer,
          num_epochs: Optional[int] = None,
          device: Optional[torch.device] = None):
    """Train ``model`` on ``loader`` and report the mean loss of every epoch.

    Args:
        model: Network to optimise; put into training mode by this function.
        loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        criterion: Callable mapping ``(outputs, targets)`` to a scalar loss.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``loader``. Defaults to the
            notebook-level ``epochs`` setting.
        device: Device batches are moved to. Defaults to the device of the
            model's first parameter (CPU if it has none).

    Returns:
        list[float]: Mean batch loss for each epoch, in order.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    if num_epochs is None:
        num_epochs = epochs  # notebook-level default
    if device is None:
        # Follow the model so inputs always land next to the weights.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        # Counted by hand so iterables without ``__len__`` work too.
        num_batches = 0
        for imgs, targets in loader:
            imgs, targets = imgs.to(device), targets.to(device)

            # Forward pass
            outputs = model(imgs)
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("loader produced no batches; cannot compute a mean loss")

        mean_loss = total_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}")

    return epoch_losses

In [59]:
# Evaluation function
def evaluate(model, loader, device: Optional[torch.device] = None):
    """Compute and print the top-1 accuracy of ``model`` over ``loader``.

    Args:
        model: Network to evaluate; put into evaluation mode by this function.
        loader: Iterable yielding ``(images, labels)`` tensor batches.
        device: Device batches are moved to. Defaults to the device of the
            model's first parameter (CPU if it has none).

    Returns:
        float: Accuracy as a percentage in ``[0, 100]``.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    if device is None:
        # Follow the model so inputs always land next to the weights.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    correct = 0
    total = 0
    # No gradients are needed for scoring.
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            # Predicted class = index of the largest logit in each row.
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("loader produced no samples; accuracy is undefined")

    accuracy = 100 * correct / total
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy

In [60]:
# Fit on the training loader, then score on the held-out validation loader.
train(model=model, loader=train_loader, criterion=criterion, optimizer=optimizer)
evaluate(model=model, loader=val_loader)

Epoch [1/100], Loss: 6.9353
Epoch [2/100], Loss: 6.9019
Epoch [3/100], Loss: 6.8989
Epoch [4/100], Loss: 6.9067
Epoch [5/100], Loss: 6.8713
Epoch [6/100], Loss: 6.8653
Epoch [7/100], Loss: 6.8580
Epoch [8/100], Loss: 6.8508
Epoch [9/100], Loss: 6.8474
Epoch [10/100], Loss: 6.8394
Epoch [11/100], Loss: 6.8448
Epoch [12/100], Loss: 6.8373
Epoch [13/100], Loss: 6.8369
Epoch [14/100], Loss: 6.8467
Epoch [15/100], Loss: 6.8301
Epoch [16/100], Loss: 6.8330
Epoch [17/100], Loss: 6.8299
Epoch [18/100], Loss: 6.8362
Epoch [19/100], Loss: 6.8308
Epoch [20/100], Loss: 6.8340
Epoch [21/100], Loss: 6.8320
Epoch [22/100], Loss: 6.8332
Epoch [23/100], Loss: 6.8305
Epoch [24/100], Loss: 6.8390
Epoch [25/100], Loss: 6.8360
Epoch [26/100], Loss: 6.8249
Epoch [27/100], Loss: 6.8279
Epoch [28/100], Loss: 6.8316
Epoch [29/100], Loss: 6.8315
Epoch [30/100], Loss: 6.8307
Epoch [31/100], Loss: 6.8289
Epoch [32/100], Loss: 6.8336
Epoch [33/100], Loss: 6.8350
Epoch [34/100], Loss: 6.8287
Epoch [35/100], Loss: 6

In [68]:
# Single-image inference: evaluation mode disables dropout, and no_grad avoids
# building an autograd graph for a forward pass that is never back-propagated.
model.eval()
with torch.no_grad():
    # unsqueeze(0) adds the leading batch dimension before the forward pass.
    prediction = model(train_set[0][0].unsqueeze(0).to(device))
prediction

tensor([[-3.6689e-01, -1.3814e+00,  1.0025e-01, -3.8229e-01, -7.9428e-02,
         -1.3752e+00, -7.3961e-01, -7.5484e-01, -7.5546e-01, -3.8401e-01,
         -3.6418e-01, -4.5579e-02, -3.6532e-01, -3.6226e-01, -7.3443e-01,
         -1.3636e+00, -3.5683e-01, -3.6859e-01, -7.5375e-01, -3.6686e-01,
         -3.5449e-01, -3.5599e-01, -3.8052e-01, -3.7812e-01, -3.5293e-01,
         -7.5091e-01, -8.8756e-02, -7.5728e-01,  6.0805e-02, -1.3732e+00,
         -3.1126e-01, -3.6534e-01, -7.5499e-01, -3.5745e-01, -3.7124e-01,
         -3.6876e-01, -8.7290e-02, -7.3377e-01, -3.6198e-01, -3.5530e-01,
         -4.0098e-01, -3.5933e-01, -7.8039e-02, -3.9074e-01, -3.5977e-01,
         -3.5405e-01, -7.4658e-01, -8.4408e-02, -7.3806e-01, -7.5758e-01,
         -3.5089e-01, -3.6132e-01, -3.5559e-01, -1.3478e+00, -1.0758e-01,
         -6.8144e-01, -1.3580e+00, -7.7384e-01, -3.6308e-01, -3.6646e-01,
         -8.4236e-02, -3.6630e-01, -3.7683e-01, -1.0148e-01, -3.6389e-01,
         -7.4213e-01, -3.7279e-01, -6.

In [72]:
# Top-1 class: position of the largest logit. `prediction` holds a single row,
# so the flattened argmax index is the class index.
majority_class_pred = prediction.argmax()
majority_class_pred.item()

902

In [74]:
# Ground-truth label of the same sample, for comparison with the prediction.
sample_image, sample_label = train_set[0]
sample_label.item()

910