## Data Processing & Loading

In [6]:
# src/data.py


import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms as T

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]



def _tfms(image_size=224):
    """Build the torchvision preprocessing pipelines for each data split.

    Args:
        image_size: Side length (pixels) of the square crop fed to the model.

    Returns:
        A dict with keys ``"train"``, ``"val"`` and ``"test"``. The train
        pipeline applies a random resized crop and a random horizontal flip;
        ``"val"`` and ``"test"`` share one deterministic resize + center-crop
        pipeline object. Both end with ``ToTensor`` and normalization by
        ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    # Resize to 1.14x the crop size first so the crop has some margin to work with.
    resize_to = int(image_size * 1.14)

    def _tail():
        # Fresh instances per pipeline: PIL image -> tensor -> normalized tensor.
        return [T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]

    augmenting = [
        T.Resize(resize_to),
        T.RandomResizedCrop(image_size, scale=(0.8, 1.0)),
        T.RandomHorizontalFlip(p=0.5),
    ]
    deterministic = [
        T.Resize(resize_to),
        T.CenterCrop(image_size),
    ]

    train_pipeline = T.Compose(augmenting + _tail())
    eval_pipeline = T.Compose(deterministic + _tail())
    return dict(train=train_pipeline, val=eval_pipeline, test=eval_pipeline)



class CSVDataset(Dataset):
    """Image-classification dataset described by a CSV manifest.

    The CSV must contain a ``filename`` column (joined onto ``img_dir`` to
    locate the image) and a ``label`` column (converted with ``int``).

    Args:
        csv_path: Path to the CSV manifest.
        img_dir: Directory that the ``filename`` entries are joined onto.
        transforms: Optional callable applied to the RGB PIL image.

    Raises:
        ValueError: If the CSV lacks a ``filename`` or ``label`` column.
    """

    def __init__(self, csv_path, img_dir, transforms=None):
        self.df = pd.read_csv(csv_path)
        if not {"filename", "label"}.issubset(self.df.columns):
            raise ValueError(f"{csv_path} must have 'filename' and 'label' columns.")
        self.img_dir = img_dir
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows in the manifest."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at positional index ``idx``."""
        row = self.df.iloc[idx]
        path = os.path.join(self.img_dir, str(row["filename"]))
        # Close the underlying file deterministically; convert() returns an
        # independent in-memory image, so it stays valid after the handle closes.
        with Image.open(path) as src:
            img = src.convert("RGB")
        y = int(row["label"])
        # Explicit None check: a valid transform object may be falsy
        # (e.g. a container defining __len__), and must still be applied.
        if self.transforms is not None:
            img = self.transforms(img)
        return img, y



def make_loaders(
    data_dir="data",
    images_subdir="images",
    batch_size=32,
    num_workers=0,         # macOS: start at 0; bump later if stable
    image_size=224,
):
    """Create train/val/test ``DataLoader`` objects from CSV manifests.

    Expects ``train.csv``, ``val.csv`` and ``test.csv`` directly inside
    ``data_dir``; images are looked up under
    ``os.path.join(data_dir, images_subdir)``.

    Args:
        data_dir: Directory holding the three CSV manifests.
        images_subdir: Image directory, joined onto ``data_dir``.
        batch_size: Batch size used for every split.
        num_workers: Worker process count passed to every ``DataLoader``.
        image_size: Crop size forwarded to ``_tfms``.

    Returns:
        Dict mapping ``"train"``, ``"val"``, ``"test"`` to a ``DataLoader``.
        Only the train loader shuffles and uses the augmenting transforms.
    """
    split_names = ("train", "val", "test")
    image_root = os.path.join(data_dir, images_subdir)
    pipelines = _tfms(image_size)

    # Build every dataset first so a missing/invalid CSV fails before any loader exists.
    datasets = {}
    for name in split_names:
        manifest = os.path.join(data_dir, f"{name}.csv")
        pipeline_key = "train" if name == "train" else "val"
        datasets[name] = CSVDataset(manifest, image_root, transforms=pipelines[pipeline_key])

    return {
        name: DataLoader(
            datasets[name],
            batch_size=batch_size,
            shuffle=(name == "train"),
            num_workers=num_workers,
        )
        for name in split_names
    }


In [7]:
# Smoke test: build the loaders from a machine-specific absolute path and pull
# a single batch from the training loader.
data_dir = "/Users/sarvesh/Desktop/GitHub/lego-minifigure-finder/data"

loaders = make_loaders(data_dir=data_dir, images_subdir="images", batch_size=16)
imgs, labels = next(iter(loaders["train"]))
# Show the image batch shape and the first eight labels of that batch.
print(imgs.shape, labels[:8])

torch.Size([16, 3, 224, 224]) tensor([0, 0, 1, 0, 0, 0, 0, 1])


## Model Training

In [None]:

import os
import time
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Device priority: CUDA first, then Apple MPS, then CPU as the fallback.
# The chained conditional is evaluated lazily, so the MPS check only runs
# when CUDA is unavailable.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
device


device(type='mps')

In [11]:
# Point to your data directory
DATA_DIR = "/Users/sarvesh/Desktop/GitHub/lego-minifigure-finder/data"          # <- adjust if needed
# NOTE: despite the name this is an absolute path, not a sub-directory name.
# make_loaders() passes it to os.path.join(data_dir, images_subdir), and
# os.path.join discards data_dir when the second component is absolute, so the
# images are still found here.
IMAGES_SUBDIR = "/Users/sarvesh/Desktop/GitHub/lego-minifigure-finder/data/images"   # <- adjust if needed

# # If you created make_loaders earlier:
# try:
#     from src.data import make_loaders
# except Exception:
#     # Fallback: assume data.py at repo root
#     from data import make_loaders

# Build loaders; no pin_memory, num_workers=0 is safest on macOS
loaders = make_loaders(
    data_dir=DATA_DIR,
    images_subdir=IMAGES_SUBDIR,
    batch_size=32,
    num_workers=0,          # bump to 2-4 later if stable
    image_size=224
)
# Report how many batches each split yields at this batch size.
for k,v in loaders.items():
    print(k, len(v), "batches")


train 30 batches
val 6 batches
test 7 batches


In [12]:
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed``; also to ``torch.cuda.manual_seed_all``
            when CUDA is available.
    """
    import random
    import numpy as np
    import torch

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and report loss and accuracy.

    Args:
        model: Module producing class logits of shape ``[batch, classes]``.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable taking ``(logits, targets)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        ``(epoch_loss, epoch_acc)`` as Python floats, both averaged over the
        samples actually yielded by ``loader``.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for imgs, y in loader:
        imgs = imgs.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        batch_n = imgs.size(0)
        # Weight by batch size so a smaller final batch is averaged correctly
        # (assumes the criterion returns a per-batch mean).
        loss_sum += loss.item() * batch_n
        n_correct += (logits.argmax(dim=1) == y).sum().item()
        n_seen += batch_n

    # Normalise by the samples actually processed rather than
    # len(loader.dataset): the two differ with drop_last=True or a subset
    # sampler, and iterable datasets have no length at all.
    if n_seen == 0:
        raise ValueError("train_one_epoch: loader yielded no samples")
    return loss_sum / n_seen, n_correct / n_seen

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Module producing class logits of shape ``[batch, classes]``.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable taking ``(logits, targets)``.
        device: Device the batches are moved to.

    Returns:
        ``(epoch_loss, epoch_acc, y_true, y_pred)`` where loss and accuracy
        are averaged over the samples actually yielded by ``loader`` and
        ``y_true`` / ``y_pred`` are NumPy arrays in iteration order.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_seen = 0
    preds_all, y_all = [], []

    for imgs, y in loader:
        imgs = imgs.to(device)
        y = y.to(device)

        logits = model(imgs)
        loss = criterion(logits, y)

        batch_n = imgs.size(0)
        # Weight by batch size so a smaller final batch is averaged correctly
        # (assumes the criterion returns a per-batch mean).
        loss_sum += loss.item() * batch_n
        n_seen += batch_n

        preds_all.append(logits.argmax(dim=1).detach().cpu())
        y_all.append(y.detach().cpu())

    # Normalise by the samples actually processed rather than
    # len(loader.dataset): the two differ with drop_last=True or a subset
    # sampler, and iterable datasets have no length at all.
    if n_seen == 0:
        raise ValueError("evaluate: loader yielded no samples")

    y_true = torch.cat(y_all).numpy()
    y_pred = torch.cat(preds_all).numpy()
    epoch_acc = float((y_true == y_pred).mean())
    return loss_sum / n_seen, epoch_acc, y_true, y_pred

def fit(model, loaders, epochs, lr=1e-3, weight_decay=0.0):
    """Train ``model`` with Adam and restore the best-validation weights.

    Uses the module-level ``device`` plus ``train_one_epoch`` and
    ``evaluate`` defined alongside this function.

    Args:
        model: Module to train; moved to ``device``.
        loaders: Mapping with ``"train"`` and ``"val"`` loaders.
        epochs: Number of epochs to run.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        ``(model, history)`` where ``model`` carries the weights from the
        epoch with the highest validation accuracy (first such epoch on ties)
        and ``history`` is a list of per-epoch metric dicts.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_acc = -1
    best_state = None
    history = []

    for epoch in range(1, epochs+1):
        t0 = time.time()
        tr_loss, tr_acc = train_one_epoch(model, loaders["train"], criterion, optimizer, device)
        va_loss, va_acc, _, _ = evaluate(model, loaders["val"], criterion, device)
        dt = time.time() - t0

        history.append({"epoch": epoch, "train_loss": tr_loss, "train_acc": tr_acc,
                        "val_loss": va_loss, "val_acc": va_acc, "sec": dt})
        print(f"[{epoch:02d}] "
              f"train_loss={tr_loss:.4f} acc={tr_acc:.4f} | "
              f"val_loss={va_loss:.4f} acc={va_acc:.4f} | {dt:.1f}s")

        if va_acc > best_val_acc:
            best_val_acc = va_acc
            # clone() is essential: Tensor.cpu() returns the very same tensor
            # when the model already lives on the CPU, so without a copy the
            # snapshot would alias the live weights and be overwritten by
            # later epochs.
            best_state = {
                k: v.detach().cpu().clone() for k, v in model.state_dict().items()
            }

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, history


In [13]:
class SmallCNN(nn.Module):
    """
    Compact convolutional classifier for fast experiments.

    Four stride-2 3x3 conv stages (each followed by BatchNorm and ReLU) widen
    the channels 3 -> 32 -> 64 -> 128 -> 256, then global average pooling,
    dropout and one linear layer produce ``num_classes`` logits.
    For a [B, 3, 224, 224] input the spatial size goes 112 -> 56 -> 28 -> 14.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        channel_plan = (3, 32, 64, 128, 256)

        # Assemble conv/BN/ReLU triples in order so sub-module indices (and
        # therefore state_dict keys) are features.0 ... features.11.
        stages = []
        for c_in, c_out in zip(channel_plan, channel_plan[1:]):
            stages.extend([
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ])
        stages.append(nn.AdaptiveAvgPool2d((1, 1)))  # -> [B, 256, 1, 1]
        self.features = nn.Sequential(*stages)

        head = [
            nn.Flatten(),
            nn.Dropout(0.25),
            nn.Linear(channel_plan[-1], num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for an image batch ``x``."""
        return self.classifier(self.features(x))


small_cnn = SmallCNN(num_classes=2)
small_cnn


SmallCNN(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (7): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (10): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU(inplace=True)
    (12): AdaptiveAvgPool2d(output_size=(1, 1))
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Dropout(p=0.25, inplace=False)
    (2): Linear(in_features=

In [15]:
# Train the small CNN; fit() returns the model together with a list of
# per-epoch metric dicts, which is kept here as history_cnn.
small_cnn, history_cnn = fit(
    small_cnn,
    loaders,
    epochs=10,           # start small; increase if improving
    lr=1e-3,
    weight_decay=1e-4
)

[01] train_loss=0.3656 acc=0.8459 | val_loss=0.3259 acc=0.8871 | 8.8s
[02] train_loss=0.3551 acc=0.8375 | val_loss=0.2697 acc=0.8978 | 8.4s
[03] train_loss=0.3143 acc=0.8711 | val_loss=0.2370 acc=0.9032 | 8.2s
[04] train_loss=0.3159 acc=0.8627 | val_loss=0.2624 acc=0.8763 | 8.3s
[05] train_loss=0.3083 acc=0.8658 | val_loss=0.2327 acc=0.8925 | 8.5s
[06] train_loss=0.2896 acc=0.8690 | val_loss=0.2802 acc=0.8602 | 8.7s
[07] train_loss=0.3034 acc=0.8774 | val_loss=0.2288 acc=0.8978 | 8.2s
[08] train_loss=0.2867 acc=0.8679 | val_loss=0.2742 acc=0.8925 | 8.3s
[09] train_loss=0.2852 acc=0.8805 | val_loss=0.2222 acc=0.8871 | 8.6s
[10] train_loss=0.3095 acc=0.8595 | val_loss=0.4176 acc=0.8387 | 8.4s


In [16]:
# Score the trained SmallCNN on the held-out test loader and print the
# loss/accuracy, the per-class precision/recall/F1 report, and the confusion matrix.
crit = nn.CrossEntropyLoss()
test_loss, test_acc, y_true, y_pred = evaluate(small_cnn, loaders["test"], crit, device)
print(f"SmallCNN Test: loss={test_loss:.4f}, acc={test_acc:.4f}")
print(classification_report(y_true, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))


SmallCNN Test: loss=0.4462, acc=0.8148
              precision    recall  f1-score   support

           0     0.8129    0.9456    0.8742       147
           1     0.8222    0.5362    0.6491        69

    accuracy                         0.8148       216
   macro avg     0.8175    0.7409    0.7617       216
weighted avg     0.8159    0.8148    0.8023       216

Confusion matrix:
 [[139   8]
 [ 32  37]]


### ResNet

In [17]:
from torchvision import models

# Load ResNet-18 with the IMAGENET1K_V1 pretrained weights via the
# torchvision `weights=` argument.
resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Replace final layer for 2 classes (new head keeps the original input width)
in_feats = resnet.fc.in_features
resnet.fc = nn.Linear(in_feats, 2)

# Option 1 (recommended to start): fine-tune ALL layers
# Every parameter is explicitly marked trainable.
for p in resnet.parameters():
    p.requires_grad = True

resnet.to(device)
resnet


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /Users/sarvesh/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:03<00:00, 12.0MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [18]:
# Fine-tune the pretrained ResNet-18; fit() returns the model together with a
# list of per-epoch metric dicts, which is kept here as history_resnet.
resnet, history_resnet = fit(
    resnet,
    loaders,
    epochs=5,         # try 5–15; watch val accuracy
    lr=3e-4,          # slightly smaller LR for pretrained nets
    weight_decay=1e-4
)


[01] train_loss=0.1731 acc=0.9193 | val_loss=0.1347 acc=0.9785 | 17.5s
[02] train_loss=0.0527 acc=0.9769 | val_loss=0.0274 acc=0.9892 | 13.2s
[03] train_loss=0.0553 acc=0.9790 | val_loss=0.0400 acc=0.9892 | 13.3s
[04] train_loss=0.0283 acc=0.9843 | val_loss=0.0026 acc=1.0000 | 13.3s
[05] train_loss=0.0283 acc=0.9927 | val_loss=0.0112 acc=0.9946 | 13.0s


In [19]:
# Score the fine-tuned ResNet-18 on the held-out test loader and print the
# loss/accuracy, the per-class precision/recall/F1 report, and the confusion matrix.
crit = nn.CrossEntropyLoss()
test_loss, test_acc, y_true, y_pred = evaluate(resnet, loaders["test"], crit, device)
print(f"ResNet18 Test: loss={test_loss:.4f}, acc={test_acc:.4f}")
print(classification_report(y_true, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))


ResNet18 Test: loss=0.0835, acc=0.9630
              precision    recall  f1-score   support

           0     0.9484    1.0000    0.9735       147
           1     1.0000    0.8841    0.9385        69

    accuracy                         0.9630       216
   macro avg     0.9742    0.9420    0.9560       216
weighted avg     0.9649    0.9630    0.9623       216

Confusion matrix:
 [[147   0]
 [  8  61]]
