In [22]:
import os
import json
from pathlib import Path

import numpy as np
from PIL import Image

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from cardd_dataset import CarDDMultiLabelDataset

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0)


In [23]:
data_root = "../data/raw"

train_json_path = os.path.join(data_root, "train", "annotations.json")
train_img_dir = os.path.join(data_root, "train", "images")

val_json_path = os.path.join(data_root, "val", "annotations.json")
val_img_dir = os.path.join(data_root, "val", "images")

num_classes = 6  # dent, scratch, crack, glass shatter, lamp broken, tire flat


def load_coco_multilabel(json_path, img_dir, num_classes):
    with open(json_path, "r") as f:
        coco = json.load(f)

    # category_id -> index [0..num_classes-1]
    cat_ids = sorted([c["id"] for c in coco["categories"]])
    cat_id_to_index = {cid: i for i, cid in enumerate(cat_ids)}

    # image_id -> file_name
    id_to_file = {im["id"]: im["file_name"] for im in coco["images"]}

    # image_id -> set of class indices
    image_to_classes = {img_id: set() for img_id in id_to_file.keys()}
    for ann in coco["annotations"]:
        img_id = ann["image_id"]
        cid = ann["category_id"]
        if cid in cat_id_to_index:
            image_to_classes[img_id].add(cat_id_to_index[cid])

    samples = []
    for img_id, file_name in id_to_file.items():
        label = np.zeros(num_classes, dtype=np.float32)
        for cls_idx in image_to_classes[img_id]:
            label[cls_idx] = 1.0

        samples.append(
            {
                "image_path": os.path.join(img_dir, file_name),
                "label": label,
            }
        )

    print(f"{os.path.basename(os.path.dirname(json_path))}: {len(samples)} images loaded")
    return samples, cat_id_to_index, coco["categories"]


train_samples, cat_id_to_index, categories = load_coco_multilabel(
    train_json_path, train_img_dir, num_classes
)

val_samples, _, _ = load_coco_multilabel(
    val_json_path, val_img_dir, num_classes
)

print("category mapping (category_id -> index):", cat_id_to_index)
print("classes:", [c["name"] for c in categories])


train: 2816 images loaded
val: 810 images loaded
category mapping (category_id -> index): {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5}
classes: ['dent', 'scratch', 'crack', 'glass shatter', 'lamp broken', 'tire flat']


In [24]:
class CarDDMultiLabelDataset(Dataset):
    def __init__(self, samples, transform=None):
        self.samples = samples
        self.transform = transform

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        img_path = sample["image_path"]
        label = sample["label"]

        image = Image.open(img_path).convert("RGB")
        if self.transform is not None:
            image = self.transform(image)

        label_tensor = torch.from_numpy(label)  # shape: (num_classes,)
        return image, label_tensor


In [25]:
image_size = 224

train_transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.02,
    ),
    transforms.RandomRotation(degrees=5),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # ImageNet
        std=[0.229, 0.224, 0.225],
    ),
])

val_transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

train_dataset = CarDDMultiLabelDataset(train_samples, transform=train_transform)
val_dataset = CarDDMultiLabelDataset(val_samples, transform=val_transform)

batch_size = 32

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

print("train batches:", len(train_loader), "val batches:", len(val_loader))


train batches: 88 val batches: 26


In [26]:
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else "cpu"
)

print("using device:", device)

# load imagenet-pretrained resnet50
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

in_features = resnet.fc.in_features

# small dense head: 2048 -> 256 -> 6
resnet.fc = nn.Sequential(
    nn.Linear(in_features, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, num_classes),
)

model = resnet.to(device)

print(model.fc)


using device: mps
Sequential(
  (0): Linear(in_features=2048, out_features=256, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.3, inplace=False)
  (3): Linear(in_features=256, out_features=6, bias=True)
)


In [27]:
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

num_epochs = 20
learning_rate = 1e-4
weight_decay = 1e-4

criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=5,          # restart every 5 epochs
    T_mult=2,       # restarts get farther apart
    eta_min=1e-6    # minimum LR
)

class_names = [c["name"] for c in categories]
print("classes:", class_names)


classes: ['dent', 'scratch', 'crack', 'glass shatter', 'lamp broken', 'tire flat']


In [34]:
def multilabel_accuracy(logits, targets, threshold=0.5):
    """
    Compute multi-label accuracy.
    Accuracy = % of labels that match the ground truth (per sample).
    """
    probs = torch.sigmoid(logits)
    preds = (probs > threshold).float()

    correct = (preds == targets).float().mean()  # mean across classes per sample
    return correct.item()


In [38]:
def sigmoid_to_preds(logits, threshold=0.5):
    probs = torch.sigmoid(logits)
    return (probs >= threshold).float()


def update_metrics(logits, targets, metrics):
    preds = sigmoid_to_preds(logits)

    tp = (preds * targets).sum(dim=0)
    fp = (preds * (1 - targets)).sum(dim=0)
    fn = ((1 - preds) * targets).sum(dim=0)

    metrics["tp"] += tp.cpu()
    metrics["fp"] += fp.cpu()
    metrics["fn"] += fn.cpu()


def compute_f1(metrics):
    tp = metrics["tp"]
    fp = metrics["fp"]
    fn = metrics["fn"]

    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    micro_tp = tp.sum()
    micro_fp = fp.sum()
    micro_fn = fn.sum()

    micro_precision = micro_tp / (micro_tp + micro_fp + 1e-8)
    micro_recall = micro_tp / (micro_tp + micro_fn + 1e-8)
    micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall + 1e-8)

    return {
        "precision": precision.numpy(),
        "recall": recall.numpy(),
        "f1": f1.numpy(),
        "micro_f1": micro_f1.item(),
    }


In [None]:
best_val_f1 = 0.0
checkpoint_path = "resnet50_multilabel_cardd_best.pt"

for epoch in range(1, num_epochs + 1):
    # train
    train_loss = 0.0
    train_acc = 0.0

    for images, targets in train_loader:
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(images)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * images.size(0)
        train_acc += multilabel_accuracy(logits, targets) * images.size(0)

    train_loss /= len(train_dataset)
    train_acc /= len(train_dataset)

    # val
    model.eval()
    val_loss = 0.0
    val_acc = 0.0
    metrics = {
        "tp": torch.zeros(num_classes),
        "fp": torch.zeros(num_classes),
        "fn": torch.zeros(num_classes),
    }

    with torch.no_grad():
        for images, targets in val_loader:
            images = images.to(device)
            targets = targets.to(device)

            logits = model(images)
            loss = criterion(logits, targets)
            val_loss += loss.item() * images.size(0)
            val_acc += multilabel_accuracy(logits, targets) * images.size(0)

            update_metrics(logits, targets, metrics)

    val_loss /= len(val_dataset)
    val_acc /= len(val_dataset)
    stats = compute_f1(metrics)
    class_f1 = stats["f1"]
    micro_f1 = stats["micro_f1"]

    # report
    print(f"\nEpoch {epoch}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")
    print(f"  Val Micro F1: {micro_f1:.4f}")

    for i, name in enumerate(class_names):
        print(f"    {name:15s} F1: {class_f1[i]:.4f}") 

    #checkpoint
    if micro_f1 > best_val_f1:
        best_val_f1 = micro_f1
        torch.save(model.state_dict(), checkpoint_path)
        print(f"  Saved new best model → {checkpoint_path}")

    scheduler.step()

print("\nTraining complete. Best val micro F1:", best_val_f1)



Epoch 1/20
  Train Loss: 0.0813 | Train Acc: 0.9676
  Val Loss:   0.2541 | Val Acc:   0.9097
  Val Micro F1: 0.8218
    dent            F1: 0.8317
    scratch         F1: 0.8359
    crack           F1: 0.6496
    glass shatter   F1: 0.9286
    lamp broken     F1: 0.7518
    tire flat       F1: 0.9174
  Saved new best model → resnet50_multilabel_cardd_best.pt

Epoch 2/20
  Train Loss: 0.0599 | Train Acc: 0.9775
  Val Loss:   0.2889 | Val Acc:   0.9121
  Val Micro F1: 0.8325
    dent            F1: 0.8238
    scratch         F1: 0.8421
    crack           F1: 0.6996
    glass shatter   F1: 0.9663
    lamp broken     F1: 0.7806
    tire flat       F1: 0.9256
  Saved new best model → resnet50_multilabel_cardd_best.pt

Epoch 3/20
  Train Loss: 0.0350 | Train Acc: 0.9871
  Val Loss:   0.3584 | Val Acc:   0.9148
  Val Micro F1: 0.8390
    dent            F1: 0.8443
    scratch         F1: 0.8495
    crack           F1: 0.6989
    glass shatter   F1: 0.9556
    lamp broken     F1: 0.7770
    

In [61]:
state = torch.load("resnet50_multilabel_cardd_best.pt", map_location="cpu")

for k in state.keys():
    print(k)


conv1.weight
bn1.weight
bn1.bias
bn1.running_mean
bn1.running_var
bn1.num_batches_tracked
layer1.0.conv1.weight
layer1.0.bn1.weight
layer1.0.bn1.bias
layer1.0.bn1.running_mean
layer1.0.bn1.running_var
layer1.0.bn1.num_batches_tracked
layer1.0.conv2.weight
layer1.0.bn2.weight
layer1.0.bn2.bias
layer1.0.bn2.running_mean
layer1.0.bn2.running_var
layer1.0.bn2.num_batches_tracked
layer1.0.conv3.weight
layer1.0.bn3.weight
layer1.0.bn3.bias
layer1.0.bn3.running_mean
layer1.0.bn3.running_var
layer1.0.bn3.num_batches_tracked
layer1.0.downsample.0.weight
layer1.0.downsample.1.weight
layer1.0.downsample.1.bias
layer1.0.downsample.1.running_mean
layer1.0.downsample.1.running_var
layer1.0.downsample.1.num_batches_tracked
layer1.1.conv1.weight
layer1.1.bn1.weight
layer1.1.bn1.bias
layer1.1.bn1.running_mean
layer1.1.bn1.running_var
layer1.1.bn1.num_batches_tracked
layer1.1.conv2.weight
layer1.1.bn2.weight
layer1.1.bn2.bias
layer1.1.bn2.running_mean
layer1.1.bn2.running_var
layer1.1.bn2.num_batches_tr

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

def load_trained_resnet(num_classes=6):
    model = models.resnet50(weights=None)   # DO NOT wrap in a class

    # exact classifier you used during training
    model.fc = nn.Sequential(
        nn.Linear(2048, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, num_classes)
    )

    return model

model = load_trained_resnet(num_classes=6)

state = torch.load("resnet50_multilabel_cardd_best.pt", map_location="cpu")

model.load_state_dict(state) 
model.eval()
print("Model loaded successfully!")





Model loaded successfully!


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.models as models
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


# recreate trained model w/ loaded weights

def load_trained_resnet(num_classes=6):
    model = models.resnet50(weights=None)
    model.fc = nn.Sequential(
        nn.Linear(2048, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, num_classes)
    )
    return model

model = load_trained_resnet(num_classes=6).to(device)

state = torch.load("resnet50_multilabel_cardd_best.pt", map_location=device)
model.load_state_dict(state)
model.eval()

print("Model loaded successfully!")


# compute metrics

criterion = nn.BCEWithLogitsLoss()

def update_metrics(logits, targets, metrics, threshold=0.5):
    preds = (torch.sigmoid(logits) >= threshold).float()

    metrics["tp"] += ((preds == 1) & (targets == 1)).sum(dim=0).cpu()
    metrics["fp"] += ((preds == 1) & (targets == 0)).sum(dim=0).cpu()
    metrics["fn"] += ((preds == 0) & (targets == 1)).sum(dim=0).cpu()

def compute_f1(metrics):
    tp, fp, fn = metrics["tp"], metrics["fp"], metrics["fn"]
    f1 = (2 * tp) / (2 * tp + fp + fn + 1e-7)
    micro_f1 = (2 * tp.sum()) / (2 * tp.sum() + fp.sum() + fn.sum() + 1e-7)
    return f1, micro_f1


# run on test set

test_loss = 0
metrics = {"tp": torch.zeros(6), "fp": torch.zeros(6), "fn": torch.zeros(6)}

with torch.no_grad():
    for images, labels in tqdm(test_loader, desc="Evaluating"):
        images = images.to(device)
        labels = labels.to(device)

        logits = model(images)
        loss = criterion(logits, labels)

        test_loss += loss.item() * images.size(0)

        update_metrics(logits, labels, metrics)

test_loss /= len(test_dataset)
f1_per_class, micro_f1 = compute_f1(metrics)

print("\n========== TEST RESULTS ==========")
print(f"Test Loss: {test_loss:.4f}")
print(f"Micro-F1:  {micro_f1:.4f}")
print("\nPer-class F1:")
for i, f1 in enumerate(f1_per_class):
    print(f"  Class {i}: {f1:.4f}")


Device: cpu
Model loaded successfully!


Evaluating: 100%|██████████| 12/12 [00:30<00:00,  2.56s/it]


Test Loss: 0.3428
Micro-F1:  0.8411

Per-class F1:
  Class 0: 0.8443
  Class 1: 0.8944
  Class 2: 0.5800
  Class 3: 0.9333
  Class 4: 0.7634
  Class 5: 0.9000





In [3]:
!pip install scikit-learn pandas
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import pandas as pd

# compute classification report and per-class confusion matrices for multilabel test set


model.eval()
y_trues = []
y_preds = []

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        logits = model(images)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds = (probs >= 0.5).astype(int)

        y_preds.append(preds)
        y_trues.append(labels.cpu().numpy().astype(int))

y_true = np.vstack(y_trues)   # shape: (N, num_classes)
y_pred = np.vstack(y_preds)

# classification report (per-class)
print("Classification report (per-class):\n")
print(classification_report(y_true, y_pred, target_names=class_names, zero_division=0))

# confusion matrices per class and summary table (TP, FP, FN, TN)
rows = []
for i, name in enumerate(class_names):
    cm = confusion_matrix(y_true[:, i], y_pred[:, i])
    # pad to 2x2 if necessary
    cm2 = np.zeros((2, 2), dtype=int)
    cm2[: cm.shape[0], : cm.shape[1]] = cm
    tn, fp, fn, tp = cm2.ravel()
    rows.append({"class": name, "TP": int(tp), "FP": int(fp), "FN": int(fn), "TN": int(tn)})
    print(f"\nConfusion matrix for '{name}':\n{cm2}")

summary_df = pd.DataFrame(rows).set_index("class")
print("\nPer-class confusion summary:\n")
print(summary_df)

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting pandas
  Using cached pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading scikit_learn-1.7.2-cp310-cp310-macosx_12_0_arm64.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m10.4 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hUsing cached pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl (10.8 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Downloading pytz-20

NameError: name 'model' is not defined