In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import timm
from sklearn.metrics import multilabel_confusion_matrix, classification_report
import numpy as np
from tqdm import tqdm
import os

In [2]:
from utils import CarDDDataset

In [3]:
# Load .pt files
# NOTE(review): torch.load deserialises with pickle, which can execute arbitrary code
# while loading -- only point these paths at files from a trusted source.
# Presumably the `from utils import CarDDDataset` cell above is needed so the stored
# objects can be unpickled -- TODO confirm.
train_data = torch.load("train_dataset.pt")
val_data = torch.load("val_dataset.pt")
test_data = torch.load("test_dataset.pt")

# Verify dataset structure
# Each split is indexed with [0] below; the printed sample shows which keys a record
# carries (later cells read "image" and "labels").
print("Training data example:", train_data[0])  # Check the first entry for structure
print("Validation data example:", val_data[0])
print("Test data example:", test_data[0])


Training data example: {'image': <PIL.Image.Image image mode=RGB size=1000x750 at 0x7F7E1CDF73D0>, 'image_file_path': 'CarDD_release/CarDD_COCO/train2017/000001.jpg', 'labels': tensor([0., 1., 0., 0., 0., 1.]), 'active_label_names': ['Scratch', 'Tire Flat']}
Validation data example: {'image': <PIL.Image.Image image mode=RGB size=1000x685 at 0x7F7E1CDAAF70>, 'image_file_path': 'CarDD_release/CarDD_COCO/val2017/000013.jpg', 'labels': tensor([0., 0., 1., 1., 0., 1.]), 'active_label_names': ['Crack', 'Glass Shatter', 'Tire Flat']}
Test data example: {'image': <PIL.Image.Image image mode=RGB size=1000x667 at 0x7F7E277864C0>, 'image_file_path': 'CarDD_release/CarDD_COCO/test2017/000012.jpg', 'labels': tensor([0., 0., 0., 0., 0., 1.]), 'active_label_names': ['Tire Flat']}


In [18]:
class CustomDataset(Dataset):
    """Adapter that serves records of an indexable collection as ``(image, labels)`` pairs.

    Every record is looked up by index and must provide the keys ``"image"``
    and ``"labels"``. When a ``transform`` is supplied (and is truthy) it is
    applied to the image only; the labels are passed through untouched.
    """

    def __init__(self, dataset, transform=None):
        # Keep references only; nothing is copied or pre-processed up front.
        self.transform = transform
        self.dataset = dataset

    def __len__(self):
        # One item per record of the wrapped collection.
        return len(self.dataset)

    def __getitem__(self, idx):
        record = self.dataset[idx]
        picture, target = record["image"], record["labels"]

        # Without a transform the stored image is handed back unchanged.
        if not self.transform:
            return picture, target

        return self.transform(picture), target


In [19]:
# Preprocessing shared by the train, validation and test splits.
transform = transforms.Compose(
    [
        # Every image is resized to a fixed 224x224 input for VGG11.
        transforms.Resize(size=(224, 224)),
        # PIL image -> tensor.
        transforms.ToTensor(),
        # ImageNet per-channel mean / std normalization.
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [20]:
# Wrap each split so that samples come out as (transformed image, labels) pairs.
train_dataset = CustomDataset(dataset=train_data, transform=transform)
val_dataset = CustomDataset(dataset=val_data, transform=transform)
test_dataset = CustomDataset(dataset=test_data, transform=transform)

# Batch the splits in groups of 32; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [7]:
# Build a pretrained VGG11 through timm; the number of outputs is taken from the
# length of the first training sample's label vector.
model = timm.create_model(
    model_name='vgg11.tv_in1k',
    pretrained=True,
    num_classes=len(train_data[0]["labels"]),
)

HBox(children=(FloatProgress(value=0.0, description='model.safetensors', max=531455276.0, style=ProgressStyle(…




In [21]:
# Freeze every parameter currently in the model (the loop covers all of
# model.parameters(), not only the feature extractor). Layers attached to the
# model after this cell are not touched by this loop.
for param in model.parameters():
    param.requires_grad_(False)

In [22]:
 # Show the module tree; a later cell reads and replaces `model.head.fc`.
 print(model)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
 

In [24]:
# Determine how many features feed the classification head.
# On a freshly created model `model.head.fc` is expected to be a single nn.Linear
# (indexing it with [0] fails); once this cell has run it is the nn.Sequential built
# below. Handling both keeps the cell working under Restart & Run All and on re-runs.
head_fc = model.head.fc
if isinstance(head_fc, nn.Linear):
    in_features = head_fc.in_features
else:
    # First Linear layer found inside the existing head, if any.
    in_features = next(
        (layer.in_features for layer in head_fc.modules() if isinstance(layer, nn.Linear)),
        None,
    )
    if in_features is None:
        raise TypeError("model.head.fc contains no nn.Linear layer to read in_features from")

# Modify the classifier (head) part of the model.
# The freshly constructed layers are trainable by default, unlike parameters that an
# earlier cell froze.
model.head.fc = nn.Sequential(
    nn.Linear(in_features, 256),  # New hidden layer with 256 units
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, len(train_data[0]["labels"])),  # One output per label
    nn.Sigmoid()  # Per-label probabilities in [0, 1] for multi-label classification
)


In [25]:
# Use the GPU when CUDA is available, otherwise the CPU, and place the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)


In [26]:
# Loss: binary cross-entropy applied to the model's per-label outputs.
criterion = nn.BCELoss()
# Optimizer: Adam over the model's parameters with a 1e-4 learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
# Scheduler: multiply the learning rate by 0.1 every 7 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)

In [27]:
# Training Loop
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=10):
    """Train ``model`` for ``num_epochs`` epochs, printing the loss after each phase.

    Every epoch performs one optimisation pass over ``train_loader``, steps
    ``scheduler`` once, and then scores ``val_loader`` with gradients disabled.
    Each batch loss is multiplied by the batch size and the running sum is
    divided by ``len(loader.dataset)`` (a per-sample mean when ``criterion``
    uses mean reduction).

    Batches are moved to the device that holds the model's parameters, so the
    function does not rely on a notebook-level ``device`` variable being defined.

    Args:
        model: ``nn.Module`` to train. It is left in eval mode once at least
            one epoch has run.
        train_loader: Iterable of ``(images, labels)`` batches exposing ``.dataset``.
        val_loader: Iterable of ``(images, labels)`` batches exposing ``.dataset``.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer updating the model's trainable parameters.
        scheduler: Learning-rate scheduler; stepped once per epoch.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported through ``print`` and the tqdm progress bar.
    """
    # Follow the model's own placement; a parameter-free model falls back to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        # Training Phase
        model.train()
        train_loss = 0.0
        for images, labels in tqdm(train_loader):
            images, labels = images.to(run_device), labels.to(run_device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch does not skew the average.
            train_loss += loss.item() * images.size(0)

        scheduler.step()
        train_loss /= len(train_loader.dataset)
        print(f"Training Loss: {train_loss:.4f}")

        # Validation Phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(run_device), labels.to(run_device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)

        val_loss /= len(val_loader.dataset)
        print(f"Validation Loss: {val_loss:.4f}")

In [28]:
# Kick off the 10-epoch run; per-epoch losses are printed as training proceeds.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=10,
)


  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 1/10


100%|██████████| 88/88 [20:24<00:00, 13.92s/it]


Training Loss: 0.4369


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.3401
Epoch 2/10


100%|██████████| 88/88 [16:22<00:00, 11.16s/it]


Training Loss: 0.3256


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.3044
Epoch 3/10


100%|██████████| 88/88 [16:54<00:00, 11.53s/it]


Training Loss: 0.2934


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.2904
Epoch 4/10


100%|██████████| 88/88 [20:02<00:00, 13.66s/it]


Training Loss: 0.2740


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.2836
Epoch 5/10


100%|██████████| 88/88 [21:15<00:00, 14.50s/it]


Training Loss: 0.2601


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.2778
Epoch 6/10


100%|██████████| 88/88 [16:44<00:00, 11.42s/it]


Training Loss: 0.2458


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.2738
Epoch 7/10


100%|██████████| 88/88 [16:29<00:00, 11.25s/it]


Training Loss: 0.2398


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.2716
Epoch 8/10


100%|██████████| 88/88 [16:43<00:00, 11.40s/it]


Training Loss: 0.2282


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.2706
Epoch 9/10


100%|██████████| 88/88 [16:30<00:00, 11.26s/it]


Training Loss: 0.2250


  0%|          | 0/88 [00:00<?, ?it/s]

Validation Loss: 0.2701
Epoch 10/10


100%|██████████| 88/88 [16:25<00:00, 11.20s/it]


Training Loss: 0.2265
Validation Loss: 0.2695


In [29]:
# Evaluation
def evaluate_model(model, test_loader, threshold=0.5, label_names=None):
    """Print a multi-label classification report and per-label confusion matrices.

    The model is switched to eval mode and run over ``test_loader`` with
    gradients disabled. An output strictly greater than ``threshold`` counts as
    a positive prediction for that label.

    Inputs are moved to the device that holds the model's parameters, so the
    function does not rely on a notebook-level ``device`` variable being defined.

    Args:
        model: ``nn.Module`` whose outputs are compared against ``threshold``.
        test_loader: Iterable yielding ``(images, labels)`` batches.
        threshold: Decision cut-off applied to each output (default ``0.5``).
        label_names: Optional list of display names, one per label column,
            forwarded to ``classification_report`` as ``target_names``. When
            omitted the report rows are numbered.

    Returns:
        None. The report and confusion matrices are printed.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()

    # Follow the model's own placement; a parameter-free model falls back to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    pred_batches = []
    label_batches = []
    with torch.no_grad():
        for images, labels in test_loader:
            outputs = model(images.to(run_device))
            preds = (outputs > threshold).float()  # Thresholding for multi-label
            pred_batches.append(preds.cpu().numpy())
            # Labels are only compared on the host, so they never need to visit the device.
            label_batches.append(labels.cpu().numpy())

    if not pred_batches:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    all_preds = np.vstack(pred_batches)
    all_labels = np.vstack(label_batches)

    print("Classification Report:")
    print(classification_report(all_labels, all_preds, target_names=label_names, zero_division=0))

    print("Confusion Matrix:")
    print(multilabel_confusion_matrix(all_labels, all_preds))

# Score the trained model on the held-out test split and print the metrics.
evaluate_model(model=model, test_loader=test_loader)

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.82      0.76       157
           1       0.70      0.77      0.73       183
           2       0.47      0.19      0.27        48
           3       0.98      0.86      0.92        71
           4       0.67      0.49      0.57        65
           5       0.96      0.74      0.84        31

   micro avg       0.74      0.71      0.72       555
   macro avg       0.75      0.64      0.68       555
weighted avg       0.73      0.71      0.71       555
 samples avg       0.76      0.76      0.73       555

Confusion Matrix:
[[[164  53]
  [ 28 129]]

 [[132  59]
  [ 43 140]]

 [[316  10]
  [ 39   9]]

 [[302   1]
  [ 10  61]]

 [[293  16]
  [ 33  32]]

 [[342   1]
  [  8  23]]]
