# Load data

In [12]:
from utils import CarDDDataset
import torch

# Load the pre-built dataset splits.
# The .pt files presumably hold pickled CarDDDataset objects rather than plain
# tensors (hence the otherwise-unused `CarDDDataset` import, which must stay so
# the class can be resolved while unpickling) -- TODO confirm.
# `weights_only=False` is passed explicitly: newer PyTorch releases default to
# weights-only loading, which rejects arbitrary pickled objects.
# SECURITY: full unpickling can execute arbitrary code; only load files that
# were produced locally / come from a trusted source.
train_dataset = torch.load("train_dataset.pt", weights_only=False)
val_dataset = torch.load("val_dataset.pt", weights_only=False)
test_dataset = torch.load("test_dataset.pt", weights_only=False)

print("Datasets loaded successfully.")

Datasets loaded successfully.


  train_dataset = torch.load("train_dataset.pt")
  val_dataset = torch.load("val_dataset.pt")
  test_dataset = torch.load("test_dataset.pt")


In [13]:
# Peek at one raw sample to confirm the record layout before preprocessing.
train_dataset[0]

{'image': <PIL.Image.Image image mode=RGB size=1000x750>,
 'image_file_path': 'CarDD_release/CarDD_COCO/train2017/000001.jpg',
 'labels': tensor([0., 1., 0., 0., 0., 1.]),
 'active_label_names': ['Scratch', 'Tire Flat']}

# Loading ViT Feature Processor

In [14]:
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [15]:
# Step 1: Define feature extractor
# This checkpoint id is reused further down when the classifier itself is
# loaded, so preprocessing and backbone come from the same source.
model_name_or_path = 'google/vit-base-patch16-224-in21k'
# Image processor for that checkpoint; CustomDataset applies it to each sample.
feature_processor = ViTImageProcessor.from_pretrained(model_name_or_path)

# Transform Dataset

In [16]:
# Step 2: Define Custom Dataset
class CustomDataset(Dataset):
    """Map-style dataset that runs the image processor lazily, one sample at a time.

    Args:
        dataset: Indexable collection whose items are dicts providing at least
            an 'image' entry and a 'labels' entry.
        feature_processor: Callable invoked as
            ``feature_processor(images=..., return_tensors='pt')`` and returning
            a mapping of tensors that carry a leading batch dimension.
    """

    def __init__(self, dataset, feature_processor):
        self.dataset, self.feature_processor = dataset, feature_processor

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processor output for sample ``idx`` plus its 'labels'."""
        record = self.dataset[idx]
        # return_tensors='pt' asks the processor for torch tensors.
        encoded = self.feature_processor(images=record['image'], return_tensors='pt')
        item = {}
        for name, tensor in encoded.items():
            # Only one image was processed, so drop the leading batch dimension.
            item[name] = tensor.squeeze(0)
        item['labels'] = record['labels']
        return item

# Step 3: Wrap every split with the same processor so train/val/test are
# preprocessed identically.
processed_train_dataset, processed_eval_dataset, processed_test_dataset = (
    CustomDataset(split, feature_processor)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [17]:
# Sanity-check one preprocessed sample: the processor output with its batch
# dimension squeezed out, plus the 'labels' entry copied from the raw sample.
processed_train_dataset[0]

{'pixel_values': tensor([[[ 0.9451,  0.9608,  0.9686,  ...,  0.9922,  1.0000,  0.9922],
          [ 0.9294,  0.9451,  0.9686,  ...,  0.9922,  1.0000,  1.0000],
          [ 0.9451,  0.9529,  0.9608,  ...,  1.0000,  0.9922,  0.9922],
          ...,
          [ 0.2784,  0.2314,  0.3098,  ..., -0.4824, -0.5059, -0.5294],
          [ 0.0039,  0.1373,  0.3176,  ..., -0.5294, -0.3882, -0.4667],
          [ 0.2000,  0.1608,  0.2314,  ..., -0.5294, -0.4118, -0.4275]],
 
         [[-0.6941, -0.6235, -0.6000,  ..., -0.2392, -0.2549, -0.2627],
          [-0.6863, -0.6314, -0.5765,  ..., -0.2314, -0.2471, -0.2627],
          [-0.6784, -0.6392, -0.6000,  ..., -0.2549, -0.2392, -0.2549],
          ...,
          [ 0.2863,  0.2314,  0.3255,  ..., -0.5529, -0.5843, -0.6000],
          [ 0.0431,  0.1686,  0.3333,  ..., -0.5843, -0.4980, -0.5529],
          [ 0.2471,  0.2000,  0.2392,  ..., -0.5765, -0.4824, -0.4980]],
 
         [[-0.5451, -0.4980, -0.4510,  ..., -0.2941, -0.2863, -0.2863],
          [-

In [18]:
# Step 4: Define Collate Function
def collate_fn(batch):
    """Stack per-sample tensors into one batch for the Trainer.

    Args:
        batch: Sequence of dicts, each holding a 'pixel_values' tensor and a
            'labels' tensor.

    Returns:
        Dict with 'pixel_values' and 'labels', each stacked along a new
        leading batch dimension.
    """
    images, targets = [], []
    for example in batch:
        images.append(example['pixel_values'])
        targets.append(example['labels'])
    return {'pixel_values': torch.stack(images), 'labels': torch.stack(targets)}

In [19]:
# Step 5: Define Metric

def compute_metrics(p):
    """Exact-match (subset) accuracy for multi-label predictions.

    Args:
        p: Object exposing ``predictions`` (raw logits) and ``label_ids``
            (multi-hot ground truth), both array-like with one row per sample.

    Returns:
        ``{"accuracy": value}`` where value is the fraction of samples whose
        thresholded prediction vector matches the reference on every label.
    """
    threshold = 0.5
    logits = torch.tensor(p.predictions)
    # Sigmoid, not softmax: each label is an independent yes/no decision.
    probabilities = torch.sigmoid(logits).numpy()
    predicted = (probabilities > threshold).astype(int)
    # A sample counts as correct only if *all* of its labels agree.
    row_is_exact = (predicted == p.label_ids).all(axis=1)
    return {"accuracy": row_is_exact.mean()}

In [20]:
# Step 6: Load the ViT Model
# One output logit per entry of the multi-hot label vector.
num_labels = train_dataset[0]['labels'].shape[0]
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    # Targets are multi-hot float vectors, so pin the multi-label objective
    # explicitly instead of relying on it being inferred from the label dtype
    # at the first forward pass. The setting is stored in the model config, so
    # checkpoints reloaded later keep the same behaviour.
    problem_type="multi_label_classification",
    # Placeholder names; integer ids are the form the config stores.
    id2label={i: f"label_{i}" for i in range(num_labels)},
    label2id={f"label_{i}": i for i in range(num_labels)},
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Stage 1 trains the classification head only: switch off gradients for every
# weight of the ViT backbone.
for backbone_weight in model.vit.parameters():
    backbone_weight.requires_grad = False

# List whatever is still trainable so the freeze can be checked by eye.
trainable_params = []
for weight_name, weight in model.named_parameters():
    if weight.requires_grad:
        trainable_params.append(weight_name)
print("Trainable Parameters:", trainable_params)

Trainable Parameters: ['classifier.weight', 'classifier.bias']


In [22]:
# Step 7: Define Training Arguments (stage 1: classifier head only)
training_args = TrainingArguments(
    output_dir="./vit-final-layer-only",  # reloaded from this path for test-set prediction; TensorBoard is also pointed here
    per_device_train_batch_size=32,
    eval_strategy="steps",  # evaluate on a step schedule (see eval_steps) rather than once per epoch
    num_train_epochs=10,
    bf16=True,
    save_steps=100,  # same cadence as eval_steps, so every checkpoint has a matching evaluation
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
    # NOTE(review): no metric_for_best_model is given, so "best" presumably
    # falls back to validation loss rather than accuracy -- confirm.
    load_best_model_at_end=True,
)

In [23]:
# Step 8: Initialize Trainer
# At this point only the classifier head of `model` is trainable (backbone frozen above).

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,  # stacks 'pixel_values' and 'labels' into batch tensors
    compute_metrics=compute_metrics,  # exact-match accuracy reported at each evaluation
    train_dataset=processed_train_dataset,  # Training dataset
    eval_dataset=processed_eval_dataset,  # Validation dataset
    processing_class=feature_processor  # presumably so the processor is saved next to the model -- confirm
)

In [25]:
# Run stage-1 training; the returned object's `.metrics` are logged and saved in the next cell.
train_results = trainer.train()

Step,Training Loss,Validation Loss,Accuracy
100,0.5188,0.524162,0.250617
200,0.4619,0.464617,0.323457
300,0.4356,0.435181,0.346914
400,0.4237,0.417407,0.362963
500,0.4196,0.405659,0.37037
600,0.3962,0.397914,0.375309
700,0.3908,0.392902,0.375309
800,0.376,0.390205,0.381481


In [27]:
# Persist the trained model and the run's bookkeeping.
# No path is passed to save_model(); the later
# from_pretrained("./vit-final-layer-only") call relies on it landing in output_dir.
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)  # prints the "***** train metrics *****" summary
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =         10.0
  total_flos               = 2032381165GF
  train_loss               =       0.4371
  train_runtime            =   1:07:51.46
  train_samples_per_second =        6.916
  train_steps_per_second   =        0.216


In [45]:
# Evaluate on validation dataset
# No dataset is passed, so this uses the eval_dataset given to the Trainer;
# the result includes the 'accuracy' from compute_metrics under an 'eval_' prefix.
eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 0.39020490646362305, 'eval_accuracy': 0.3814814814814815, 'eval_runtime': 93.5819, 'eval_samples_per_second': 8.656, 'eval_steps_per_second': 1.09, 'epoch': 10.0}


In [43]:
from tensorboard import program

# Serve the logs written under the stage-1 output directory from inside the notebook.
logdir = './vit-final-layer-only'
serve_args = ['serve', '--logdir', logdir]

tb = program.TensorBoard()
tb.configure(argv=serve_args)
url = tb.launch()
print(f"TensorBoard is running at {url}")

TensorBoard is running at http://localhost:6006/


# Full Model Fine-Tuning

In [46]:
# Stage 2 fine-tunes everything: turn gradients back on for every weight of the
# ViT backbone (the classifier head was never frozen).
for backbone_weight in model.vit.parameters():
    backbone_weight.requires_grad = True

# List the trainable weights to confirm the backbone is included again.
trainable_params = []
for weight_name, weight in model.named_parameters():
    if weight.requires_grad:
        trainable_params.append(weight_name)
print("Trainable Parameters:", trainable_params)

Trainable Parameters: ['vit.embeddings.cls_token', 'vit.embeddings.position_embeddings', 'vit.embeddings.patch_embeddings.projection.weight', 'vit.embeddings.patch_embeddings.projection.bias', 'vit.encoder.layer.0.attention.attention.query.weight', 'vit.encoder.layer.0.attention.attention.query.bias', 'vit.encoder.layer.0.attention.attention.key.weight', 'vit.encoder.layer.0.attention.attention.key.bias', 'vit.encoder.layer.0.attention.attention.value.weight', 'vit.encoder.layer.0.attention.attention.value.bias', 'vit.encoder.layer.0.attention.output.dense.weight', 'vit.encoder.layer.0.attention.output.dense.bias', 'vit.encoder.layer.0.intermediate.dense.weight', 'vit.encoder.layer.0.intermediate.dense.bias', 'vit.encoder.layer.0.output.dense.weight', 'vit.encoder.layer.0.output.dense.bias', 'vit.encoder.layer.0.layernorm_before.weight', 'vit.encoder.layer.0.layernorm_before.bias', 'vit.encoder.layer.0.layernorm_after.weight', 'vit.encoder.layer.0.layernorm_after.bias', 'vit.encoder.la

In [47]:
# Step 7 (stage 2): Define Training Arguments for full fine-tuning
# Same settings as the head-only stage except for output_dir.
# NOTE(review): in the recorded run the validation loss is lowest at step 300
# and rises afterwards while the training loss keeps falling -- consider a
# lower learning rate, fewer epochs or early stopping for this stage.
training_args = TrainingArguments(
    output_dir="./vit-full",  # reloaded from this path for test-set prediction
    per_device_train_batch_size=32,
    eval_strategy="steps",  # evaluate on a step schedule (see eval_steps) rather than once per epoch
    num_train_epochs=10,
    bf16=True,
    save_steps=100,  # same cadence as eval_steps, so every checkpoint has a matching evaluation
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
    # NOTE(review): no metric_for_best_model is given, so "best" presumably
    # falls back to validation loss rather than accuracy -- confirm.
    load_best_model_at_end=True,
)

In [48]:
# Step 8 (stage 2): Initialize Trainer
# `model` is the same object trained in the head-only stage, now with the
# backbone unfrozen, so this stage starts from the already-trained head.

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,  # stacks 'pixel_values' and 'labels' into batch tensors
    compute_metrics=compute_metrics,  # exact-match accuracy reported at each evaluation
    train_dataset=processed_train_dataset,  # Training dataset
    eval_dataset=processed_eval_dataset,  # Validation dataset
    processing_class=feature_processor  # presumably so the processor is saved next to the model -- confirm
)

In [49]:
# Run stage-2 (full fine-tuning) training; this rebinds `train_results` from the head-only stage.
train_results = trainer.train()

Step,Training Loss,Validation Loss,Accuracy
100,0.1918,0.23659,0.574074
200,0.1079,0.240904,0.575309
300,0.0656,0.22354,0.608642
400,0.0353,0.234156,0.6
500,0.0197,0.239234,0.62963
600,0.0143,0.242853,0.622222
700,0.0117,0.244707,0.624691
800,0.0095,0.249293,0.628395


In [50]:
# Persist the fully fine-tuned model and the run's bookkeeping.
# No path is passed to save_model(); the later from_pretrained("./vit-full")
# call relies on it landing in output_dir.
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)  # prints the "***** train metrics *****" summary
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =         10.0
  total_flos               = 2032381165GF
  train_loss               =       0.0734
  train_runtime            =   3:25:34.90
  train_samples_per_second =        2.283
  train_steps_per_second   =        0.071


# Predict on Test Dataset (Final-Layer-Only Model)

In [64]:
# Load the model
# Reload the stage-1 (head-only) model from its output directory; this rebinds
# `model`, which until now referred to the fully fine-tuned network.
model = ViTForImageClassification.from_pretrained("./vit-final-layer-only")
print("Model loaded successfully.")

Model loaded successfully.


In [65]:
# Re-initialize the trainer
# Inference-only Trainer: no args, datasets or compute_metrics are passed
# (library defaults presumably apply); metrics are computed by hand after predict().
trainer = Trainer(
    model=model,
    data_collator=collate_fn
)

In [68]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report

# Run inference on the held-out test split. `results.predictions` holds the raw
# logits and `results.label_ids` the multi-hot ground truth.
results = trainer.predict(test_dataset=processed_test_dataset)

# Logits -> independent per-class probabilities (multi-label, so sigmoid
# rather than softmax).
predicted_probs = torch.sigmoid(torch.tensor(results.predictions)).numpy()
all_labels = results.label_ids

# Same decision threshold as compute_metrics uses during training.
threshold = 0.5
all_predictions = (predicted_probs > threshold).astype(int)

# Exact match: a sample only counts if every one of its labels is right.
exact_match_accuracy = (all_predictions == all_labels).all(axis=1).mean()
print(f"Exact Match Accuracy: {exact_match_accuracy:.4f}")

# One 2x2 matrix per class, laid out as [[TN, FP], [FN, TP]].
cm = multilabel_confusion_matrix(all_labels, all_predictions)
print("Multilabel Confusion Matrix:")
print(cm)

# Some classes may receive no positive predictions at all, which makes their
# precision undefined. zero_division=0 reports 0.0 for those entries (the same
# value as the default) without emitting an UndefinedMetricWarning per average.
print(classification_report(all_labels, all_predictions, zero_division=0))

Exact Match Accuracy: 0.3984
Multilabel Confusion Matrix:
[[[174  43]
  [ 42 115]]

 [[ 88 103]
  [ 14 169]]

 [[326   0]
  [ 48   0]]

 [[303   0]
  [ 17  54]]

 [[309   0]
  [ 64   1]]

 [[343   0]
  [ 31   0]]]
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       157
           1       0.62      0.92      0.74       183
           2       0.00      0.00      0.00        48
           3       1.00      0.76      0.86        71
           4       1.00      0.02      0.03        65
           5       0.00      0.00      0.00        31

   micro avg       0.70      0.61      0.65       555
   macro avg       0.56      0.41      0.39       555
weighted avg       0.66      0.61      0.57       555
 samples avg       0.69      0.65      0.65       555



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Predict on Test Dataset (Full Model)

In [69]:
# Load the model
# Reload the stage-2 (fully fine-tuned) model from its output directory,
# replacing the head-only model evaluated above.
model = ViTForImageClassification.from_pretrained("./vit-full")
print("Model loaded successfully.")

Model loaded successfully.


In [71]:
# Re-initialize the trainer
# Inference-only Trainer: no args, datasets or compute_metrics are passed
# (library defaults presumably apply); metrics are computed by hand after predict().
trainer = Trainer(
    model=model,
    data_collator=collate_fn
)

In [73]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report

# Run inference on the held-out test split. `results.predictions` holds the raw
# logits and `results.label_ids` the multi-hot ground truth.
results = trainer.predict(test_dataset=processed_test_dataset)

# Logits -> independent per-class probabilities (multi-label, so sigmoid
# rather than softmax).
predicted_probs = torch.sigmoid(torch.tensor(results.predictions)).numpy()
all_labels = results.label_ids

# Same decision threshold as compute_metrics uses during training.
threshold = 0.5
all_predictions = (predicted_probs > threshold).astype(int)

# Exact match: a sample only counts if every one of its labels is right.
exact_match_accuracy = (all_predictions == all_labels).all(axis=1).mean()
print(f"Exact Match Accuracy: {exact_match_accuracy:.4f}")

# One 2x2 matrix per class, laid out as [[TN, FP], [FN, TP]].
cm = multilabel_confusion_matrix(all_labels, all_predictions)
print("Multilabel Confusion Matrix:")
print(cm)

# zero_division=0 keeps the report free of UndefinedMetricWarning if a class
# ever ends up with no positive predictions; such entries are reported as 0.0,
# the same value the default setting would print.
print(classification_report(all_labels, all_predictions, zero_division=0))

Exact Match Accuracy: 0.5963
Multilabel Confusion Matrix:
[[[171  46]
  [ 12 145]]

 [[154  37]
  [ 19 164]]

 [[310  16]
  [ 20  28]]

 [[301   2]
  [  6  65]]

 [[299  10]
  [ 23  42]]

 [[341   2]
  [  5  26]]]
              precision    recall  f1-score   support

           0       0.76      0.92      0.83       157
           1       0.82      0.90      0.85       183
           2       0.64      0.58      0.61        48
           3       0.97      0.92      0.94        71
           4       0.81      0.65      0.72        65
           5       0.93      0.84      0.88        31

   micro avg       0.81      0.85      0.83       555
   macro avg       0.82      0.80      0.81       555
weighted avg       0.81      0.85      0.82       555
 samples avg       0.86      0.89      0.85       555

