<a href="https://colab.research.google.com/github/ridazaneb/IndoFashionCLIP/blob/main/finalsubsetfinetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# subsetfinetune_final.py
# ---------------------------------------------
# End-to-end Colab pipeline for fine-tuning CLIP on a South Asian
# fashion dataset.
# ---------------------------------------------

In [None]:
# 1) Install all required libraries (run once per session)
!pip install -q torch torchvision transformers accelerate pandas matplotlib scikit-learn


In [None]:
# 2) Imports and random seed setup for reproducibility
import random
import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import (
    CLIPProcessor,
    CLIPVisionModel,
    get_cosine_schedule_with_warmup
)
from torch.amp import autocast
from torch.cuda.amp import GradScaler
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from google.colab import drive

# One global seed, pushed into every RNG the pipeline touches, so runs are repeatable
SEED = 45
for _seed_fn in (
    random.seed,                 # Python stdlib RNG
    np.random.seed,              # NumPy legacy global RNG
    torch.manual_seed,           # PyTorch default generator
    torch.cuda.manual_seed_all,  # PyTorch CUDA generators
):
    _seed_fn(SEED)

In [None]:
# 3) Mount Google Drive to access data and save models
#    The mount point '/content/drive' is the prefix the path constants build on.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 4) File-system layout (one place to edit)
BASE_DIR = '/content/drive/MyDrive/IndoFashion'

# Sub-folders of BASE_DIR, in order:
#   data   -> JSON metadata
#   images -> train/val/test image folders
#   models -> saved models and label mappings
DATA_DIR, IMAGES_ROOT, MODELS_DIR = (
    os.path.join(BASE_DIR, sub_dir) for sub_dir in ('data', 'images', 'models')
)

# Create the output folder up front so later saves cannot fail on a missing directory
os.makedirs(MODELS_DIR, exist_ok=True)


In [None]:
# Hyperparameters
SUBSET_PER_CLASS = 1000   # cap on training images sampled from each class
EPOCHS = 12               # upper bound on training epochs
BATCH_SIZE = 32           # images per mini-batch
LR = 3e-5                 # optimizer learning rate
PATIENCE = 3              # epochs without improvement tolerated before early stopping
WARMUP_RATIO = 0.1        # share of all steps spent in LR warm-up

# Run on the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [None]:
# 5) Function to load newline-delimited JSON metadata into a DataFrame
#    Each JSON line must contain either 'image_path' or 'image_url',
#    and 'class_label' or 'label'.
def load_jsonl_split(path: str, split: str) -> pd.DataFrame:
    """Load newline-delimited JSON metadata for one split into a DataFrame.

    Args:
        path:  Path to the JSONL file (one JSON object per line).
        split: Split folder name ('train', 'val', 'test'); it is prepended to
               each image basename to form the relative image path.

    Returns:
        DataFrame with columns 'image_path' (``<split>/<basename>``) and
        'class_label'. The columns are present even when the file is empty.

    Raises:
        ValueError: if a record has neither 'image_path' nor 'image_url', or
            has no usable 'class_label' / 'label' value.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            obj = json.loads(line)
            # Determine the image file basename
            if 'image_path' in obj:
                basename = os.path.basename(obj['image_path'])
            elif 'image_url' in obj:
                basename = os.path.basename(obj['image_url'])
            else:
                raise ValueError("JSON needs 'image_path' or 'image_url'.")
            # Prefer 'class_label', fall back to 'label'. Only None / '' count as
            # missing, so a legitimate falsy label such as 0 is kept.
            label = next(
                (obj[key] for key in ('class_label', 'label')
                 if obj.get(key) not in (None, '')),
                None,
            )
            if label is None:
                raise ValueError("JSON needs 'class_label' or 'label'.")
            # Store the relative path and label
            records.append({
                'image_path': os.path.join(split, basename),
                'class_label': label
            })
    # Fix the column set explicitly so an empty file still yields the expected schema
    return pd.DataFrame(records, columns=['image_path', 'class_label'])

# Read the metadata of every split; the file is '<split>_data.json' inside DATA_DIR
# and the split name doubles as the image sub-folder.
train_df, val_df, test_df = (
    load_jsonl_split(os.path.join(DATA_DIR, f'{split_name}_data.json'), split_name)
    for split_name in ('train', 'val', 'test')
)


In [None]:
# 6) Balanced training subset: at most SUBSET_PER_CLASS images from each label.
#    Classes with fewer rows contribute everything they have.
per_class_samples = [
    class_rows.sample(n=min(len(class_rows), SUBSET_PER_CLASS), random_state=SEED)
    for _, class_rows in train_df.groupby('class_label')
]
small_train_df = pd.concat(per_class_samples).reset_index(drop=True)

n_images  = len(small_train_df)
n_classes = small_train_df['class_label'].nunique()
print(f"Training subset: {n_images} images across {n_classes} classes")


Training subset: 15000 images across 15 classes


In [None]:
# 7) Create mapping from class label string to integer ID and back
#    IDs follow the sorted order of the labels present in the training subset.
labels   = sorted(small_train_df['class_label'].unique())
label2id = {lbl: idx for idx, lbl in enumerate(labels)}
id2label = {idx: lbl for lbl, idx in label2id.items()}

# Attach integer IDs to every split. A label that never occurs in the training
# subset would map to NaN and only blow up later inside a DataLoader worker
# (int(NaN)), so fail here with a message that names the offending labels.
for _split_name, _split_df in (('train', small_train_df), ('val', val_df), ('test', test_df)):
    _ids = _split_df['class_label'].map(label2id)
    _missing = _ids.isna()
    if _missing.any():
        _unknown = sorted(_split_df.loc[_missing, 'class_label'].astype(str).unique())
        raise ValueError(
            f"{_split_name} split contains labels absent from the training subset: {_unknown}"
        )
    # No NaN left, so the column can be stored as a true integer dtype
    _split_df['label_id'] = _ids.astype(int)


In [None]:
# 8) Image preprocessing: random augmentation for training, a fixed crop for val/test.
#    The CLIP processor is loaded here and used by the dataset for tensor conversion.
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

# Training-time augmentation steps
_train_steps = [
    transforms.RandomResizedCrop(224),        # random crop, rescaled to 224x224
    transforms.RandomHorizontalFlip(p=0.5),   # mirror half of the images
    transforms.ColorJitter(                   # mild photometric jitter
        brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1
    ),
]
train_transforms = transforms.Compose(_train_steps)

# Deterministic steps shared by validation and test
_eval_steps = [
    transforms.Resize(256),       # shorter side scaled to 256, aspect ratio kept
    transforms.CenterCrop(224),   # central 224x224 crop to match the CLIP input size
]
val_transforms = transforms.Compose(_eval_steps)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# 9) Custom Dataset class to load images and return CLIP-ready tensors
class IndoFashionSubset(Dataset):
    """Image-classification dataset backed by a metadata DataFrame.

    Each item is ``(pixel_values, label)`` where ``pixel_values`` is the first
    (only) entry of the tensor batch produced by the module-level CLIP
    ``processor`` and ``label`` is the integer from the 'label_id' column.

    Args:
        df:         DataFrame with 'image_path' (relative to ``root_dir``) and
                    'label_id' columns.
        root_dir:   Root images folder.
        transforms: Optional callable applied to the PIL image before the
                    processor runs.
    """
    def __init__(self, df: pd.DataFrame, root_dir: str, transforms=None):
        self.df         = df.reset_index(drop=True)  # private copy with a clean 0..n-1 index
        self.root       = root_dir                   # root images folder
        self.transforms = transforms                 # optional image augmentations
    def __len__(self) -> int:
        return len(self.df)
    def __getitem__(self, idx: int):
        # Fetch row and build full image path
        row = self.df.iloc[idx]
        img_path = os.path.join(self.root, row['image_path'])
        # Open inside a context manager so the file handle is released right
        # away; convert() returns an independent in-memory RGB image.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert('RGB')
        if self.transforms is not None:
            img = self.transforms(img)
        # Use CLIPProcessor to handle normalization & tensor conversion
        pv    = processor(images=img, return_tensors='pt').pixel_values[0]
        label = int(row['label_id'])
        return pv, label

In [None]:
# 10) DataLoaders for the three splits.
#     Only the training loader shuffles; every other setting is shared.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=4, pin_memory=True)

_train_set = IndoFashionSubset(small_train_df, IMAGES_ROOT, train_transforms)
_val_set   = IndoFashionSubset(val_df, IMAGES_ROOT, val_transforms)
_test_set  = IndoFashionSubset(test_df, IMAGES_ROOT, val_transforms)

train_loader = DataLoader(_train_set, shuffle=True, **_loader_kwargs)
val_loader   = DataLoader(_val_set, shuffle=False, **_loader_kwargs)
test_loader  = DataLoader(_test_set, shuffle=False, **_loader_kwargs)

In [None]:
# 11) Define the fine-tuning model: CLIP vision backbone + small linear head
vision_model = CLIPVisionModel.from_pretrained('openai/clip-vit-base-patch32')


class ClipFineTune(nn.Module):
    """CLIP vision encoder followed by a single linear classification layer.

    Args:
        backbone:    Pre-trained CLIP vision model; its ``config.hidden_size``
                     sets the input width of the classifier.
        num_classes: Number of output logits.
    """

    def __init__(self, backbone: CLIPVisionModel, num_classes: int):
        super().__init__()
        embed_dim = backbone.config.hidden_size
        self.backbone = backbone
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, pixel_values):
        """Return class logits for a batch of processor-prepared pixel tensors."""
        # Pooled embedding from the vision transformer
        encoder_out = self.backbone(pixel_values=pixel_values)
        image_embedding = encoder_out.pooler_output
        # Project the embedding onto the fashion categories
        return self.classifier(image_embedding)

model = ClipFineTune(vision_model, len(labels)).to(DEVICE)

# Freeze all but the last TRAINABLE_LAYERS encoder layers (the head stays trainable)
# so only higher-level features adapt.
# Hugging Face CLIP names its transformer blocks '...encoder.layers.<i>.<sub-module>...'
# (plural 'layers'). The previous 'encoder.layer.<i>' pattern matched nothing, which
# silently froze the whole backbone. The trailing dot keeps e.g. 'layers.1.' from
# matching 'layers.10.'.
TRAINABLE_LAYERS = 4
_num_layers = model.backbone.config.num_hidden_layers
_first_trainable = max(0, _num_layers - TRAINABLE_LAYERS)
_trainable_markers = tuple(
    f'encoder.layers.{layer_idx}.' for layer_idx in range(_first_trainable, _num_layers)
)
for name, param in model.backbone.named_parameters():
    param.requires_grad = any(marker in name for marker in _trainable_markers)

# Guard against the naming scheme changing again: at least one backbone tensor must train
if TRAINABLE_LAYERS > 0 and not any(p.requires_grad for p in model.backbone.parameters()):
    raise RuntimeError(
        "No backbone parameters were left trainable; check the encoder layer naming."
    )


In [None]:
# 12) Set up optimizer (only trainable params), LR scheduler, loss function, and scaler
optimizer = optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], lr=LR
)
total_steps  = EPOCHS * len(train_loader)       # one scheduler step per training batch
warmup_steps = int(WARMUP_RATIO * total_steps)  # linear warm-up before the cosine decay
scheduler    = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)
criterion = nn.CrossEntropyLoss()  # cross-entropy for multi-class classification

# Mixed-precision gradient scaler. It is enabled only on CUDA, mirroring the autocast
# setting in the training step, so a CPU run gets a no-op scaler rather than a warning.
# Prefer the device-agnostic constructor where this torch build has it; the
# torch.cuda.amp one is deprecated and is kept only as a fallback for older releases.
_use_amp = (DEVICE == 'cuda')
if hasattr(torch.amp, 'GradScaler'):
    scaler = torch.amp.GradScaler('cuda', enabled=_use_amp)
else:
    scaler = GradScaler(enabled=_use_amp)

  scaler    = GradScaler()           # mixed-precision gradient scaler


In [None]:
# 13) Function to train one epoch (returns average loss & accuracy)
def train_epoch():
    """Run one pass over ``train_loader`` and update the model.

    Uses the module-level ``model``, ``optimizer``, ``scheduler``, ``scaler``,
    ``criterion`` and ``DEVICE``.

    Returns:
        (avg_loss, avg_acc): sample-weighted mean loss and accuracy for the epoch.
    """
    model.train()
    running_loss, running_correct, running_total = 0.0, 0, 0
    # 'targets' rather than 'labels' so the module-level class-name list is not shadowed
    for pixels, targets in train_loader:
        pixels, targets = pixels.to(DEVICE), targets.to(DEVICE)
        optimizer.zero_grad()
        # Mixed-precision forward pass (autocast is a no-op when not on CUDA)
        with autocast(device_type='cuda', enabled=(DEVICE=='cuda')):
            logits = model(pixels)
            loss   = criterion(logits, targets)
        scaler.scale(loss).backward()      # backward on the scaled loss
        scale_before = scaler.get_scale()
        scaler.step(optimizer)             # skipped internally if grads hold inf/NaN
        scaler.update()                    # adjust the scale for the next step
        # The scaler lowers its scale only when it skipped the optimizer step. In that
        # case do not advance the LR schedule either, otherwise the schedule drifts
        # ahead of the real updates and PyTorch warns about the step order.
        if scaler.get_scale() >= scale_before:
            scheduler.step()               # update LR per iteration
        # Track metrics
        batch_size       = targets.size(0)
        running_loss    += loss.item() * batch_size
        preds            = logits.argmax(dim=-1)
        running_correct += (preds == targets).sum().item()
        running_total   += batch_size
    avg_loss = running_loss / running_total
    avg_acc  = running_correct / running_total
    return avg_loss, avg_acc


In [None]:
# 14) Function to evaluate on val or test (also returns detailed reports)
def eval_epoch(loader, split_name='Val'):
    """Evaluate the module-level ``model`` on ``loader`` without gradient tracking.

    Args:
        loader:     Iterable of ``(pixel_values, label_ids)`` batches.
        split_name: Prefix used in the printed summary line.

    Returns:
        (avg_loss, avg_acc, report, cm): sample-weighted mean loss, accuracy,
        the scikit-learn text classification report restricted to the classes
        that occur in the ground truth, and the confusion matrix.
    """
    model.eval()
    eval_loss, eval_correct, eval_total = 0.0, 0, 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        # 'targets' rather than 'labels' so the module-level class-name list is not shadowed
        for pixels, targets in loader:
            pixels, targets = pixels.to(DEVICE), targets.to(DEVICE)
            logits         = model(pixels)
            loss           = criterion(logits, targets)
            eval_loss     += loss.item() * targets.size(0)
            preds          = logits.argmax(dim=-1)
            eval_correct  += (preds == targets).sum().item()
            eval_total    += targets.size(0)
            all_preds.extend(preds.cpu().tolist())
            all_targets.extend(targets.cpu().tolist())
    avg_loss = eval_loss / eval_total
    avg_acc  = eval_correct / eval_total
    print(f"{split_name} Loss: {avg_loss:.4f} | {split_name} Acc: {avg_acc:.4f}")

    # Report on the classes that actually occur in the ground truth.
    present_ids  = sorted(set(all_targets))
    target_names = [id2label[class_id] for class_id in present_ids]

    # Pass labels= explicitly. Without it scikit-learn derives the class list from
    # the union of true and predicted ids, and raises when the model predicts a class
    # missing from this split because target_names then has the wrong length.
    report = classification_report(
        all_targets, all_preds,
        labels=present_ids, target_names=target_names,
        zero_division=0,  # report 0 instead of warning when a denominator is 0
    )
    cm = confusion_matrix(all_targets, all_preds)

    return avg_loss, avg_acc, report, cm

In [None]:
# 15) Main training loop with early stopping and metric logging
def _lowest_f1_classes(report, class_names, k=3):
    """Return the ``k`` (class, F1) pairs with the lowest F1 from a text report.

    The report is the string produced by scikit-learn's classification_report.
    Each data row ends with four numeric columns (precision, recall, f1-score,
    support), so the class name is everything before them; this also handles
    names containing spaces, digits or hyphens. Rows whose name is not in
    ``class_names`` (accuracy / averages / header) are ignored. Classes missing
    from the report get F1 = 0.0.
    """
    known_names = {str(cls) for cls in class_names}
    f1_by_name = {}
    for line in report.splitlines():
        parts = line.split()
        if len(parts) < 5:
            continue  # header, blank or 'accuracy' row
        row_name = ' '.join(parts[:-4])
        if row_name not in known_names:
            continue  # 'macro avg', 'weighted avg', ...
        try:
            f1_by_name[row_name] = float(parts[-2])
        except ValueError:
            print(f"Warning: Could not parse line: {line.strip()}")
    f1_scores = {cls: f1_by_name.get(str(cls), 0.0) for cls in class_names}
    return sorted(f1_scores.items(), key=lambda x: x[1])[:k]


train_losses, val_losses = [], []
train_accs,    val_accs  = [], []
best_val_acc, patience_counter = 0.0, PATIENCE

for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    tr_loss, tr_acc = train_epoch()
    vl_loss, vl_acc, vl_report, vl_cm = eval_epoch(val_loader, 'Val')
    # Save metrics for plotting
    train_losses.append(tr_loss)
    val_losses.append(vl_loss)
    train_accs.append(tr_acc)
    val_accs.append(vl_acc)
    # Show three classes with lowest F1 to monitor weaknesses
    lowest = _lowest_f1_classes(vl_report, labels)
    print("Lowest F1: ", lowest)
    # Early stopping check: checkpoint on improvement, otherwise burn one unit of patience
    if vl_acc > best_val_acc:
        best_val_acc = vl_acc
        torch.save(model.state_dict(), os.path.join(MODELS_DIR, 'best.pt'))
        patience_counter = PATIENCE
    else:
        patience_counter -= 1
        if patience_counter <= 0:
            print("Early stopping triggered.")
            break


Epoch 1/12


In [None]:
# Time taken for subset=1000 with 12 epochs: 24m

In [None]:
# 16) Plot training & validation loss/accuracy curves for visual grading
# The x-axis uses 1-based epoch numbers so the curves line up with the
# "Epoch i/N" lines printed during training.
epochs_run = range(1, len(train_losses) + 1)
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(14, 5))

ax_loss.plot(epochs_run, train_losses, label='Train Loss')
ax_loss.plot(epochs_run, val_losses,   label='Val Loss')
ax_loss.set(title='Loss per Epoch', xlabel='Epoch', ylabel='Loss')
ax_loss.legend()

ax_acc.plot(epochs_run, train_accs, label='Train Acc')
ax_acc.plot(epochs_run, val_accs,   label='Val Acc')
ax_acc.set(title='Accuracy per Epoch', xlabel='Epoch', ylabel='Accuracy')
ax_acc.legend()

plt.show()

In [None]:
# 17) Final evaluation on the test set using the best saved model
print("\nTesting with best saved model:")
# map_location lets a checkpoint written on a GPU runtime load on whatever device
# this session is using (e.g. a CPU-only runtime after a reconnect).
best_state = torch.load(os.path.join(MODELS_DIR, 'best.pt'), map_location=DEVICE)
model.load_state_dict(best_state)
_, test_acc, test_report, test_cm = eval_epoch(test_loader, 'Test')
print("Test Classification Report:\n", test_report)


In [None]:
# Time taken for subset=1000 with 12 epochs: 8m

In [None]:
# 18) Persist the class ID ↔ label mapping next to the model for deployment.
#     JSON object keys are always strings, so the integer IDs are written as strings.
mapping_path = os.path.join(MODELS_DIR, 'id2label.json')
with open(mapping_path, 'w') as f:
    json.dump(id2label, f)

print(f"\nDone. Best Validation Accuracy: {best_val_acc:.4f}")
