In [1]:
!pip install -q torch torchvision opencv-python tqdm
!pip install ultralytics opencv-python pandas tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [67]:
from pathlib import Path

# Dataset root (already extracted by Kaggle under /kaggle/input)
ROOT = Path("/kaggle/input/mechanical-parts")

# For every split, report how many .jpg files sit directly in its folder and
# whether the COCO annotation file is present.
for split in ("train", "valid", "test"):
    p = ROOT / split
    jpg_count = sum(1 for _ in p.glob("*.jpg"))
    has_annotations = (p / "_annotations.coco.json").exists()
    print(f"{split}: {jpg_count} images, annotations: {has_annotations}")


train: 1799 images, annotations: True
valid: 225 images, annotations: True
test: 225 images, annotations: True


In [68]:
!pip -q install torch torchvision tqdm pycocotools

In [74]:
# ==== Block 1: imports + core config ====

import json, math, random
from pathlib import Path
from collections import defaultdict

import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import torch
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# Per-split folders derived from ROOT (ROOT must already exist from an earlier cell)
TRAIN_DIR = ROOT / "train"
VALID_DIR = ROOT / "valid"
TEST_DIR  = ROOT / "test"

# Training config
K_FOLDS = 4      # number of cross-validation folds the training image ids are split into
EPOCHS  = 10     # epochs trained per fold
BATCH   = 4      # batch size used for both the train and validation DataLoaders
LR      = 0.005  # SGD learning rate
MOM     = 0.9    # SGD momentum
WD      = 5e-4   # SGD weight decay
STEP    = 5      # StepLR step_size (the scheduler is stepped once per epoch)
GAMMA   = 0.1    # StepLR gamma (learning-rate multiplier applied every STEP epochs)

# Prefer CUDA when available; the GPU count is printed alongside the chosen device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device, "| GPUs:", torch.cuda.device_count())

def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch RNGs with the same value.

    Args:
        seed: Integer seed applied to every generator (default 42).

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)


Device: cuda | GPUs: 2


In [107]:
# ==== FINAL FIX: Redefine dataset class (CocoLiteV2) ====

from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from pathlib import Path
import json

class CocoLiteV2(Dataset):
    """Minimal COCO dataset reader with guaranteed 3-channel RGB images.

    Reads a single COCO-format annotation file from ``split_dir`` and serves
    ``(image_tensor, target)`` pairs. ``target`` is a dict with:

    * ``"boxes"``    – float32 tensor of shape (N, 4) in ``[x1, y1, x2, y2]`` form
      (converted from COCO's ``[x, y, w, h]``),
    * ``"labels"``   – int64 tensor of shape (N,) holding the raw ``category_id``,
    * ``"image_id"`` – int64 tensor with the single COCO image id.

    Args:
        split_dir: Folder containing the images and the annotation file
            (a ``Path`` or anything ``Path()`` accepts).
        anno_name: File name of the COCO JSON inside ``split_dir``.
        allowed_img_ids: Optional iterable of image ids; when given, only those
            images and their annotations are kept.
    """

    def __init__(self, split_dir: Path, anno_name="_annotations.coco.json", allowed_img_ids=None):
        # Coerce so that plain string paths work with the `/` joins below.
        self.split_dir = Path(split_dir)
        with open(self.split_dir / anno_name, "r") as f:
            coco = json.load(f)

        self.categories = coco["categories"]
        all_images = coco["images"]
        all_anns   = coco["annotations"]

        # Optional subset
        if allowed_img_ids is None:
            images = all_images
        else:
            allowed = set(allowed_img_ids)
            images = [im for im in all_images if im["id"] in allowed]

        # Group annotations by image id, keeping only those of retained images
        self.anns_by_img = defaultdict(list)
        kept = {im["id"] for im in images}
        for a in all_anns:
            if a["image_id"] in kept:
                self.anns_by_img[a["image_id"]].append(a)

        self.images = images
        self.id_to_file = {im["id"]: im["file_name"] for im in images}

    def __len__(self):
        """Number of images served by this dataset (after optional subsetting)."""
        return len(self.images)

    @staticmethod
    def _build_target(anns, img_id):
        """Convert COCO annotation dicts into a detection target dict.

        Boxes with non-positive width or height are dropped together with their
        label: torchvision's Faster R-CNN rejects such degenerate boxes when
        computing training losses. An empty ``anns`` yields empty (0, 4) / (0,)
        tensors with the expected dtypes.
        """
        boxes, labels = [], []
        for a in anns:
            x1, y1, w, h = a["bbox"]
            if w <= 0 or h <= 0:
                continue  # degenerate box: skip it rather than crash the training step
            boxes.append([x1, y1, x1 + w, y1 + h])
            labels.append(a["category_id"])

        return {
            "boxes": torch.as_tensor(boxes, dtype=torch.float32) if boxes else torch.zeros((0, 4), dtype=torch.float32),
            "labels": torch.as_tensor(labels, dtype=torch.int64) if labels else torch.zeros((0,), dtype=torch.int64),
            "image_id": torch.tensor([img_id], dtype=torch.int64),
        }

    def __getitem__(self, idx):
        """Return ``(image_tensor, target)`` for the ``idx``-th retained image.

        If the image file cannot be opened (``OSError``), a message is printed
        and a dummy 3×1×1 zero image with an empty target is returned instead
        of raising, so one unreadable file does not abort an epoch.
        """
        im_meta = self.images[idx]
        img_id = im_meta["id"]
        img_path = self.split_dir / self.id_to_file[img_id]

        try:
            # Context manager closes the underlying file handle right after decoding;
            # convert() returns an independent, fully loaded RGB image.
            with Image.open(img_path) as raw:
                img = raw.convert("RGB")
        except OSError as e:
            print(f"[SKIP] OSError on {img_path.name[:120]}...  ({e})")
            # return a dummy 1×1 image + dummy (empty) target
            x = torch.zeros((3, 1, 1), dtype=torch.float32)
            return x, self._build_target([], img_id)

        # NOTE(review): pil_to_tensor_rgb is defined outside this class (not in this cell);
        # presumably it returns a 3×H×W float tensor — confirm against its definition.
        x = pil_to_tensor_rgb(img)

        target = self._build_target(self.anns_by_img.get(img_id, []), img_id)
        return x, target


def collate_fn(batch):
    """Transpose a list of samples into one tuple per field.

    For ``(image, target)`` samples this yields ``(images, targets)``, each a
    tuple, without stacking anything. An empty batch gives an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)


In [108]:
# Smoke test: build the dataset over the whole training split and look at the first three samples
ds_test = CocoLiteV2(TRAIN_DIR)
print("Dataset length:", len(ds_test))

for i in range(3):
    x, y = ds_test[i]
    box_count = len(y['boxes'])
    print(f"Sample {i}: shape={x.shape}, boxes={box_count}")


Dataset length: 1800
Sample 0: shape=torch.Size([3, 640, 640]), boxes=4
Sample 1: shape=torch.Size([3, 640, 640]), boxes=2
Sample 2: shape=torch.Size([3, 640, 640]), boxes=2


In [109]:
# ==== Block 3: read COCO, build K folds, compute num_classes ====

import json, math, random

# Load the training-split COCO file; image ids and category list both come from it
train_anno_path = ROOT / "train" / "_annotations.coco.json"
with open(train_anno_path, "r") as f:
    coco_train = json.load(f)

train_image_ids = [im["id"] for im in coco_train["images"]]
category_names = [c["name"] for c in coco_train["categories"]]
num_classes = 1 + len(coco_train["categories"])  # one extra slot for background

print(f"Train images: {len(train_image_ids)}")
print(f"Categories (incl. background): {num_classes}")
print("Category names:", category_names)

# Deterministic K-fold split: fixed-seed shuffle, then K_FOLDS consecutive slices
# of ceil(n / K_FOLDS) ids each (the last slice may be shorter).
random.seed(42)
random.shuffle(train_image_ids)
fold_size = math.ceil(len(train_image_ids) / K_FOLDS)
fold_starts = [i * fold_size for i in range(K_FOLDS)]
folds = [set(train_image_ids[start:start + fold_size]) for start in fold_starts]

# Sanity: sizes
for k, fset in enumerate(folds):
    print(f"Fold {k}: {len(fset)} images")


Train images: 1800
Categories (incl. background): 6
Category names: ['Bearing', 'Bearing', 'Bolt', 'Gear', 'Nut']
Fold 0: 450 images
Fold 1: 450 images
Fold 2: 450 images
Fold 3: 450 images


In [110]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

def make_model(num_classes: int):
    """
    Build a Faster R-CNN (ResNet50-FPN) detector with a fresh box-predictor head.

    The network is created with ``weights="DEFAULT"`` (the COCO-pretrained
    weights, per the original author's note), its box predictor is swapped for
    a new ``FastRCNNPredictor`` sized for ``num_classes`` outputs, and the
    result is moved to the global ``device``.

    Args:
        num_classes: Number of output classes, background included.

    Returns:
        The model, already placed on ``device``.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

    # The new head must accept the same feature width the old head consumed.
    head_in_features = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_in_features, num_classes)

    detector = detector.to(device)
    print(f"Model ready on {device} with {num_classes} classes")
    return detector


In [111]:
import warnings
import torch
from torch.amp import autocast, GradScaler

def force_three_channels(img: torch.Tensor) -> torch.Tensor:
    """
    Ensures every image tensor is 3×H×W for Faster R-CNN.

    Channel handling:
      * 3 channels  -> returned unchanged (same tensor object).
      * 1 channel   -> that channel repeated three times.
      * 2 channels  -> the first channel repeated three times (second is dropped).
      * 4+ channels -> the first three channels (a view of the input).

    Args:
        img: Tensor laid out as C×H×W.

    Returns:
        A tensor with exactly three channels and the same H and W.

    Raises:
        RuntimeError: If ``img`` is not 3-dimensional, or has zero channels
            (there is nothing to replicate, so no 3-channel image can be built).
    """
    if img.ndim != 3:
        raise RuntimeError(f"Expected CHW tensor, got shape={tuple(img.shape)}")

    c = img.shape[0]
    if c == 3:
        return img
    if c == 0:
        # Previously this fell into a "fallback" that returned a 0×H×W tensor,
        # silently violating the 3-channel guarantee.
        raise RuntimeError(f"Expected at least one channel, got shape={tuple(img.shape)}")
    if c >= 4:
        return img[:3, :, :]
    # c is 1 or 2: replicate the first channel across R, G and B.
    return img[:1].repeat(3, 1, 1)

# Suppress one specific warning; `message` is treated by the warnings module as a
# regular expression matched against the beginning of the warning text.
_SCALAR_GATHER_WARNING = "Was asked to gather along dimension 0, but all input tensors were scalars"
warnings.filterwarnings("ignore", message=_SCALAR_GATHER_WARNING)

from torch.amp import autocast, GradScaler

USE_AMP = True  # mixed-precision switch: passed as `enabled=` to GradScaler and autocast in the train/eval functions

def _print_candidate_filenames(loader, batch_idx, batch_len):
    """Best-effort debug aid: print file names that likely made up batch ``batch_idx``.

    Dataset positions are guessed as ``batch_idx * batch_size + offset``, which is
    only accurate for an unshuffled loader. Any failure while mapping is reported
    with a message instead of being raised.
    """
    try:
        ds = loader.dataset
        # A Subset-like wrapper exposes both the wrapped dataset and its index list.
        is_subset = hasattr(ds, 'dataset') and hasattr(ds, 'indices')
        base_ds = ds.dataset if is_subset else ds
        base_indices = ds.indices if is_subset else range(len(ds))

        first = batch_idx * loader.batch_size
        fns = []
        for offset in range(batch_len):
            guess = first + offset
            if hasattr(base_ds, "images") and guess < len(base_indices):
                meta = base_ds.images[base_indices[guess]]
                fns.append(meta.get("file_name", "unknown"))
        print("[DEBUG] Candidate filenames in this batch:", fns)
    except Exception as e:
        print("[DEBUG] Could not map filenames:", e)


def train_one_epoch(model, loader, optimizer):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Each batch is coerced to 3-channel images, moved to ``device``, and pushed
    through the model under autocast; the summed loss dict is back-propagated
    through a GradScaler created fresh for this epoch.

    Args:
        model: Detection model that returns a dict of losses for ``(images, targets)``.
        loader: Iterable yielding ``(images, targets)`` batches.
        optimizer: Optimizer updated once per batch.

    Returns:
        Sum of per-batch losses divided by the number of batches
        (``0.0`` when the loader yields nothing).

    Raises:
        RuntimeError: If an image is still not 3×H×W right before the forward pass.
    """
    model.train()
    scaler = GradScaler(device="cuda", enabled=USE_AMP)

    loss_sum = 0.0
    steps = 0  # batches completed so far == 0-based index of the current batch

    for images, targets in loader:
        # Force 3ch + move to the target device
        images = [force_three_channels(t).to(device, non_blocking=True) for t in images]
        targets = [
            {name: value.to(device, non_blocking=True) for name, value in tgt.items()}
            for tgt in targets
        ]

        # DEBUG guard: every image must be 3xHxW at this point
        offenders = [
            (pos, tuple(t.shape))
            for pos, t in enumerate(images)
            if not (t.ndim == 3 and t.shape[0] == 3)
        ]
        if offenders:
            print(f"[DEBUG] Bad image shapes in batch {steps}: {offenders}")
            _print_candidate_filenames(loader, steps, len(images))
            raise RuntimeError("Found non-RGB image just before model forward")

        optimizer.zero_grad(set_to_none=True)
        with autocast(device_type="cuda", enabled=USE_AMP):
            losses = model(images, targets)
            loss = sum(losses.values())

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loss_sum += float(loss.item())
        steps += 1

    return loss_sum / max(steps, 1)

@torch.inference_mode()
def eval_loss(model, loader):
    """
    Compute validation loss.

    TorchVision detectors return a loss dict only in train() mode,
    so we temporarily set model.train() while gradients remain disabled
    (the whole function runs under ``torch.inference_mode``).

    Args:
        model: Detection model that returns a dict of losses for ``(images, targets)``.
        loader: Iterable yielding ``(images, targets)`` batches.

    Returns:
        Mean of the per-batch summed losses (``0.0`` when the loader is empty).

    The model's original train/eval mode is restored on exit, including when the
    loader or the forward pass raises.
    """
    was_training = model.training
    model.train()

    total_loss = 0.0
    num_batches = 0

    try:
        for images, targets in loader:
            images  = [force_three_channels(im).to(device, non_blocking=True) for im in images]
            targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets]

            with autocast(device_type="cuda", enabled=USE_AMP):
                loss_dict = model(images, targets)
                loss = sum(loss_dict.values()).item()

            total_loss += loss
            num_batches += 1
    finally:
        # Without this, an exception mid-loop would leave an eval-mode model stuck in train mode.
        if not was_training:
            model.eval()

    return total_loss / max(num_batches, 1)


In [112]:
def scan_loader_channels(loader, max_batches=5):
    """Print the image shapes of the first batches and stop at the first non-3-channel one.

    Args:
        loader: Iterable yielding ``(images, targets)`` batches; each image needs a ``.shape``.
        max_batches: Scan stops after this many batches (at least one batch is
            always inspected if the loader yields any).

    Returns:
        None. Results are reported via ``print`` only.
    """
    print("Scanning first", max_batches, "batches for channel issues...")
    batch_no = 0
    for images, _targets in loader:
        shapes = [tuple(t.shape) for t in images]
        print(f"Batch {batch_no}: shapes={shapes}")

        offenders = [s for s in shapes if not (len(s) == 3 and s[0] == 3)]
        if offenders:
            print("-> Found non-3ch tensors in batch", batch_no, ":", offenders)
            return

        batch_no += 1
        if batch_no >= max_batches:
            break
    print("No channel issues detected in first", max_batches, "batches.")

# NOTE(review): within the visible notebook, `train_loader` is only assigned inside the
# K-fold training cell further down, so on a fresh kernel this call raises NameError
# unless that cell has already been run (it then scans the most recent fold's loader).
scan_loader_channels(train_loader, max_batches=5)


Scanning first 5 batches for channel issues...
Batch 0: shapes=[(3, 640, 640), (3, 640, 640), (3, 640, 640), (3, 640, 640)]
Batch 1: shapes=[(3, 640, 640), (3, 640, 640), (3, 640, 640), (3, 640, 640)]
Batch 2: shapes=[(3, 640, 640), (3, 640, 640), (3, 640, 640), (3, 640, 640)]
Batch 3: shapes=[(3, 640, 640), (3, 640, 640), (3, 640, 640), (3, 640, 640)]
Batch 4: shapes=[(3, 640, 640), (3, 640, 640), (3, 640, 640), (3, 640, 640)]
No channel issues detected in first 5 batches.


In [113]:
# ==== Block 6: K-fold training loop, data loaders, and checkpointing ====

from torch.utils.data import DataLoader
from pathlib import Path



# One (fold index, best validation loss, checkpoint path) tuple is appended per fold
fold_ckpts = []

for k in range(K_FOLDS):
    # Fold k is held out for validation; the union of all other folds is used for training
    val_ids   = folds[k]
    train_ids = set().union(*[folds[i] for i in range(K_FOLDS) if i != k])

    # Datasets for this fold
    # Both read the same TRAIN_DIR annotation file, restricted to disjoint id sets
    train_ds = CocoLiteV2(TRAIN_DIR, allowed_img_ids=train_ids)
    val_ds   = CocoLiteV2(TRAIN_DIR, allowed_img_ids=val_ids)

    # DataLoaders (custom collate_fn keeps images/targets as tuples instead of stacking)
    train_loader = DataLoader(
        train_ds,
        batch_size=BATCH,
        shuffle=True,
        num_workers=2,
        pin_memory=True,
        collate_fn=collate_fn
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=BATCH,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        collate_fn=collate_fn
    )

    # Model: a fresh model is built for every fold
    model = make_model(num_classes)
    # Multi-GPU wrapping is currently disabled:
    # if torch.cuda.device_count() > 1:
    #     model = torch.nn.DataParallel(model)

    # Optimizer / Scheduler (only parameters that require gradients are optimized)
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=LR, momentum=MOM, weight_decay=WD)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP, gamma=GAMMA)

    # Lowest validation loss seen so far in this fold, and where its checkpoint is written
    best = float("inf")
    ckpt_path = Path(f"/kaggle/working/fasterrcnn_fold{k}.pt")

    print(f"\n==== Fold {k+1}/{K_FOLDS} | Train={len(train_ds)} | Val={len(val_ds)} ====")
    for epoch in range(1, EPOCHS + 1):
        tr = train_one_epoch(model, train_loader, optimizer)
        vl = eval_loss(model, val_loader)
        scheduler.step()  # learning-rate schedule advances once per epoch
        # Note: this line prints the 0-based fold index, unlike the 1-based banner above
        print(f"Fold {k} | Epoch {epoch:02d} | train {tr:.4f} | val {vl:.4f}")

        # Checkpoint only when validation loss improves; the file is overwritten in place
        if vl < best:
            best = vl
            state = {
                # Unwrap DataParallel (if ever enabled) so keys carry no "module." prefix
                "model_state_dict": model.module.state_dict()
                    if isinstance(model, torch.nn.DataParallel) else model.state_dict(),
                "num_classes": num_classes,
                "class_names": [c["name"] for c in coco_train["categories"]],
                "fold": k,
                "val_loss": vl
            }
            torch.save(state, ckpt_path)
            print(f"Saved best → {ckpt_path.name} (val_loss={vl:.4f})")

    fold_ckpts.append((k, best, ckpt_path))

import shutil

# Recap: one line per fold with its best validation loss and checkpoint location
print("\nFold checkpoints summary:")
for k, vl, p in fold_ckpts:
    print(f"  Fold {k}: val_loss={vl:.4f}  →  {p}")


def _fold_val_loss(entry):
    """Sort key: the validation loss stored at position 1 of a fold_ckpts tuple."""
    return entry[1]


# Optional: pick the fold with the lowest validation loss and copy its
# checkpoint to a stable, fold-independent filename
winner = min(fold_ckpts, key=_fold_val_loss)
best_fold, best_val, best_path = winner
print(f"\nBest overall: fold {best_fold} with val_loss={best_val:.4f} at {best_path}")

final_path = Path("/kaggle/working") / "fasterrcnn_best.pt"
shutil.copy2(best_path, final_path)
print(f"Unified best model saved to: {final_path}")

Model ready on cuda with 6 classes

==== Fold 1/4 | Train=1350 | Val=450 ====
[SKIP] OSError on skf-svenska-kullagerfabriken-swedish-ball-bearing-factory-founded-in-gothenburg-sweden-1907-skf-is-the-worlds-largest-be...  ([Errno 36] File name too long: '/kaggle/input/mechanical-parts/train/skf-svenska-kullagerfabriken-swedish-ball-bearing-factory-founded-in-gothenburg-sweden-1907-skf-is-the-worlds-largest-bearing-manufacturer-and-employs-44000-people-in-108-manufacturing-units-it-has-the-largest-industr_jpg.rf.865932a81555550cff6d73c2bb23941f.jpg')
Fold 0 | Epoch 01 | train 0.4631 | val 0.3223
Saved best → fasterrcnn_fold0.pt (val_loss=0.3223)
[SKIP] OSError on skf-svenska-kullagerfabriken-swedish-ball-bearing-factory-founded-in-gothenburg-sweden-1907-skf-is-the-worlds-largest-be...  ([Errno 36] File name too long: '/kaggle/input/mechanical-parts/train/skf-svenska-kullagerfabriken-swedish-ball-bearing-factory-founded-in-gothenburg-sweden-1907-skf-is-the-worlds-largest-bearing-manufactu

In [114]:
import json, zipfile, os
from pathlib import Path
import torch

# Checkpoint produced by the K-fold training cell (copy of the best fold's file)
best_ckpt = Path("/kaggle/working/fasterrcnn_best.pt")
assert best_ckpt.exists(), "Best checkpoint not found."

# Extract class names from the checkpoint (stored there during training)
state = torch.load(best_ckpt, map_location="cpu")
class_names = state.get("class_names", None)
if class_names is None:
    raise RuntimeError("class_names not found in checkpoint. Re-save with class_names in state dict.")

# Save class names and a small README
with open("/kaggle/working/class_names.json", "w") as f:
    json.dump(class_names, f, indent=2)

# README fix: the checkpoint is a dict (weights live under 'model_state_dict'), the model
# factory in this notebook is make_model, and the head size must come from the stored
# 'num_classes' (which includes background) -- len(class_names) is one short and would
# make a strict load_state_dict fail on shape mismatch.
readme = """Faster R-CNN (ResNet50-FPN) on Mechanical Parts
Files:
- fasterrcnn_best.pt         : Checkpoint dict with keys 'model_state_dict', 'num_classes', 'class_names', 'fold', 'val_loss'.
- class_names.json           : Class names in the order of the COCO 'categories' list used during training.
Load:
    state = torch.load('fasterrcnn_best.pt', map_location='cpu')
    model = make_model(num_classes=state['num_classes'])  # num_classes already includes background
    model.load_state_dict(state['model_state_dict'], strict=True)
"""
with open("/kaggle/working/README.txt", "w") as f:
    f.write(readme)

# Zip everything for easy download
zip_path = "/kaggle/working/model_artifacts.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
    z.write(best_ckpt, arcname="fasterrcnn_best.pt")
    z.write("/kaggle/working/class_names.json", arcname="class_names.json")
    z.write("/kaggle/working/README.txt", arcname="README.txt")

print("Zipped:", zip_path)


Zipped: /kaggle/working/model_artifacts.zip


In [115]:
import json, zipfile, os
from pathlib import Path
import torch

# NOTE: this cell repeats the preceding artifact-export cell; running either one is sufficient.

# Checkpoint produced by the K-fold training cell (copy of the best fold's file)
best_ckpt = Path("/kaggle/working/fasterrcnn_best.pt")
assert best_ckpt.exists(), "Best checkpoint not found."

# Extract class names from the checkpoint (stored there during training)
state = torch.load(best_ckpt, map_location="cpu")
class_names = state.get("class_names", None)
if class_names is None:
    raise RuntimeError("class_names not found in checkpoint. Re-save with class_names in state dict.")

# Save class names and a small README
with open("/kaggle/working/class_names.json", "w") as f:
    json.dump(class_names, f, indent=2)

# README fix: the checkpoint is a dict (weights live under 'model_state_dict'), the model
# factory in this notebook is make_model, and the head size must come from the stored
# 'num_classes' (which includes background) -- len(class_names) is one short and would
# make a strict load_state_dict fail on shape mismatch.
readme = """Faster R-CNN (ResNet50-FPN) on Mechanical Parts
Files:
- fasterrcnn_best.pt         : Checkpoint dict with keys 'model_state_dict', 'num_classes', 'class_names', 'fold', 'val_loss'.
- class_names.json           : Class names in the order of the COCO 'categories' list used during training.
Load:
    state = torch.load('fasterrcnn_best.pt', map_location='cpu')
    model = make_model(num_classes=state['num_classes'])  # num_classes already includes background
    model.load_state_dict(state['model_state_dict'], strict=True)
"""
with open("/kaggle/working/README.txt", "w") as f:
    f.write(readme)

# Zip everything for easy download
zip_path = "/kaggle/working/model_artifacts.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
    z.write(best_ckpt, arcname="fasterrcnn_best.pt")
    z.write("/kaggle/working/class_names.json", arcname="class_names.json")
    z.write("/kaggle/working/README.txt", arcname="README.txt")

print("Zipped:", zip_path)


Zipped: /kaggle/working/model_artifacts.zip
