In [1]:
# Install necessary libraries
!pip install rfdetr supervision albumentations opencv-python torch torchvision --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.6/131.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m89.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import json
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
from PIL import Image
from rfdetr import RFDETRBase

# ---------------------
# Dataset configuration
# ---------------------
# Root of the Kaggle input dataset. The code below reads images from its
# "positive" and "negative" sub-folders and box labels from "labels.csv".
dataset_dir = "/kaggle/input/mobilephoneusagedatasetiitr"
positive_dir, negative_dir, labels_csv = (
    os.path.join(dataset_dir, entry)
    for entry in ("positive", "negative", "labels.csv")
)

# ---------------------
# COCO conversion setup
# ---------------------
# Destination folder for the COCO-formatted copy of the dataset.
output_dir = "/kaggle/working/coco_dataset"
os.makedirs(output_dir, exist_ok=True)

# ---------------------
# Data preparation
# ---------------------
# Load the label table; one row per labelled bounding box.
labels_df = pd.read_csv(filepath_or_buffer=labels_csv)

def convert_value(val):
    """Coerce numeric scalars to built-in ``int``/``float``.

    Values that are ``np.integer`` or ``int`` are returned as ``int``; values
    that are ``np.floating`` or ``float`` are returned as ``float``; anything
    else is returned unchanged. The results are used to fill the COCO
    dictionary that is later written with ``json.dump``.
    """
    if isinstance(val, (np.integer, int)):
        return int(val)
    if isinstance(val, (np.floating, float)):
        return float(val)
    return val

# Fail fast when the label file lacks a column the conversion relies on
required_columns = ['filename', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax', 'class']
available_columns = set(labels_df.columns)
missing_cols = [name for name in required_columns if name not in available_columns]
if len(missing_cols) > 0:
    raise ValueError(f"Missing required columns in CSV: {missing_cols}")

# Show which columns were actually found
print("Detected columns in CSV:")
print(list(labels_df.columns))

# Create image registry:
#   "positive" maps filename -> list of COCO boxes [x, y, width, height]
#   "negative" is a list of filenames that get no annotations
image_registry = {
    "positive": {},
    "negative": []
}

# Process positive images: every CSV row contributes one box to its filename
for _, row in labels_df.iterrows():
    filename = row['filename']

    # Convert corner format (xmin, ymin, xmax, ymax) to COCO [x, y, width, height]
    x = convert_value(row['xmin'])
    y = convert_value(row['ymin'])
    w = convert_value(row['xmax'] - row['xmin'])
    h = convert_value(row['ymax'] - row['ymin'])

    image_registry["positive"].setdefault(filename, []).append([x, y, w, h])

# Add negative images.
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# without this the seeded train/valid/test split is not reproducible between
# runs or machines. Extensions are matched case-insensitively so that files
# such as "IMG_001.JPG" are not silently dropped.
negative_images = sorted(
    f for f in os.listdir(negative_dir)
    if f.lower().endswith((".jpg", ".png", ".jpeg"))
)
image_registry["negative"] = negative_images

# ===================
# Dataset Splitting
# ===================
def _split_70_10_20(samples):
    """Return ``(train, val, test)`` lists covering about 70/10/20 % of ``samples``.

    20 % is held out for test first; 12.5 % of the remaining 80 % (10 % of the
    total) then becomes validation. Both steps use ``random_state=42``.
    """
    remainder, test_part = train_test_split(samples, test_size=0.2, random_state=42)
    train_part, val_part = train_test_split(remainder, test_size=0.125, random_state=42)
    return train_part, val_part, test_part


# Positive and negative images are split separately with the same ratios,
# so every split keeps the same class balance.
positive_samples = list(image_registry["positive"].keys())
negative_samples = image_registry["negative"]

train_pos, val_pos, test_pos = _split_70_10_20(positive_samples)
train_neg, val_neg, test_neg = _split_70_10_20(negative_samples)

# ===================
# COCO JSON Creation
# ===================
def create_coco_split(split_name, pos_files, neg_files):
    """Write one dataset split as copied images plus a COCO annotation file.

    Every listed image is copied into ``<output_dir>/<split_name>/`` and a
    ``_annotations.coco.json`` file describing them is written to the same
    folder. Relies on the module-level ``output_dir``, ``positive_dir``,
    ``negative_dir``, ``labels_df``, ``image_registry`` and ``convert_value``.

    Args:
        split_name: Name of the sub-folder to create (e.g. ``"train"``).
        pos_files: Filenames located in ``positive_dir``. Each needs a row in
            ``labels_df`` (for width/height) and an entry in
            ``image_registry["positive"]`` (for its boxes).
        neg_files: Filenames located in ``negative_dir``. They are registered
            as images without annotations; their size is read from the file.

    Raises:
        KeyError: If a name in ``pos_files`` has no row in ``labels_df`` or no
            entry in ``image_registry["positive"]``.
    """
    split_dir = os.path.join(output_dir, split_name)
    os.makedirs(split_dir, exist_ok=True)

    coco_data = {
        "info": {"description": "Mobile Phone Detection Dataset"},
        # The "id" lets the `"license": 1` reference on each image resolve.
        "licenses": [{"id": 1, "name": "MIT"}],
        "categories": [{
            "id": 0,
            "name": "mobile phone",
            "supercategory": "object"
        }],
        "images": [],
        "annotations": []
    }

    # Build the filename -> (width, height) lookup once instead of filtering
    # the whole label table for every image. As before, the first row seen
    # for a filename supplies its dimensions.
    first_rows = labels_df.drop_duplicates(subset="filename", keep="first")
    dims_by_file = {
        name: (convert_value(img_w), convert_value(img_h))
        for name, img_w, img_h in zip(
            first_rows["filename"], first_rows["width"], first_rows["height"]
        )
    }

    annotation_id = 1

    # Positive images: ids 1..len(pos_files), one annotation per stored box
    for image_id, filename in enumerate(pos_files, start=1):
        shutil.copy(
            os.path.join(positive_dir, filename),
            os.path.join(split_dir, filename),
        )

        width, height = dims_by_file[filename]
        coco_data["images"].append({
            "id": image_id,
            "file_name": filename,
            "width": width,
            "height": height,
            "license": 1,
            "date_captured": "2024-01-01"
        })

        for bbox in image_registry["positive"][filename]:
            coco_data["annotations"].append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": 0,
                "bbox": bbox,
                # bbox is [x, y, width, height], so area = width * height
                "area": convert_value(bbox[2] * bbox[3]),
                "iscrowd": 0
            })
            annotation_id += 1

    # Negative images: ids continue after the positives; no annotations
    for image_id, filename in enumerate(neg_files, start=len(pos_files) + 1):
        src_path = os.path.join(negative_dir, filename)
        shutil.copy(src_path, os.path.join(split_dir, filename))

        # No CSV row exists for these files, so read the size from the image
        with Image.open(src_path) as img:
            width, height = img.size

        coco_data["images"].append({
            "id": image_id,
            "file_name": filename,
            "width": convert_value(width),
            "height": convert_value(height),
            "license": 1,
            "date_captured": "2024-01-01"
        })

    # Save COCO JSON with the default encoder (all values are built-in types)
    with open(os.path.join(split_dir, "_annotations.coco.json"), "w") as f:
        json.dump(coco_data, f, indent=2)

# Write the three splits to disk: (folder name, positive files, negative files)
for _split_name, _split_pos, _split_neg in (
    ("train", train_pos, train_neg),
    ("valid", val_pos, val_neg),
    ("test", test_pos, test_neg),
):
    create_coco_split(_split_name, _split_pos, _split_neg)

# ===================
# Model Training
# ===================
# Install required packages
!pip install rfdetr[metrics] --quiet

# Training configuration
model = RFDETRBase(resolution=448)  # Higher resolution for better accuracy

model.train(
    dataset_dir=output_dir,
    epochs=100,
    batch_size=8,
    grad_accum_steps=2,
    lr=1e-4,
    output_dir="/kaggle/working/output",
    early_stopping=True,
    early_stopping_patience=10,
    tensorboard=True,
    wandb=False
)

print("Training completed successfully!")

Detected columns in CSV:
['filename', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax', 'class']


rf-detr-base.pth: 100%|██████████| 355M/355M [00:04<00:00, 88.9MiB/s]


config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

Loading pretrain weights
TensorBoard logging initialized. To monitor logs, use 'tensorboard --logdir /kaggle/working/output' and open http://localhost:6006/ in browser.
Not using distributed mode
git:
  sha: N/A, status: clean, branch: N/A

Namespace(num_classes=1, grad_accum_steps=2, amp=True, lr=0.0001, lr_encoder=0.00015, batch_size=8, weight_decay=0.0001, epochs=100, lr_drop=100, clip_max_norm=0.1, lr_vit_layer_decay=0.8, lr_component_decay=0.7, do_benchmark=False, dropout=0, drop_path=0.0, drop_mode='standard', drop_schedule='constant', cutoff_epoch=0, pretrained_encoder=None, pretrain_weights='rf-detr-base.pth', pretrain_exclude_keys=None, pretrain_keys_modify_to_load=None, pretrained_distiller=None, encoder='dinov2_windowed_small', vit_encoder_num_layers=12, window_block_indexes=None, position_embedding='sine', out_feature_indexes=[2, 5, 8, 11], freeze_encoder=False, layer_norm=True, rms_norm=False, backbone_lora=False, force_no_pretrain=False, dec_layers=3, dim_feedforward=2048



Epoch: [0]  [ 0/38]  eta: 0:07:00  lr: 0.000100  class_error: 0.00  loss: 14.4324 (14.4324)  loss_ce: 0.7487 (0.7487)  loss_bbox: 1.3304 (1.3304)  loss_giou: 1.3228 (1.3228)  loss_ce_0: 0.6511 (0.6511)  loss_bbox_0: 1.6978 (1.6978)  loss_giou_0: 1.4370 (1.4370)  loss_ce_1: 0.7419 (0.7419)  loss_bbox_1: 1.3253 (1.3253)  loss_giou_1: 1.3647 (1.3647)  loss_ce_enc: 0.7100 (0.7100)  loss_bbox_enc: 1.6965 (1.6965)  loss_giou_enc: 1.4065 (1.4065)  loss_ce_unscaled: 0.7487 (0.7487)  class_error_unscaled: 0.0000 (0.0000)  loss_bbox_unscaled: 0.2661 (0.2661)  loss_giou_unscaled: 0.6614 (0.6614)  cardinality_error_unscaled: 0.5000 (0.5000)  loss_ce_0_unscaled: 0.6511 (0.6511)  loss_bbox_0_unscaled: 0.3396 (0.3396)  loss_giou_0_unscaled: 0.7185 (0.7185)  cardinality_error_0_unscaled: 0.5000 (0.5000)  loss_ce_1_unscaled: 0.7419 (0.7419)  loss_bbox_1_unscaled: 0.2651 (0.2651)  loss_giou_1_unscaled: 0.6823 (0.6823)  cardinality_error_1_unscaled: 0.5000 (0.5000)  loss_ce_enc_unscaled: 0.7100 (0.7100) 