In [2]:
import os
import json
from pathlib import Path
from collections import defaultdict

# Announce on stdout that the verification notebook has started running.
print("Dataset Verification Script Starting...")

Dataset Verification Script Starting...


In [3]:
def verify_dataset(raw_root: str) -> dict:
    """
    Verify dataset consistency by checking that every image has corresponding box and entities files.

    The dataset root is expected to hold three sibling folders: ``image/``,
    ``box/`` and ``entities/``. For each image ``<stem>.<ext>`` the function
    looks for ``box/<stem>.txt`` and ``entities/<stem>.txt``.

    Image discovery is case-insensitive on the extension (``.jpg``, ``.jpeg``,
    ``.png``), ignores sub-directories, and is sorted by file name so that the
    returned lists (and any report serialized from them) are reproducible
    regardless of filesystem enumeration order.

    Args:
        raw_root: Path to dataset/raw/train directory

    Returns:
        Dictionary with verification statistics:
            total_images (int): number of image files found
            missing_boxes (list[str]): stems without a box file
            missing_entities (list[str]): stems without an entities file
            complete_triplets (list[str]): stems with both annotation files
            sample_files (list[str]): names of the first 10 images
            complete_count / missing_boxes_count / missing_entities_count (int)
            completion_rate (float): complete_count / total_images, 0 when
                no images were found
    """
    raw_path = Path(raw_root)
    image_dir = raw_path / "image"
    box_dir = raw_path / "box"
    entities_dir = raw_path / "entities"

    # Compare lower-cased suffixes so "X.JPG" and "X.jpg" are both picked up.
    image_suffixes = {".jpg", ".jpeg", ".png"}

    if image_dir.is_dir():
        # Single directory pass; sorting by name makes the output deterministic.
        image_files = sorted(
            (
                entry
                for entry in image_dir.iterdir()
                if entry.is_file() and entry.suffix.lower() in image_suffixes
            ),
            key=lambda entry: entry.name,
        )
    else:
        # A missing image folder is reported as an empty dataset, not an error.
        image_files = []

    stats = {
        "total_images": len(image_files),
        "missing_boxes": [],
        "missing_entities": [],
        "complete_triplets": [],
        # Sample first 10 for display
        "sample_files": [img_file.name for img_file in image_files[:10]],
    }

    print(f"Found {len(image_files)} image files")

    for img_file in image_files:
        stem = img_file.stem

        # is_file() rather than exists(): a directory named "<stem>.txt"
        # is not a usable annotation.
        has_box = (box_dir / f"{stem}.txt").is_file()
        has_entities = (entities_dir / f"{stem}.txt").is_file()

        if not has_box:
            stats["missing_boxes"].append(stem)

        if not has_entities:
            stats["missing_entities"].append(stem)

        if has_box and has_entities:
            stats["complete_triplets"].append(stem)

    # Summary statistics
    stats["complete_count"] = len(stats["complete_triplets"])
    stats["missing_boxes_count"] = len(stats["missing_boxes"])
    stats["missing_entities_count"] = len(stats["missing_entities"])
    # Guard against division by zero when no images were found.
    stats["completion_rate"] = (
        stats["complete_count"] / stats["total_images"] if stats["total_images"] > 0 else 0
    )

    return stats

In [4]:
# Locations of the dataset to check and of the JSON report to produce.
# Edit raw_root if the dataset lives somewhere else relative to this notebook.
raw_root = "../../dataset/raw/train"
output_path = "../../dataset/verify_report.json"

# Ensure the folder that will receive the report exists (no error if it already does).
os.makedirs(Path(output_path).parent, exist_ok=True)

# Scan the dataset; the resulting statistics are consumed by the cells below.
print(f"Verifying dataset at: {raw_root}")
stats = verify_dataset(raw_root)

Verifying dataset at: ../../dataset/raw/train
Found 626 image files


In [5]:
# Render the verification statistics as a plain-text report on stdout.

# Header: blank line, rule, title, rule.
print("", "="*50, "DATASET VERIFICATION SUMMARY", "="*50, sep="\n")

# Headline counters, one per line.
print(
    f"Total Images: {stats['total_images']}",
    f"Complete Triplets: {stats['complete_count']} ({stats['completion_rate']:.1%})",
    f"Missing Box Files: {stats['missing_boxes_count']}",
    f"Missing Entity Files: {stats['missing_entities_count']}",
    sep="\n",
)

# Show up to five offending stems for each kind of gap, only when there are any.
if stats['missing_boxes_count']:
    print(f"\nFirst 5 missing box files: {stats['missing_boxes'][:5]}")

if stats['missing_entities_count']:
    print(f"\nFirst 5 missing entity files: {stats['missing_entities'][:5]}")

# Footer: a few example image names, then a blank line and the closing rule.
print(f"\nSample image files: {stats['sample_files'][:5]}", "", "="*50, sep="\n")


DATASET VERIFICATION SUMMARY
Total Images: 626
Complete Triplets: 626 (100.0%)
Missing Box Files: 0
Missing Entity Files: 0

Sample image files: ['X00016469612.jpg', 'X00016469619.jpg', 'X00016469620.jpg', 'X00016469622.jpg', 'X00016469623.jpg']



In [6]:
# Persist the statistics as pretty-printed UTF-8 JSON (non-ASCII kept verbatim).
with Path(output_path).open('w', encoding='utf-8') as f:
    f.write(json.dumps(stats, indent=2, ensure_ascii=False))

print(f"\nVerification report saved to: {output_path}")
# The dataset counts as READY only when more than 95% of images have both annotation files.
print(f"Dataset is {'INCOMPLETE' if stats['completion_rate'] <= 0.95 else 'READY'} for processing")


Verification report saved to: ../../dataset/verify_report.json
Dataset is READY for processing
