# Data Exploration - Indonesian License Plate Dataset

A quick overview and verification of the dataset before training.

## Tasks:
- [ ] Dataset structure verification
- [ ] Basic statistics (image count, label count)
- [ ] Data quality checks
- [ ] Sample visualization
- [ ] Training readiness confirmation

## 1. Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml
from collections import Counter
import cv2
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Project directories, all derived from the parent of the current working
# directory (the original note: notebooks/ -> license-plate-training/).
ROOT_DIR = Path("..").resolve()
DATASET_PATH = ROOT_DIR.joinpath("dataset", "plat-kendaraan")
MODELS_DIR = ROOT_DIR.joinpath("models")
RESULTS_DIR = ROOT_DIR.joinpath("results")

# Plot appearance: matplotlib style sheet plus seaborn colour palette.
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

# Confirmation banner so the cell output shows where the notebook is running.
print("Libraries imported successfully")
print(f"Working directory: {Path.cwd()}")

## 2. Dataset Path Configuration

In [None]:
# Lower-case alias of DATASET_PATH; the analysis cells below read this name.
dataset_path = DATASET_PATH

# Report whether the dataset directory is present before any analysis runs.
if not dataset_path.exists():
    print(f"❌ Dataset not found at: {dataset_path}")
    print("Please run notebook 01 first to download the dataset")
else:
    print(f"✅ Dataset found: {dataset_path}")
    print(f"📁 Full path: {dataset_path.absolute()}")

## 3. Dataset Structure Analysis

In [4]:
def _count_files(directory, pattern="*.*", recursive=False):
    """Count regular files in *directory* whose name matches *pattern*.

    Args:
        directory: ``pathlib.Path`` of the directory to scan.
        pattern: Glob pattern applied to entry names.
        recursive: Scan sub-directories as well when true.

    Returns:
        Number of matching regular files; 0 when *directory* does not exist.
    """
    if not directory.exists():
        return 0
    matches = directory.rglob(pattern) if recursive else directory.glob(pattern)
    # Directories whose names contain a dot also match "*.*"; skip them so the
    # number reported really is a file count.
    return sum(1 for path in matches if path.is_file())


if dataset_path.exists():
    print("📂 Dataset Structure:")
    print("=" * 30)

    # List the top-level entries; for directories show image/label counts
    # when the images/ or labels/ sub-directories hold files, otherwise the
    # recursive file count.
    for item in sorted(dataset_path.iterdir()):
        if not item.is_dir():
            print(f"  📄 {item.name}")
            continue

        total_files = _count_files(item, recursive=True)
        images_count = _count_files(item / "images")
        labels_count = _count_files(item / "labels")

        print(f"  📁 {item.name}/")
        if images_count > 0 or labels_count > 0:
            print(f"      🖼️  images: {images_count}")
            print(f"      🏷️  labels: {labels_count}")
        else:
            print(f"      📄 files: {total_files}")

    # Check for data.yaml and dump its top-level keys.
    yaml_file = dataset_path / "data.yaml"
    if yaml_file.exists():
        print("\n✅ Configuration file found: data.yaml")

        try:
            # Explicit encoding: the platform default is not always UTF-8.
            with open(yaml_file, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            # Only I/O, decoding and YAML syntax problems are expected here;
            # anything else is a bug and should surface.
            print(f"⚠️ Could not read YAML: {e}")
        else:
            if isinstance(config, dict):
                print("\n📋 Dataset Configuration:")
                for key, value in config.items():
                    print(f"  {key}: {value}")
            else:
                # safe_load returns None for an empty file and may return a
                # list or scalar; none of those has key/value pairs to show.
                print(
                    "⚠️ Could not read YAML: expected a mapping at top level, "
                    f"got {type(config).__name__}"
                )
    else:
        print("\n❌ Configuration file not found: data.yaml")
else:
    print("Cannot analyze structure - dataset not found")

Cannot analyze structure - dataset not found


## 4. Detailed Split Analysis

In [5]:
# Image extensions counted as dataset images (compared case-insensitively).
_IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')


def _list_images(images_dir):
    """Return the image files located directly inside *images_dir*.

    Extension case is ignored so that files such as ``photo.JPG`` are counted
    on case-sensitive filesystems too.

    Args:
        images_dir: ``pathlib.Path`` of a split's ``images`` directory.

    Returns:
        List of paths to regular files with a known image suffix; an empty
        list when *images_dir* does not exist.
    """
    if not images_dir.exists():
        return []
    return [
        path for path in images_dir.iterdir()
        if path.is_file() and path.suffix.lower() in _IMAGE_SUFFIXES
    ]


if dataset_path.exists():
    splits = ['train', 'valid', 'test']
    total_images = 0
    total_labels = 0

    print("📊 Dataset Split Analysis:")
    print("=" * 40)

    # Per-split summary: {'images': int, 'labels': int, 'match': bool}
    split_stats = {}

    for split in splits:
        split_path = dataset_path / split

        if not split_path.exists():
            print(f"{split.upper():>6}: ❌ Directory not found")
            split_stats[split] = {'images': 0, 'labels': 0, 'match': False}
            continue

        images_dir = split_path / 'images'
        labels_dir = split_path / 'labels'

        image_files = _list_images(images_dir)
        label_files = list(labels_dir.glob('*.txt')) if labels_dir.exists() else []

        image_count = len(image_files)
        label_count = len(label_files)

        # Equal counts alone do not prove the files pair up: compare the
        # file stems so every image has a same-named label and vice versa.
        image_stems = {path.stem for path in image_files}
        label_stems = {path.stem for path in label_files}
        is_match = image_count == label_count and image_stems == label_stems

        split_stats[split] = {
            'images': image_count,
            'labels': label_count,
            'match': is_match
        }

        total_images += image_count
        total_labels += label_count

        match_symbol = "✅" if is_match else "⚠️"
        print(f"{split.upper():>6}: {image_count:>6} images | {label_count:>6} labels | {match_symbol}")
        if not is_match:
            # Point at the size of the problem so it can be tracked down.
            missing_labels = len(image_stems - label_stems)
            orphan_labels = len(label_stems - image_stems)
            print(f"        ↳ {missing_labels} images without label | {orphan_labels} labels without image")

    print("=" * 40)
    overall_match = "✅" if total_images == total_labels else "⚠️"
    print(f"{'TOTAL':>6}: {total_images:>6} images | {total_labels:>6} labels | {overall_match}")

    # Share of all images held by each split (skipped when there are none,
    # which also avoids dividing by zero).
    if total_images > 0:
        print("\n📊 Split Distribution:")
        for split, stats in split_stats.items():
            percentage = (stats['images'] / total_images) * 100
            print(f"  {split.capitalize():>5}: {percentage:5.1f}%")

    # Training readiness check: every entry must be True to proceed.
    print("\n🎯 Training Readiness Check:")
    checks = {
        "Dataset exists": dataset_path.exists(),
        "YAML config found": (dataset_path / "data.yaml").exists(),
        "Train split exists": split_stats.get('train', {}).get('images', 0) > 0,
        "Valid split exists": split_stats.get('valid', {}).get('images', 0) > 0,
        "Images-labels match": all(stats['match'] for stats in split_stats.values()),
        "Sufficient data (>100 images)": total_images > 100
    }

    for check, passed in checks.items():
        symbol = "✅" if passed else "❌"
        print(f"  {symbol} {check}")
    all_ready = all(checks.values())

    print("\n" + "=" * 50)
    if all_ready:
        print("🚀 READY FOR TRAINING! Proceed to notebook 04")
    else:
        print("⚠️  TRAINING NOT READY - Fix issues above first")
    print("=" * 50)

else:
    print("Cannot perform analysis - dataset not found")

Cannot perform analysis - dataset not found


## Summary

This notebook provides a quick overview of the Indonesian license plate dataset:

### ✅ Completed:
- Dataset structure verification
- Split analysis (train/valid/test)
- Data quality checks
- Training readiness confirmation

### 🎯 Next Steps:
If all checks pass:
1. **Ready for Training**: Proceed to `04_model_training.ipynb`
2. **Training Parameters**: Use standard YOLO settings
3. **Expected Performance**: Target mAP@0.5 > 0.85

If issues are found:
1. **Fix dataset issues**: Re-run `01_setup_and_dataset.ipynb`
2. **Check file integrity**: Verify all images/labels are accessible
3. **Resolve path issues**: Ensure all paths are correct