# Data Preparation - Indonesian License Plate Dataset

This notebook prepares the dataset for YOLOv8 training by cleaning, validating, and augmenting the data.

## Tasks:
- [ ] Clean and validate annotations
- [ ] Standardize image formats and sizes
- [ ] Split dataset (train/val/test) - already done by Roboflow
- [ ] Create YOLO configuration files
- [ ] Implement data augmentation strategy
- [ ] Generate final `data.yaml` for training
- [ ] Verify final dataset structure

## 1. Import Libraries and Setup

In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
import yaml
import json
import shutil
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirm the setup cell ran and show the directory relative paths resolve from
print(
    "Libraries imported successfully",
    f"Working directory: {os.getcwd()}",
    sep="\n",
)

## 2. Dataset Path Configuration

In [None]:
# Dataset paths (all relative to the notebook's working directory)
BASE_DIR = Path("..")
DATASET_RAW = BASE_DIR / "dataset" / "raw" / "plat-kendaraan"
DATASET_PROCESSED = BASE_DIR / "dataset" / "processed"
DATASET_FINAL = BASE_DIR / "dataset"


def count_files_recursive(directory):
    """Return the number of regular files anywhere below *directory*.

    Directories are never counted (even when their name contains a dot) and
    files without an extension are included.
    """
    return sum(1 for entry in directory.rglob("*") if entry.is_file())


print(f"Raw dataset path: {DATASET_RAW}")
print(f"Processed dataset path: {DATASET_PROCESSED}")
print(f"Final dataset path: {DATASET_FINAL}")

# Verify raw dataset exists (is_dir() so a stray file at that path is not
# mistaken for the dataset and passed to iterdir())
if DATASET_RAW.is_dir():
    print("✅ Raw dataset found")

    # Show current structure; sorted so the listing is stable between runs
    print("\nCurrent dataset structure:")
    for item in sorted(DATASET_RAW.iterdir()):
        if item.is_dir():
            print(f"  📁 {item.name}/ ({count_files_recursive(item)} files)")
        else:
            print(f"  📄 {item.name}")
else:
    print("❌ Raw dataset not found. Please run notebooks 01 and 02 first.")

## 3. Load and Validate Dataset Configuration

In [None]:
# Load original YAML configuration
yaml_file = DATASET_RAW / "data.yaml"
original_config = None

if yaml_file.exists():
    with open(yaml_file, 'r', encoding='utf-8') as f:
        loaded_config = yaml.safe_load(f)

    # safe_load() gives None for an empty document and can also give a list or
    # scalar; only a mapping is usable as a dataset configuration.
    if isinstance(loaded_config, dict):
        original_config = loaded_config

        print("Original dataset configuration:")
        print("=" * 30)
        for key, value in original_config.items():
            print(f"{key}: {value}")
    else:
        print("⚠️  Original YAML config is empty or not a mapping, will create new one")
else:
    print("⚠️  Original YAML config not found, will create new one")

if original_config is None:
    # Create basic config based on directory structure.
    # NOTE(review): the class names below are placeholders used when no usable
    # data.yaml is available - confirm they match the label files before training.
    original_config = {
        'path': str(DATASET_RAW),
        'train': 'train',
        'val': 'valid',
        'test': 'test',
        'names': {0: 'license-plate', 1: 'vehicle'}
    }

## 4. Data Cleaning and Validation

In [None]:
# Image suffixes recognised as dataset images (compared case-insensitively)
_IMAGE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.png', '.bmp'})
# Images with a side shorter than this many pixels are rejected
_MIN_IMAGE_SIDE = 50
# Normalized boxes with a side below this are treated as zero-size
_MIN_BOX_SIDE = 0.001


def _list_image_files(images_path):
    """Return the image files directly inside *images_path*, sorted by path.

    The suffix is compared case-insensitively, so each file is returned exactly
    once regardless of filesystem case rules, and mixed-case suffixes such as
    ``.Jpg`` are included.
    """
    return sorted(
        entry for entry in images_path.iterdir()
        if entry.is_file() and entry.suffix.lower() in _IMAGE_SUFFIXES
    )


def _classify_annotation_line(line):
    """Return None when *line* is a valid YOLO row, else the `issues` key to file it under."""
    parts = line.split()
    if len(parts) != 5:
        return 'invalid_annotations'

    try:
        class_id = int(parts[0])
        x_center, y_center, box_w, box_h = (float(value) for value in parts[1:])
    except ValueError:
        return 'invalid_annotations'

    # Class ids index into the class-name table, so they cannot be negative
    if class_id < 0:
        return 'invalid_annotations'

    # Check bounds (YOLO format should be normalized 0-1); NaN fails every comparison
    if not (0 <= x_center <= 1 and 0 <= y_center <= 1 and
            0 < box_w <= 1 and 0 < box_h <= 1):
        return 'invalid_annotations'

    # Check for extremely small annotations
    if box_w < _MIN_BOX_SIDE or box_h < _MIN_BOX_SIDE:
        return 'zero_size_annotations'

    return None


def validate_and_clean_data(dataset_path):
    """Validate a YOLO-layout dataset and collect the usable image/label pairs.

    For each of the ``train``, ``valid`` and ``test`` splits, the function looks
    at ``<dataset_path>/<split>/images`` and ``<dataset_path>/<split>/labels``.
    Nothing on disk is modified; problems are only reported.

    Args:
        dataset_path: ``pathlib.Path`` of the dataset root directory.

    Returns:
        A ``(valid_files, issues)`` tuple.

        ``valid_files`` maps each split name to ``{'images': [...], 'labels': [...]}``
        holding the paths of images that opened correctly, are at least 50x50
        pixels, and whose label file contains at least one valid annotation row.

        ``issues`` maps an issue category to a list of strings:
        ``corrupt_images`` (unreadable or too-small images), ``missing_labels``
        (images without a label file), ``missing_images`` (label files without
        an image), ``invalid_annotations`` (malformed/out-of-range rows as
        ``"<file>:line <n>"``, or a whole label file that is unreadable or has
        no valid row) and ``zero_size_annotations`` (rows with a near-zero box).
    """
    issues = {
        'corrupt_images': [],
        'missing_labels': [],
        'missing_images': [],
        'invalid_annotations': [],
        'zero_size_annotations': []
    }

    valid_files = {
        'train': {'images': [], 'labels': []},
        'valid': {'images': [], 'labels': []},
        'test': {'images': [], 'labels': []}
    }

    for split in ('train', 'valid', 'test'):
        images_path = dataset_path / split / 'images'
        labels_path = dataset_path / split / 'labels'

        if not images_path.is_dir():
            print(f"⚠️  {split} images directory not found")
            continue

        print(f"Validating {split} split...")

        image_files = _list_image_files(images_path)

        for img_file in image_files:
            label_file = labels_path / f"{img_file.stem}.txt"

            # Check if image is valid
            try:
                with Image.open(img_file) as img:
                    img.verify()

                # Re-open for the size check: the image must not be reused after verify()
                with Image.open(img_file) as img:
                    img_w, img_h = img.size
            except Exception:
                # Deliberately broad: any failure to open/verify marks the image as corrupt
                issues['corrupt_images'].append(str(img_file))
                print(f"  Corrupt image: {img_file.name}")
                continue

            if img_w < _MIN_IMAGE_SIDE or img_h < _MIN_IMAGE_SIDE:
                issues['corrupt_images'].append(str(img_file))
                continue

            # Check if corresponding label exists
            if not label_file.exists():
                issues['missing_labels'].append(str(img_file))
                print(f"  Missing label: {img_file.name}")
                continue

            # Validate annotation file row by row
            valid_annotations = []
            try:
                with open(label_file, 'r', encoding='utf-8') as f:
                    for line_num, line in enumerate(f, 1):
                        line = line.strip()
                        if not line:
                            continue

                        problem = _classify_annotation_line(line)
                        if problem is None:
                            valid_annotations.append(line)
                        else:
                            issues[problem].append(f"{label_file}:line {line_num}")
            except (OSError, UnicodeError) as e:
                issues['invalid_annotations'].append(str(label_file))
                print(f"  Error reading label: {label_file.name} - {e}")
                continue

            # Only include files with valid annotations
            if valid_annotations:
                valid_files[split]['images'].append(img_file)
                valid_files[split]['labels'].append(label_file)
            else:
                issues['invalid_annotations'].append(str(label_file))

        # Report label files that have no image with the same stem in this split
        if labels_path.is_dir():
            image_stems = {img_file.stem for img_file in image_files}
            for label_file in sorted(labels_path.glob('*.txt')):
                if label_file.stem not in image_stems:
                    issues['missing_images'].append(str(label_file))
                    print(f"  Missing image: {label_file.name}")

    return valid_files, issues

# Kick off validation over the raw dataset
print("🧹 Starting data validation and cleaning...")
valid_files, issues = validate_and_clean_data(DATASET_RAW)

# Per-split summary followed by the grand total
print("\n📊 Validation Results:")
print("=" * 30)

pairs_per_split = {
    split_name: len(entry['images']) for split_name, entry in valid_files.items()
}
for split_name, pair_count in pairs_per_split.items():
    print(f"{split_name.upper()}: {pair_count} valid image-label pairs")

total_valid = sum(pairs_per_split.values())
print(f"\nTOTAL VALID: {total_valid} image-label pairs")

# Issue summary: only categories that actually contain entries are listed
total_issues = sum(map(len, issues.values()))
if total_issues == 0:
    print("\n✅ No issues found - dataset is clean!")
else:
    print(f"\n⚠️  Issues found: {total_issues}")
    for issue_type, issue_list in issues.items():
        if not issue_list:
            continue
        print(f"  {issue_type}: {len(issue_list)}")

## 5. Data Augmentation Setup

Following the specifications in CLAUDE.md, the code cell below builds the augmentation pipeline with albumentations:

In [None]:
# Install albumentations if not already installed
try:
    import albumentations as A
    print("✅ Albumentations available")
except ImportError:
    import importlib
    import subprocess
    import sys

    print("📦 Installing albumentations...")
    # Run pip through the interpreter executing this notebook so the package
    # lands in the same environment; this is also plain Python, unlike the
    # IPython-only "!pip" shell escape.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "albumentations"])
    # Make the import system notice the freshly installed package
    importlib.invalidate_caches()
    import albumentations as A

# Define augmentation pipeline according to CLAUDE.md specifications
def create_augmentation_pipeline():
    """Build the albumentations pipeline used for license plate detection.

    Returns:
        An ``A.Compose`` that applies, in order: brightness/contrast jitter,
        motion blur, Gaussian noise, shift/scale/rotate, then low-probability
        fog, rain and sun-flare effects. Bounding boxes are expected in YOLO
        format, with their class ids passed under the ``class_labels`` field.
    """
    # Image-quality variations: lighting, blur from moving vehicles, sensor noise
    image_quality = [
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.4),
        A.MotionBlur(blur_limit=7, p=0.3),
        A.GaussNoise(var_limit=(10.0, 50.0), p=0.2),
    ]

    # Slight geometric changes; pixels exposed at the border are filled with 0
    geometric = [
        A.ShiftScaleRotate(
            shift_limit=0.05,
            scale_limit=0.1,
            rotate_limit=10,
            border_mode=cv2.BORDER_CONSTANT,
            value=0,
            p=0.5,
        ),
    ]

    # Optional weather effects, each applied with low probability
    weather = [
        A.RandomFog(fog_coef_lower=0.1, fog_coef_upper=0.3, p=0.1),
        A.RandomRain(slant_lower=-10, slant_upper=10, p=0.1),
        A.RandomSunFlare(p=0.05),
    ]

    # NOTE(review): albumentations appears to document min_area in pixels rather
    # than as a normalized fraction - confirm that 0.001 is the intended threshold.
    bbox_settings = A.BboxParams(
        format='yolo',
        label_fields=['class_labels'],
        min_area=0.001,
        min_visibility=0.1,
    )

    return A.Compose([*image_quality, *geometric, *weather], bbox_params=bbox_settings)

# Instantiate the pipeline once so later cells can reuse it
augmentation_pipeline = create_augmentation_pipeline()

# Print a heading followed by one summary line per transform group
print(
    "🔄 Augmentation pipeline created with specifications from CLAUDE.md:",
    "  - RandomBrightnessContrast (p=0.4): Different lighting conditions",
    "  - MotionBlur (p=0.3): Plates on moving vehicles",
    "  - GaussNoise (p=0.2): Camera sensor noise",
    "  - ShiftScaleRotate (p=0.5): Various distances and rotations",
    "  - Weather effects (low probability): Fog, rain, sun flare",
    sep="\n",
)

## Summary

This notebook implements data preparation for YOLOv8 training:

### ✅ Completed:
- Data validation and cleaning
- Augmentation pipeline setup (CLAUDE.md specs)
- Dataset structure verification

### 🎯 Next Steps:
Work through the notebook in order:
1. Run the validation cell to check your dataset
2. Run the augmentation cell to set up the pipeline
3. Proceed to Notebook 04 for training

The dataset should now be ready for YOLOv8 training!