## 🎯 Complete Maya4 Functionality Showcase

This comprehensive example demonstrates all major features of the Maya4 SAR dataloader in a single, well-documented cell. 

**What you'll learn:**
- Data source configuration (local/online)
- Processing level selection (raw → rc → rcmc → az)
- Filtering by year, polarization, stripmap mode, and location
- Patch extraction strategies and ordering
- Normalization and transformations
- Complex vs real-valued data handling
- Positional encoding for spatial awareness
- Advanced features: concatenation, block patterns, balanced sampling

In [None]:
import sys
import os
from pathlib import Path
import functools
import numpy as np

# Import Maya4 core components
from maya4 import (
    get_sar_dataloader,      # Main dataloader factory function
    SARTransform,             # Transformation pipeline for normalization
    SampleFilter,             # Filter for selecting specific SAR products
    minmax_normalize,         # Normalization utility function
    RC_MIN, RC_MAX,          # Min/max values for Range Compressed data
    GT_MIN, GT_MAX           # Min/max values for Ground Truth (Azimuth focused) data
)

# ============================================================================
# 📁 STEP 1: DATA DIRECTORY SETUP
# ============================================================================
# Define where SAR products are stored locally (or will be downloaded to).
# The path is resolved from the current working directory: a `data` folder
# located in the parent of the directory this code runs from.

DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data'))
print(f'📂 Data directory: {DATA_DIR}')
# This check runs before the directory is created below, so it reports
# whether the directory was already present when the script started.
print(f'✓ Directory exists: {os.path.exists(DATA_DIR)}\n')

# Create the directory (and any missing parents) if it doesn't exist yet
os.makedirs(DATA_DIR, exist_ok=True)

# ============================================================================
# 🎛️ STEP 2: SAMPLE FILTERING (What data to load?)
# ============================================================================
# SampleFilter selects specific SAR products based on their metadata. The
# criteria are gathered in a single mapping first and then passed to
# SampleFilter as keyword arguments.

_filter_criteria = {
    # Acquisition years to include (e.g., [2023, 2024]); only products from
    # these years will be loaded.
    "years": [2023, 2024],

    # Radar polarization modes to include. Options: "hh", "hv", "vh", "vv",
    # where the first letter is the transmit orientation and the second the
    # receive orientation (H = horizontal, V = vertical).
    #   "hh" = horizontal transmit, horizontal receive (co-pol)
    #   "hv" = horizontal transmit, vertical receive (cross-pol)
    "polarizations": ["hh"],

    # Sentinel-1 stripmap beam modes to include (1-6).
    # NOTE(review): earlier notes describe modes 1-3 as narrow-swath /
    # higher-resolution and modes 4-6 as wider-swath / lower-resolution;
    # confirm against the Sentinel-1 stripmap beam specification.
    "stripmap_modes": [1, 2, 3],

    # Geographic regions/partitions of the dataset. Maya4 organizes data into
    # parts (PT1, PT2, etc.) by location; leave empty or None to include all.
    "parts": ["PT1"],
}

filters = SampleFilter(**_filter_criteria)

# ============================================================================
# 🔄 STEP 3: TRANSFORMATION PIPELINE (How to normalize data?)
# ============================================================================
# SARTransform holds one normalization callable per processing level. Every
# level here uses min-max normalization; only the value range differs.


def _range_normalizer(lower, upper):
    """Return ``minmax_normalize`` pre-bound to the value range [lower, upper]."""
    return functools.partial(minmax_normalize, array_min=lower, array_max=upper)


transforms = SARTransform(
    # Level 0 (raw) data.
    # NOTE(review): raw data typically has different statistics than
    # compressed data, yet the RC range is reused here - confirm this is the
    # intended range for raw samples.
    transform_raw=_range_normalizer(RC_MIN, RC_MAX),

    # Range Compressed data: the first compression step (range direction only).
    transform_rc=_range_normalizer(RC_MIN, RC_MAX),

    # Range Cell Migration Corrected data: corrects for range cell migration
    # due to platform motion.
    transform_rcmc=_range_normalizer(RC_MIN, RC_MAX),

    # Azimuth focused (final) data: the ground truth, a fully focused SAR
    # image. Uses its own min/max range.
    transform_az=_range_normalizer(GT_MIN, GT_MAX),
)

# ============================================================================
# 🎯 STEP 4: CREATE THE DATALOADER (Main configuration)
# ============================================================================
# Builds `loader` from the data directory, sample filter and transform
# pipeline defined in the previous steps. Every keyword argument is spelled
# out explicitly so this call doubles as a reference for the available options.
print('='*80)
print('🔧 Creating Maya4 SAR Dataloader with comprehensive configuration...')
print('='*80 + '\n')

loader = get_sar_dataloader(
    # -------------------------------------------------------------------------
    # 📁 DATA SOURCE CONFIGURATION
    # -------------------------------------------------------------------------

    # data_dir: Local directory for storing/loading SAR products
    # Products are organized as: data_dir/[part]/[product_name].zarr/
    data_dir=DATA_DIR,

    # online: Enable automatic downloading from HuggingFace Hub
    # - True: Downloads missing products from Maya4 HF organization
    # - False: Only uses locally available products
    # Note: Requires HF authentication for private datasets
    online=True,

    # filters: SampleFilter object to select specific products
    # Apply metadata-based filtering (year, polarization, mode, location)
    filters=filters,

    # max_products: Maximum number of SAR products to load
    # Useful for testing or limiting dataset size
    # Set to None to load all available products matching filters
    max_products=5,

    # file_pattern: Regex pattern to match specific filenames (optional)
    # Example: ".*20230508.*" to load only products from May 8, 2023
    # Leave as None to include all files matching other filters
    file_pattern=None,

    # -------------------------------------------------------------------------
    # 🎚️ PROCESSING LEVEL SELECTION
    # -------------------------------------------------------------------------

    # level_from: Input processing level (what we learn from)
    # Options: "raw" (L0), "rc", "rcmc", "az" (L1)
    # This is the X in your training pair (input)
    level_from="rcmc",

    # level_to: Target processing level (what we predict)
    # Options: "raw", "rc", "rcmc", "az"
    # This is the Y in your training pair (target/label)
    level_to="az",

    # -------------------------------------------------------------------------
    # 🧩 PATCH EXTRACTION CONFIGURATION
    # -------------------------------------------------------------------------

    # patch_mode: How to extract patches from the full SAR image
    # - "rectangular": Extract rectangular patches (most common)
    # - Other modes may be added in future versions
    patch_mode="rectangular",

    # patch_size: (height, width) of extracted patches in pixels
    # - (H, W): Extract patches of fixed size H×W
    # - (H, -1): Full width, fixed height (entire rows)
    # - (-1, W): Full height, fixed width (entire columns)
    # - (-1, -1): Entire image (not recommended for large products)
    # Example: (1, 1000) extracts 1×1000 horizontal slices
    patch_size=(1000, 1000),

    # buffer: (vertical, horizontal) buffer zones at image boundaries
    # Excludes this many pixels from edges to avoid boundary artifacts
    # Example: (1000, 1000) excludes 1000 pixels from each edge
    buffer=(500, 500),

    # stride: (vertical, horizontal) step size between patches
    # - stride < patch_size: Overlapping patches
    # - stride = patch_size: Non-overlapping patches (tiles)
    # - stride > patch_size: Skip regions between patches
    # Example: (1000, 1000) with patch_size (1000, 1000) = no overlap
    stride=(1000, 1000),

    # max_base_sample_size: (height, width) maximum size for base samples
    # When concatenate_patches=True, limits the size of concatenated blocks
    # Helps manage memory usage when creating large composite samples
    # Set to None for no limit
    max_base_sample_size=None,

    # -------------------------------------------------------------------------
    # 📊 PATCH ORDERING AND SAMPLING STRATEGY
    # -------------------------------------------------------------------------

    # patch_order: Order in which patches are extracted and sampled
    # - "row": Sample patches row-by-row (left to right, top to bottom)
    # - "col": Sample patches column-by-column (top to bottom, left to right)
    # - "chunk": Sample based on underlying Zarr chunk structure (I/O efficient)
    # "chunk" is typically fastest as it respects data storage layout
    patch_order="chunk",

    # shuffle_files: Shuffle the order of SAR products
    # - True: Random product order each epoch
    # - False: Deterministic product order
    shuffle_files=True,

    # use_balanced_sampling: Balance samples across geographic locations
    # Uses K-means clustering on lat/lon to ensure geographic diversity
    # Requires sklearn and at least ~10 products
    # - True: Equal representation from different geographic areas
    # - False: No geographic balancing
    use_balanced_sampling=False,

    # block_pattern: (vertical_blocks, horizontal_blocks) for block sampling
    # Divides each product into blocks and samples within blocks
    # Example: (32, -1) creates 32 vertical blocks, full horizontal extent
    # Set to None for standard patch extraction
    block_pattern=None,

    # -------------------------------------------------------------------------
    # 🔢 DATA TYPE AND REPRESENTATION
    # -------------------------------------------------------------------------

    # complex_valued: Return data as complex numbers or separate real/imag
    # - True: Complex64 tensors (native SAR representation)
    # - False: Float32 tensors with separate real and imaginary channels
    #          Shape changes from (B, H, W) to (B, 2, H, W)
    # NOTE(review): complex tensors have no natural ordering, so reductions
    # such as min()/max() on the returned batches should be taken on the
    # magnitude (abs) rather than on the complex values themselves.
    complex_valued=True,

    # positional_encoding: Add spatial position information to samples
    # Appends normalized (row, col) coordinates as additional channels
    # Useful for models that need spatial awareness
    # - True: Adds 2 channels with position info
    # - False: Only returns SAR data
    positional_encoding=True,

    # -------------------------------------------------------------------------
    # 🔗 ADVANCED: PATCH CONCATENATION
    # -------------------------------------------------------------------------

    # concatenate_patches: Stack multiple patches into larger samples
    # Useful for creating longer sequences or larger spatial contexts
    # - True: Concatenate patches along specified axis
    # - False: Return individual patches
    concatenate_patches=False,

    # concat_axis: Axis along which to concatenate patches
    # Only used when concatenate_patches=True
    # - 0: Vertical concatenation (stack patches vertically)
    # - 1: Horizontal concatenation (stack patches horizontally)
    concat_axis=0,

    # -------------------------------------------------------------------------
    # 🔄 TRANSFORMATION AND NORMALIZATION
    # -------------------------------------------------------------------------

    # transform: SARTransform object defining normalization pipeline
    # Applied to data at each processing level
    # Set to None to disable normalization (use raw values)
    transform=transforms,

    # -------------------------------------------------------------------------
    # 🎛️ DATALOADER CONFIGURATION
    # -------------------------------------------------------------------------

    # batch_size: Number of samples per batch
    # Standard PyTorch DataLoader parameter
    batch_size=8,

    # num_workers: Number of parallel workers for data loading
    # - 0: Load data in main process (good for debugging)
    # - >0: Use multiprocessing (faster but harder to debug)
    # Recommended: 2-4 for most use cases
    num_workers=0,

    # samples_per_prod: Number of patches to extract per SAR product
    # Controls how many samples each product contributes per epoch
    # Higher values = more thorough coverage but longer epochs
    samples_per_prod=100,

    # cache_size: Number of products to keep in memory cache
    # Larger cache = fewer disk reads but more memory usage
    # Recommended: 10-100 depending on available RAM
    cache_size=10,

    # -------------------------------------------------------------------------
    # 🗄️ STORAGE BACKEND
    # -------------------------------------------------------------------------

    # backend: Storage format for SAR products
    # - "zarr": Zarr format (default, supports cloud streaming)
    # - Other backends may be added in future versions
    backend="zarr",

    # -------------------------------------------------------------------------
    # 🐛 DEBUGGING AND DEVELOPMENT
    # -------------------------------------------------------------------------

    # verbose: Print detailed information during initialization and loading
    # - True: Show download progress, cache info, debugging messages
    # - False: Minimal output
    verbose=True,

    # save_samples: Save extracted patches to disk for inspection
    # Useful for debugging and visualizing what the model sees
    # - True: Save patches as image files
    # - False: No saving (recommended for training)
    save_samples=False,
)

# ============================================================================
# 🏃 STEP 5: ITERATE THROUGH BATCHES
# ============================================================================
# Pulls a handful of (input, target) batches from `loader`, prints their
# shapes, and summarizes the value range of the first batch.
print('\n' + '='*80)
print('📊 Loading batches and inspecting shapes...')
print('='*80 + '\n')

# Number of batches to display before stopping the demonstration
num_batches_to_show = 5


def _is_complex_batch(batch):
    """Return True when ``batch`` holds complex-valued data.

    NOTE(review): assumes batches are torch tensors (which expose an
    ``is_complex()`` method) or NumPy-compatible arrays - confirm against
    what the dataloader actually yields.
    """
    is_complex = getattr(batch, 'is_complex', None)
    if callable(is_complex):
        return bool(is_complex())
    return bool(np.iscomplexobj(batch))


def _print_value_stats(label, batch):
    """Print min/max/mean of ``batch``, using magnitudes for complex data.

    Complex numbers have no ordering and PyTorch does not implement
    min()/max() for complex tensors, so complex batches are summarized via
    abs(); the label is shown as ``|label|`` to make that explicit. Real-valued
    batches are summarized directly.
    """
    if _is_complex_batch(batch):
        values, shown = abs(batch), f'|{label}|'
    else:
        values, shown = batch, label
    print(f'   {shown} - min: {values.min():.4f}, max: {values.max():.4f}, mean: {values.mean():.4f}')


# Tracks how many batches were actually produced, so an empty loader is not
# reported as a success below.
batches_seen = 0

for i, (x_batch, y_batch) in enumerate(loader):
    # x_batch: Input data (level_from)
    # y_batch: Target data (level_to)
    batches_seen = i + 1

    print(f'Batch {i:3d}: X shape: {str(x_batch.shape):20s} | Y shape: {str(y_batch.shape):20s}')

    # Show data statistics for the first batch only
    if i == 0:
        print('\n📈 First batch statistics:')
        _print_value_stats('X', x_batch)
        _print_value_stats('Y', y_batch)
        print(f'   X dtype: {x_batch.dtype}, Y dtype: {y_batch.dtype}\n')

    # Stop after showing a few batches for demonstration
    if i >= num_batches_to_show - 1:
        break

print('\n' + '='*80)
if batches_seen:
    print('✅ SUCCESS! Maya4 dataloader is working correctly.')
else:
    # The loop body never ran: nothing matched or nothing could be loaded.
    print('⚠️ No batches were produced - check the filters, data directory, and online setting.')
print('='*80)