# Tutorial 3: DateTime-Based Extraction and Filtering

**Learning Goals:** Master temporal filtering to extract specific time periods from climate archives, enabling efficient seasonal analysis and event studies.

**Time Estimate:** 30 minutes

**Prerequisites:** Tutorials 1 and 2 completed

## The DateTime Challenge in Earth Science

Climate simulations often span years or decades, generating massive archives. But most analyses focus on specific time periods:

```
❌ The Problem:
Your 10-year CESM simulation archive (500GB) contains:
├── cam.h0.2015-01.nc  # January 2015 data
├── cam.h0.2015-02.nc  # February 2015 data
├── ...
├── cam.h0.2024-12.nc  # December 2024 data

But you only need:
🎯 Summer 2023 data for heat wave analysis
🎯 El Niño years (2015-2016, 2023-2024) for ENSO study
🎯 Monthly data from 2020-2022 for pandemic climate impacts
```

**The Challenge**: Extracting 3 months of data shouldn't require downloading 10 years!

**Tellus Solution**: DateTime-based filtering lets you extract exactly the time periods you need, with intelligent pattern matching and temporal logic.

In this tutorial, you'll learn to:
1. Extract specific dates and date ranges
2. Use temporal patterns for seasonal analysis
3. Handle different date formats in filenames
4. Combine datetime filtering with content filtering
5. Work with multi-decadal datasets efficiently

## Setup: Creating a Multi-Year Climate Dataset

Let's create a realistic multi-year climate simulation with different temporal outputs:

In [None]:
import tempfile
from pathlib import Path
import json
import numpy as np
import xarray as xr
from datetime import datetime, timedelta
import calendar
from tellus.core.cli import console
from rich.table import Table
from rich.panel import Panel
from rich.progress import track

# Create tutorial workspace
tutorial_dir = Path(tempfile.mkdtemp())
console.print(f"[blue]Tutorial workspace: {tutorial_dir}[/blue]")

def create_multiyear_climate_dataset():
    """
    Create a realistic multi-year climate dataset with different temporal frequencies.

    Builds a 5-year (2020-2024) CESM-style directory tree under ``tutorial_dir``
    containing monthly, daily (selected periods), seasonal, annual, event-based
    and restart NetCDF files, and writes ``docs/temporal_coverage.json``
    describing what was generated.

    Returns:
        tuple: ``(dataset_dir, temporal_info)`` - the dataset root directory and
        the dictionary that was serialized to ``temporal_coverage.json``.
    """

    simulation_years = range(2020, 2025)
    components = ["cam", "clm", "pop", "cice"]

    dataset_dir = tutorial_dir / "cesm_2020_2024_simulation"
    dataset_dir.mkdir(parents=True, exist_ok=True)

    console.print("[blue]Creating 5-year multi-frequency climate dataset...[/blue]")

    # ==========================================
    # 1. MONTHLY OUTPUT - Primary analysis data
    # ==========================================
    monthly_dir = dataset_dir / "output" / "monthly"
    monthly_dir.mkdir(parents=True, exist_ok=True)

    console.print("  📅 Creating monthly output files...")

    # Each component has its own history-file prefix and synthetic data type
    monthly_streams = [
        ("cam.h0", "atmosphere"),
        ("clm.h0", "land"),
        ("pop.h", "ocean"),
        ("cice.h", "seaice"),
    ]

    monthly_files_created = []

    for year in simulation_years:
        for month in range(1, 13):
            for prefix, data_type in monthly_streams:
                filename = f"{prefix}.{year:04d}-{month:02d}.nc"
                create_sample_climate_data(monthly_dir / filename, data_type, year, month)
                monthly_files_created.append(filename)

    console.print(f"    ✅ Created {len(monthly_files_created)} monthly files")

    # ==========================================
    # 2. DAILY OUTPUT - High-frequency data
    # ==========================================
    daily_dir = dataset_dir / "output" / "daily"
    daily_dir.mkdir(parents=True, exist_ok=True)

    console.print("  📆 Creating daily output samples (selected periods)...")

    # Daily data exists only for specific periods of interest
    daily_periods = [
        # Summer 2023 - heat wave analysis period
        {'year': 2023, 'months': [6, 7, 8], 'label': 'summer_heatwave'},
        # Winter 2021-2022 - extreme winter analysis
        {'year': 2021, 'months': [12], 'label': 'extreme_winter_start'},
        {'year': 2022, 'months': [1, 2], 'label': 'extreme_winter_end'},
        # El Niño period 2023-2024
        {'year': 2023, 'months': [10, 11, 12], 'label': 'el_nino_start'},
        {'year': 2024, 'months': [1, 2, 3], 'label': 'el_nino_peak'}
    ]

    daily_files_created = []

    for period in daily_periods:
        year = period['year']
        for month in period['months']:
            # One file per month holding one time step per calendar day
            days_in_month = calendar.monthrange(year, month)[1]

            # Atmospheric daily data (surface variables)
            filename = f"cam.h1.{year:04d}-{month:02d}.nc"
            create_sample_climate_data(
                daily_dir / filename, "atmosphere_daily", year, month, days_in_month
            )
            daily_files_created.append(filename)

    console.print(f"    ✅ Created {len(daily_files_created)} daily files for key periods")

    # ==========================================
    # 3. SEASONAL OUTPUT - Climate analysis
    # ==========================================
    seasonal_dir = dataset_dir / "output" / "seasonal"
    seasonal_dir.mkdir(parents=True, exist_ok=True)

    console.print("  🌱 Creating seasonal climatology files...")

    seasonal_files_created = []
    seasons = {'DJF': 'winter', 'MAM': 'spring', 'JJA': 'summer', 'SON': 'autumn'}

    for year in simulation_years:
        for season_abbrev, season_name in seasons.items():
            filename = f"cesm.{season_abbrev}.{year:04d}.nc"
            create_sample_climate_data(
                seasonal_dir / filename, "seasonal", year, season=season_name
            )
            seasonal_files_created.append(filename)

    console.print(f"    ✅ Created {len(seasonal_files_created)} seasonal files")

    # ==========================================
    # 4. ANNUAL OUTPUT - Long-term trends
    # ==========================================
    annual_dir = dataset_dir / "output" / "annual"
    annual_dir.mkdir(parents=True, exist_ok=True)

    console.print("  🗓️ Creating annual summary files...")

    annual_files_created = []

    for year in simulation_years:
        # The data type doubles as the middle part of the filename
        for data_type in ("annual_mean", "annual_extremes"):
            filename = f"cesm.{data_type}.{year:04d}.nc"
            create_sample_climate_data(annual_dir / filename, data_type, year)
            annual_files_created.append(filename)

    console.print(f"    ✅ Created {len(annual_files_created)} annual files")

    # ==========================================
    # 5. EVENT-BASED OUTPUT - Specific phenomena
    # ==========================================
    events_dir = dataset_dir / "output" / "events"
    events_dir.mkdir(parents=True, exist_ok=True)

    console.print("  ⚡ Creating event-based analysis files...")

    # Specific climate events with irregular timing
    climate_events = [
        {'name': 'heatwave_2023_07_15_to_2023_07_25.nc', 'type': 'extreme_event'},
        {'name': 'hurricane_season_2023_08_15_to_2023_10_30.nc', 'type': 'seasonal_phenomenon'},
        {'name': 'arctic_oscillation_negative_2024_01_10_to_2024_02_20.nc', 'type': 'teleconnection'},
        {'name': 'blocking_pattern_2022_12_01_to_2022_12_15.nc', 'type': 'circulation_pattern'},
        {'name': 'drought_onset_2021_05_01_to_2021_08_31.nc', 'type': 'hydrological_event'}
    ]

    event_files_created = []

    for event in climate_events:
        create_sample_climate_data(events_dir / event['name'], event['type'])
        event_files_created.append(event['name'])

    console.print(f"    ✅ Created {len(event_files_created)} event-based files")

    # ==========================================
    # 6. RESTART FILES - With timestamps
    # ==========================================
    restart_dir = dataset_dir / "restart"
    restart_dir.mkdir(parents=True, exist_ok=True)

    console.print("  🔄 Creating restart files...")

    restart_files_created = []

    # Restart files are stamped January 1st of each year after the first
    for year in range(2021, 2025):
        restart_date = f"{year:04d}-01-01-00000"
        for component in components:
            filename = f"{component}.r.{restart_date}.nc"
            create_sample_climate_data(restart_dir / filename, "restart", year)
            restart_files_created.append(filename)

    console.print(f"    ✅ Created {len(restart_files_created)} restart files")

    # ==========================================
    # 7. METADATA AND DOCUMENTATION
    # ==========================================
    docs_dir = dataset_dir / "docs"
    docs_dir.mkdir(parents=True, exist_ok=True)

    # Create temporal coverage documentation
    temporal_info = {
        "simulation_period": {
            "start_date": "2020-01-01",
            "end_date": "2024-12-31",
            "total_years": len(simulation_years)
        },
        "output_frequencies": {
            "monthly": {
                "description": "Standard monthly means",
                "files_per_year": 12 * len(monthly_streams),
                "total_files": len(monthly_files_created),
                "components": components
            },
            "daily": {
                "description": "High-frequency data for key periods",
                "total_files": len(daily_files_created),
                "key_periods": [p['label'] for p in daily_periods]
            },
            "seasonal": {
                "description": "Seasonal climatologies",
                "seasons": list(seasons.keys()),
                "total_files": len(seasonal_files_created)
            },
            "annual": {
                "description": "Annual summaries and extremes",
                "total_files": len(annual_files_created)
            },
            "events": {
                "description": "Specific climate phenomena",
                "total_files": len(event_files_created),
                # Sorted so the JSON is identical from run to run
                # (plain set iteration order is not stable for strings)
                "event_types": sorted({e['type'] for e in climate_events})
            }
        },
        "date_formats_used": {
            "monthly": "YYYY-MM",
            "daily": "YYYY-MM",
            "seasonal": "SEASON.YYYY",
            "annual": "YYYY",
            "events": "YYYY_MM_DD_to_YYYY_MM_DD",
            # The stamp written above ends in a five-digit field ("00000")
            "restart": "YYYY-MM-DD-SSSSS"
        }
    }

    (docs_dir / "temporal_coverage.json").write_text(json.dumps(temporal_info, indent=2))

    return dataset_dir, temporal_info

def create_sample_climate_data(filepath, data_type, year=2023, month=1, days=None, season=None):
    """
    Create a sample NetCDF file with synthetic climate data and temporal coordinates.

    Every field is expanded to the full grid before the dataset is assembled:
    ``xr.Dataset`` requires all variables that share a dimension to agree on
    its length and does not broadcast size-1 axes.

    Args:
        filepath: Destination path of the NetCDF file.
        data_type: One of "atmosphere", "atmosphere_daily", "land", "ocean",
            "seaice", "seasonal", "annual_mean", "annual_extremes" or
            "restart". Any other value produces a generic single-variable
            event file tagged with ``data_type``.
        year: Year used for the time coordinate and in titles.
        month: Month (1-12) used by the monthly and daily data types.
        days: Number of daily time steps for "atmosphere_daily"; defaults to
            the number of days in ``month``.
        season: "winter", "spring", "summer" or "autumn"; required when
            ``data_type`` is "seasonal".

    Raises:
        ValueError: If ``data_type`` is "seasonal" and ``season`` is not one
            of the four known season names.
    """

    # Standard spatial grid
    lat = np.linspace(-89.5, 89.5, 96)
    lon = np.linspace(0, 359, 144)
    n_lat, n_lon = lat.size, lon.size
    cos_lat = np.cos(np.radians(lat))
    tyx = ['time', 'lat', 'lon']

    def on_grid(values, *leading):
        """Return a writable float array of shape (*leading, n_lat, n_lon)."""
        # np.array(...) copies the read-only broadcast view into real storage
        return np.array(np.broadcast_to(values, (*leading, n_lat, n_lon)), dtype=float)

    def zonal(profile):
        """Expand a 1-D latitude profile to a single-step (1, lat, lon) field."""
        return on_grid(profile[:, None], 1)

    if data_type == "atmosphere":
        # Monthly atmospheric data, stamped mid-month
        time = [datetime(year, month, 15)]
        temp = 288 + 30 * cos_lat + 5 * np.sin(2 * np.pi * month / 12)

        ds = xr.Dataset({
            'T': (tyx, zonal(temp)),
            'PRECC': (tyx, zonal(0.001 * np.abs(cos_lat))),
            'Q': (tyx, on_grid(0.01, 1))
        }, coords={'time': time, 'lat': lat, 'lon': lon})

        ds.attrs = {
            'title': f'CAM Atmospheric Output - {year:04d}-{month:02d}',
            'model': 'CAM6',
            'frequency': 'monthly',
            'temporal_coverage': f'{year:04d}-{month:02d}'
        }

    elif data_type == "atmosphere_daily":
        # Daily atmospheric data for the full month
        if days is None:
            days = calendar.monthrange(year, month)[1]

        time_daily = [datetime(year, month, day) for day in range(1, days + 1)]

        # Latitude profile plus a 30-day sinusoidal day-to-day variation -> (days, lat)
        daily_variation = 5 * np.sin(2 * np.pi * np.arange(days) / 30)
        temp_daily = 288 + 30 * cos_lat[None, :] + daily_variation[:, None]

        ds = xr.Dataset({
            'TS': (tyx, on_grid(temp_daily[:, :, None], days)),
            'PSL': (tyx, on_grid(101325.0, days)),
            'PRECT': (tyx, 0.001 * np.random.rand(days, n_lat, n_lon))
        }, coords={'time': time_daily, 'lat': lat, 'lon': lon})

        ds.attrs = {
            'title': f'CAM Daily Surface Output - {year:04d}-{month:02d}',
            'model': 'CAM6',
            'frequency': 'daily',
            'temporal_coverage': f'{year:04d}-{month:02d}'
        }

    elif data_type == "land":
        time = [datetime(year, month, 15)]
        temp = 285 + 25 * cos_lat + 3 * np.sin(2 * np.pi * month / 12)

        ds = xr.Dataset({
            'TSA': (tyx, zonal(temp)),
            'GPP': (tyx, zonal(0.01 * np.abs(cos_lat))),
            'SOILWATER_10CM': (tyx, on_grid(0.3, 1))
        }, coords={'time': time, 'lat': lat, 'lon': lon})

        ds.attrs = {'title': f'CLM Land Output - {year:04d}-{month:02d}', 'frequency': 'monthly'}

    elif data_type == "ocean":
        time = [datetime(year, month, 15)]
        depth = np.array([5, 15, 25, 50, 100])
        n_depth = depth.size
        tzyx = ['time', 'z_t', 'lat', 'lon']

        # (z_t, lat, 1): decreases with depth, varies with latitude
        temp = 290 - 0.5 * depth[:, None, None] + 15 * cos_lat[None, :, None]

        ds = xr.Dataset({
            'TEMP': (tzyx, on_grid(temp, 1, n_depth)),
            'SALT': (tzyx, on_grid(35.0, 1, n_depth)),
            'SSH': (tyx, zonal(0.1 * np.sin(2 * np.radians(lat))))
        }, coords={'time': time, 'z_t': depth, 'lat': lat, 'lon': lon})

        ds.attrs = {'title': f'POP Ocean Output - {year:04d}-{month:02d}', 'frequency': 'monthly'}

    elif data_type == "seaice":
        time = [datetime(year, month, 15)]
        # Ice only poleward of 60 degrees, scaled by a cosine of the month
        ice_base = (np.abs(lat) > 60).astype(float)
        seasonal_factor = 0.3 * np.cos(2 * np.pi * (month - 3) / 12)
        aice = ice_base * (0.7 + seasonal_factor)

        ds = xr.Dataset({
            'aice': (tyx, zonal(aice)),
            'hi': (tyx, zonal(2.0 * aice))
        }, coords={'time': time, 'lat': lat, 'lon': lon})

        ds.attrs = {'title': f'CICE Sea Ice Output - {year:04d}-{month:02d}', 'frequency': 'monthly'}

    elif data_type == "seasonal":
        # Seasonal averages
        season_months = {
            'winter': [12, 1, 2], 'spring': [3, 4, 5],
            'summer': [6, 7, 8], 'autumn': [9, 10, 11]
        }
        if season not in season_months:
            raise ValueError(
                f"season must be one of {sorted(season_months)} for seasonal data, got {season!r}"
            )

        # Use middle month for representative values
        rep_month = season_months[season][1]
        time = [datetime(year, rep_month, 15)]

        temp = 288 + 30 * cos_lat + 10 * np.sin(2 * np.pi * rep_month / 12)

        ds = xr.Dataset({
            'T_seasonal': (tyx, zonal(temp)),
            'PREC_seasonal': (tyx, zonal(0.002 * np.abs(cos_lat)))
        }, coords={'time': time, 'lat': lat, 'lon': lon})

        ds.attrs = {
            'title': f'CESM Seasonal Average - {season.title()} {year:04d}',
            'frequency': 'seasonal',
            'season': season
        }

    elif data_type in ["annual_mean", "annual_extremes"]:
        time = [datetime(year, 7, 1)]  # Mid-year representative

        if data_type == "annual_mean":
            temp = 288 + 30 * cos_lat
            var_name, title_suffix = 'T_annual_mean', 'Annual Mean'
        else:
            temp = 308 + 35 * cos_lat  # Higher for extremes
            var_name, title_suffix = 'T_annual_max', 'Annual Maximum'

        ds = xr.Dataset({
            var_name: (tyx, zonal(temp))
        }, coords={'time': time, 'lat': lat, 'lon': lon})

        ds.attrs = {
            'title': f'CESM {title_suffix} - {year:04d}',
            'frequency': 'annual'
        }

    elif data_type == "restart":
        # Restart files - two constant 2-D fields, no time axis
        ds = xr.Dataset({
            'STATE': (['lat', 'lon'], on_grid(300.0)),
            'CHECKPOINT': (['lat', 'lon'], on_grid(295.0))
        }, coords={'lat': lat, 'lon': lon})

        ds.attrs = {
            'title': f'CESM Restart File - {year:04d}',
            'restart_date': f'{year:04d}-01-01'
        }

    else:
        # Generic event-based data
        time = [datetime(year, 7, 15)]  # Default time
        temp = 295 + 20 * cos_lat

        ds = xr.Dataset({
            'event_data': (tyx, zonal(temp))
        }, coords={'time': time, 'lat': lat, 'lon': lon})

        ds.attrs = {'title': f'Climate Event Data - {data_type}', 'event_type': data_type}

    # Save the dataset
    ds.to_netcdf(filepath, format='NETCDF4_CLASSIC')

# Create the multi-year dataset
# Keeps both return values: the dataset root directory and the coverage summary dict
dataset_dir, temporal_info = create_multiyear_climate_dataset()
# Only the directory name is shown, not the full temporary path
console.print(f"\n[green]✅ Multi-year climate dataset created: {dataset_dir.name}[/green]")

## Understanding Our Dataset's Temporal Structure

Let's examine the temporal patterns in our dataset before learning to filter them:

In [None]:
def analyze_temporal_patterns(dataset_dir):
    """
    Analyze the temporal patterns in our climate dataset.

    Walks the ``output/<frequency>`` directories and the ``restart`` directory
    below ``dataset_dir``, prints what date patterns appear in the filenames,
    and collects one summary entry per frequency that has a known pattern.

    Args:
        dataset_dir: Root ``Path`` of the generated dataset.

    Returns:
        list[dict]: One dict per analysed frequency (monthly, seasonal, annual,
        events) with the keys ``frequency``, ``pattern``, ``examples`` and
        ``extraction_use``. Examples are sorted so repeated runs agree.
    """

    def first_or_na(items):
        """Return the first item, or 'N/A' when there is nothing to show."""
        return items[0] if items else 'N/A'

    def netcdf_files_in(directory):
        """List the *.nc files of a directory ordered by filename."""
        return sorted(directory.glob('*.nc'), key=lambda p: p.name)

    console.print("\n[bold blue]📊 Temporal Pattern Analysis[/bold blue]")
    console.print("=" * 50)

    # Analyze different output directories
    output_dirs = {
        'monthly': 'Standard monthly outputs',
        'daily': 'High-frequency daily data',
        'seasonal': 'Seasonal climatologies',
        'annual': 'Annual summaries',
        'events': 'Event-based analysis'
    }

    datetime_patterns = []

    for freq_type, description in output_dirs.items():
        output_path = dataset_dir / "output" / freq_type
        if not output_path.exists():
            continue

        files = netcdf_files_in(output_path)

        console.print(f"\n[cyan]{freq_type.upper()}:[/cyan] {description}")
        console.print(f"  Files: {len(files)}")

        # Extract datetime patterns from filenames
        if freq_type == 'monthly':
            # A dot-separated part of length 7 containing '-' is taken as YYYY-MM
            years_months = sorted({
                part
                for file in files
                for part in file.stem.split('.')
                if '-' in part and len(part) == 7
            })

            years = sorted({ym.split('-')[0] for ym in years_months})
            console.print(f"  Years: {', '.join(years)}")
            console.print(f"  Pattern: YYYY-MM (e.g., {first_or_na(years_months)})")

            # Sample filenames
            sample_files = [f.name for f in files[:3]]
            console.print(f"  Sample files: {', '.join(sample_files)}")

            datetime_patterns.append({
                'frequency': freq_type,
                'pattern': 'YYYY-MM',
                'examples': years_months[:5],
                'extraction_use': 'Monthly analysis, seasonal studies'
            })

        elif freq_type == 'seasonal':
            # Stems look like cesm.SEASON.YYYY; keep the SEASON.YYYY part
            stem_parts = (file.stem.split('.') for file in files)
            season_years = sorted({
                f"{parts[1]}.{parts[2]}" for parts in stem_parts if len(parts) >= 3
            })

            console.print("  Seasons: DJF, MAM, JJA, SON")
            console.print(f"  Pattern: SEASON.YYYY (e.g., {first_or_na(season_years)})")

            datetime_patterns.append({
                'frequency': freq_type,
                'pattern': 'SEASON.YYYY',
                'examples': season_years[:5],
                'extraction_use': 'Climate normals, seasonal comparisons'
            })

        elif freq_type == 'annual':
            # Any all-digit, four-character part of the stem counts as a year
            years = sorted({
                part
                for file in files
                for part in file.stem.split('.')
                if part.isdigit() and len(part) == 4
            })

            console.print(f"  Years: {', '.join(years)}")
            console.print(f"  Pattern: YYYY (e.g., {first_or_na(years)})")

            datetime_patterns.append({
                'frequency': freq_type,
                'pattern': 'YYYY',
                'examples': years,
                'extraction_use': 'Long-term trends, decadal analysis'
            })

        elif freq_type == 'events':
            # Event-based datetime patterns
            console.print("  Event-based files with embedded dates:")
            for file in files:
                console.print(f"    • {file.name}")

            datetime_patterns.append({
                'frequency': freq_type,
                'pattern': 'YYYY_MM_DD_to_YYYY_MM_DD',
                'examples': [f.stem for f in files[:3]],
                'extraction_use': 'Specific events, case studies'
            })

    # Analyze restart files
    restart_path = dataset_dir / "restart"
    if restart_path.exists():
        restart_files = netcdf_files_in(restart_path)
        console.print("\n[cyan]RESTART:[/cyan] Simulation continuation files")
        console.print(f"  Files: {len(restart_files)}")
        # The fallback example below ends in a five-digit field, hence SSSSS
        console.print("  Pattern: YYYY-MM-DD-SSSSS")

        sample_restart = restart_files[0].name if restart_files else "cam.r.2021-01-01-00000.nc"
        console.print(f"  Example: {sample_restart}")

    return datetime_patterns

# Analyze the temporal patterns
patterns = analyze_temporal_patterns(dataset_dir)

# Create summary table: one row per analysed frequency
patterns_table = Table(title="DateTime Patterns in Dataset")
patterns_table.add_column("Frequency", style="cyan")
patterns_table.add_column("Pattern", style="yellow")
patterns_table.add_column("Example", style="green")
patterns_table.add_column("Best For", style="dim")

for pattern in patterns:
    example = pattern['examples'][0] if pattern['examples'] else 'N/A'
    patterns_table.add_row(
        pattern['frequency'].title(),
        pattern['pattern'],
        example,
        pattern['extraction_use']
    )

# Hand the Table object itself to console.print so Rich renders it.
# Interpolating it into an f-string would print only the object's default repr.
console.print()
console.print(patterns_table)

# Show total file counts (walk the tree once and reuse the listing)
netcdf_files = list(dataset_dir.rglob('*.nc'))
total_files = len(netcdf_files)
total_size = sum(f.stat().st_size for f in netcdf_files) / (1024 * 1024)  # bytes -> MB

console.print("\n[bold green]📈 Dataset Summary[/bold green]")
console.print(f"Total NetCDF files: {total_files}")
console.print(f"Total size: {total_size:.1f} MB")
console.print("Time span: 2020-2024 (5 years)")
console.print("Frequencies: Monthly, daily, seasonal, annual, event-based")

## Step 1: Basic DateTime Extraction

Let's start with simple datetime filtering - extracting specific years, months, or date ranges:

In [None]:
import re
import tarfile
from datetime import datetime, timedelta
from dateutil import parser

def extract_datetime_from_filename(filename):
    """
    Extract datetime information from a climate model filename.

    Patterns are tried from most specific to least specific and the first hit
    wins. The order matters: a restart stamp such as ``2021-01-01-00000`` also
    contains ``YYYY-MM``, and an event name such as ``2023_07_15`` also
    contains a bare ``YYYY``, so the broader patterns must come last or they
    would shadow the specific ones.

    Args:
        filename: Filename (not a full path) to inspect.

    Returns:
        dict | None: Always contains ``year`` and ``type`` (one of "restart",
        "event", "seasonal", "monthly", "annual"); additionally ``month`` and
        ``day`` for restart/event names, ``month`` for monthly names and
        ``season`` for seasonal names. ``None`` when no pattern matches.
    """

    # Common datetime patterns in Earth Science filenames, most specific first
    patterns = [
        # Restart: YYYY-MM-DD-SSSSS (five-digit trailing field)
        (r'(\d{4})-(\d{2})-(\d{2})-(\d{5})', lambda m: {
            'year': int(m.group(1)),
            'month': int(m.group(2)),
            'day': int(m.group(3)),
            'type': 'restart'
        }),

        # Event dates: YYYY_MM_DD (the first date in the name is reported)
        (r'(\d{4})_(\d{2})_(\d{2})', lambda m: {
            'year': int(m.group(1)),
            'month': int(m.group(2)),
            'day': int(m.group(3)),
            'type': 'event'
        }),

        # Seasonal: SEASON.YYYY
        (r'(DJF|MAM|JJA|SON)\.(\d{4})', lambda m: {
            'year': int(m.group(2)),
            'season': m.group(1),
            'type': 'seasonal'
        }),

        # Monthly: YYYY-MM
        (r'(\d{4})-(\d{2})', lambda m: {
            'year': int(m.group(1)),
            'month': int(m.group(2)),
            'type': 'monthly'
        }),

        # Annual: a four-digit run not touching other digits
        (r'(?:^|[^\d])(\d{4})(?:[^\d]|$)', lambda m: {'year': int(m.group(1)), 'type': 'annual'}),
    ]

    for pattern, extractor in patterns:
        match = re.search(pattern, filename)
        if match:
            return extractor(match)

    return None

def filter_files_by_datetime(files, filter_criteria):
    """
    Keep the files whose filename-derived datetime info meets every criterion.

    Supported keys in ``filter_criteria`` (all optional, combined with AND):
    ``year``, ``month``, ``season`` and ``temporal_type`` take either a single
    value or a list of accepted values; ``year_range`` takes an inclusive
    ``(start_year, end_year)`` pair. Files whose name yields no datetime info
    are dropped.

    Returns:
        list[dict]: One entry per retained file with the keys ``file``,
        ``datetime_info`` and ``filename``, in input order.
    """

    # (criterion key, datetime-info field, is it an inclusive range?)
    # Listed in the order in which the criteria are evaluated.
    criterion_specs = (
        ('year', 'year', False),
        ('year_range', 'year', True),
        ('month', 'month', False),
        ('season', 'season', False),
        ('temporal_type', 'type', False),
    )

    selected = []

    for candidate in files:
        name = candidate.name
        info = extract_datetime_from_filename(name)

        if not info:
            continue

        keep = True

        # Every present criterion is evaluated, even after one has failed
        for criterion, field, is_range in criterion_specs:
            if criterion not in filter_criteria:
                continue

            wanted = filter_criteria[criterion]

            if is_range:
                first_year, last_year = wanted
                # A missing year counts as 0, which fails any realistic range
                rejected = not (first_year <= info.get(field, 0) <= last_year)
            elif isinstance(wanted, list):
                rejected = info.get(field) not in wanted
            else:
                rejected = info.get(field) != wanted

            if rejected:
                keep = False

        if keep:
            selected.append({
                'file': candidate,
                'datetime_info': info,
                'filename': name
            })

    return selected

# Collect every NetCDF file below the dataset root
all_files = list(dataset_dir.rglob('*.nc'))
console.print(f"\n[bold blue]🔍 Basic DateTime Extraction Examples[/bold blue]")
console.print(f"Working with {len(all_files)} total files")
console.print("=" * 50)

# Example 1: a single year
console.print("\n[cyan]Example 1: Extract all files from 2023[/cyan]")
year_2023_files = filter_files_by_datetime(all_files, {'year': 2023})

console.print(f"Found {len(year_2023_files)} files from 2023:")
# Preview only the first five matches and summarise the remainder
for item in year_2023_files[:5]:
    dt_info = item['datetime_info']
    console.print(f"  • {item['filename']} (Type: {dt_info['type']}, Year: {dt_info['year']})")
if len(year_2023_files) > 5:
    console.print(f"  ... and {len(year_2023_files) - 5} more files")

# Example 2: the summer months of every year
console.print("\n[cyan]Example 2: Extract summer months (JJA) across all years[/cyan]")
summer_files = filter_files_by_datetime(all_files, {'month': [6, 7, 8]})

console.print(f"Found {len(summer_files)} summer files:")
# Bucket the matches by year
summer_by_year = {}
for item in summer_files:
    summer_by_year.setdefault(item['datetime_info']['year'], []).append(item)

for year in sorted(summer_by_year):
    console.print(f"  {year}: {len(summer_by_year[year])} files")

# Example 3: an inclusive range of years
console.print("\n[cyan]Example 3: Extract files from 2021-2023 period[/cyan]")
range_2021_2023 = filter_files_by_datetime(all_files, {'year_range': (2021, 2023)})

console.print(f"Found {len(range_2021_2023)} files from 2021-2023:")
# Tally the matches per temporal type
by_type = {}
for item in range_2021_2023:
    dt_type = item['datetime_info']['type']
    by_type.setdefault(dt_type, 0)
    by_type[dt_type] += 1

for dt_type, count in by_type.items():
    console.print(f"  {dt_type.title()}: {count} files")

# Example 4: one season
console.print("\n[cyan]Example 4: Extract winter season (DJF) data[/cyan]")
winter_files = filter_files_by_datetime(all_files, {'season': 'DJF'})

console.print(f"Found {len(winter_files)} winter season files:")
for item in winter_files:
    dt_info = item['datetime_info']
    console.print(f"  • {item['filename']} (Year: {dt_info['year']})")

## Step 2: Advanced DateTime Filtering Scenarios

Now let's explore more complex scenarios that climate scientists commonly face:

In [None]:
def advanced_datetime_filtering_scenarios():
    """
    Demonstrate advanced datetime filtering scenarios common in climate science.

    Runs four scenarios against the module-level ``all_files`` listing using
    ``filter_files_by_datetime``. For each scenario it prints the matching
    file counts followed by an example Tellus command line (the commands are
    only printed, never executed).

    Returns:
        dict: Number of files selected per scenario, keyed by
        ``'el_nino_files'``, ``'summer_comparison'``, ``'event_files'`` and
        ``'q1_files'``. ``'q1_files'`` is the sum of the 2022 monthly (Jan-Mar),
        winter (DJF) and spring (MAM) selections.
    """

    console.print("\n[bold blue]🎯 Advanced DateTime Filtering Scenarios[/bold blue]")
    console.print("=" * 60)

    # Scenario 1: El Niño/La Niña Analysis
    console.print("\n[bold green]Scenario 1: El Niño Period Analysis (2023-2024)[/bold green]")
    console.print("[dim]Goal: Extract data during known El Niño conditions for impact analysis[/dim]")

    # Select by year range and temporal type only; no season restriction is applied.
    el_nino_criteria = {
        'year_range': (2023, 2024),
        'temporal_type': ['monthly', 'seasonal', 'event']
    }

    el_nino_files = filter_files_by_datetime(all_files, el_nino_criteria)
    console.print(f"Found {len(el_nino_files)} files for El Niño period analysis")

    # Show breakdown by "<year>-<type>" key
    el_nino_breakdown = {}
    for item in el_nino_files:
        dt_info = item['datetime_info']
        key = f"{dt_info['year']}-{dt_info['type']}"
        el_nino_breakdown[key] = el_nino_breakdown.get(key, 0) + 1

    for key in sorted(el_nino_breakdown):
        console.print(f"  {key}: {el_nino_breakdown[key]} files")

    # Real-world Tellus command
    console.print("[blue]Equivalent Tellus command:[/blue]")
    console.print("[dim]tellus archive extract climate_simulation \\")
    console.print("[dim]  --date-range '2023-01-01:2024-12-31' \\")
    console.print("[dim]  --content-types output,diagnostic \\")
    console.print("[dim]  --location analysis_workspace[/dim]")

    # Scenario 2: Seasonal Comparison Study
    console.print("\n[bold green]Scenario 2: Multi-Year Summer Comparison (2020 vs 2023)[/bold green]")
    console.print("[dim]Goal: Compare two specific summers for heat wave analysis[/dim]")

    # The two summers (JJA) being compared: 2020 as the reference, 2023 as the target.
    summer_2020 = filter_files_by_datetime(all_files, {'year': 2020, 'month': [6, 7, 8]})
    summer_2023 = filter_files_by_datetime(all_files, {'year': 2023, 'month': [6, 7, 8]})

    console.print(f"Summer 2020 files: {len(summer_2020)}")
    console.print(f"Summer 2023 files: {len(summer_2023)}")
    console.print(f"Total for comparison: {len(summer_2020) + len(summer_2023)}")

    console.print("[blue]Equivalent Tellus command:[/blue]")
    console.print("[dim]tellus archive extract climate_simulation \\")
    console.print("[dim]  --date-pattern '%Y-%m' \\")
    console.print("[dim]  --date-list '2020-06,2020-07,2020-08,2023-06,2023-07,2023-08' \\")
    console.print("[dim]  --patterns '*cam.h*' \\")
    console.print("[dim]  --location comparison_workspace[/dim]")

    # Scenario 3: Event-Based Extraction
    console.print("\n[bold green]Scenario 3: Extreme Event Analysis[/bold green]")
    console.print("[dim]Goal: Extract files related to specific extreme weather events[/dim]")

    # Select only the files whose temporal type is 'event'
    event_files = filter_files_by_datetime(all_files, {'temporal_type': 'event'})

    console.print(f"Found {len(event_files)} event-based files:")
    for item in event_files:
        filename = item['filename']
        # Label the event by the first keyword found in the filename
        if 'heatwave' in filename:
            event_type = "🔥 Heat Wave"
        elif 'hurricane' in filename:
            event_type = "🌀 Hurricane Season"
        elif 'drought' in filename:
            event_type = "🏜️ Drought Event"
        elif 'blocking' in filename:
            event_type = "🌪️ Atmospheric Blocking"
        else:
            event_type = "📊 Climate Event"

        console.print(f"  {event_type}: {filename}")

    console.print("[blue]Equivalent Tellus command:[/blue]")
    console.print("[dim]tellus archive extract climate_simulation \\")
    console.print("[dim]  --patterns '*heatwave*,*hurricane*,*drought*' \\")
    console.print("[dim]  --content-types diagnostic,output \\")
    console.print("[dim]  --location events_analysis[/dim]")

    # Scenario 4: Multi-Frequency Temporal Analysis
    console.print("\n[bold green]Scenario 4: Multi-Frequency Analysis (Q1 2022)[/bold green]")
    console.print("[dim]Goal: Get all temporal frequencies for first quarter analysis[/dim]")

    q1_2022_criteria = {
        'year': 2022,
        'month': [1, 2, 3]  # Q1 months
    }

    q1_files = filter_files_by_datetime(all_files, q1_2022_criteria)

    # Also get winter seasonal data (DJF includes Jan-Feb)
    winter_2022 = filter_files_by_datetime(all_files, {'year': 2022, 'season': 'DJF'})

    # And spring seasonal data (MAM includes March)
    spring_2022 = filter_files_by_datetime(all_files, {'year': 2022, 'season': 'MAM'})

    console.print(f"Q1 2022 monthly files: {len(q1_files)}")
    console.print(f"Winter 2022 seasonal: {len(winter_2022)}")
    console.print(f"Spring 2022 seasonal: {len(spring_2022)}")

    # Count Q1 files per component, taken as the filename text before the first dot
    q1_by_component = {}
    for item in q1_files:
        component = item['filename'].split('.')[0]  # cam, clm, pop, etc.
        q1_by_component[component] = q1_by_component.get(component, 0) + 1

    console.print("Q1 2022 files by component:")
    for component, count in q1_by_component.items():
        console.print(f"  {component.upper()}: {count} files")

    console.print("[blue]Equivalent Tellus command:[/blue]")
    console.print("[dim]tellus archive extract climate_simulation \\")
    console.print("[dim]  --date-pattern '%Y-%m' \\")
    console.print("[dim]  --date-range '2022-01:2022-03' \\")
    console.print("[dim]  --include-seasonal \\")
    console.print("[dim]  --location q1_2022_analysis[/dim]")

    return {
        'el_nino_files': len(el_nino_files),
        'summer_comparison': len(summer_2020) + len(summer_2023),
        'event_files': len(event_files),
        'q1_files': len(q1_files) + len(winter_2022) + len(spring_2022)
    }

# Execute the scenarios; the result maps scenario key -> number of files selected
scenario_results = advanced_datetime_filtering_scenarios()

# Tabulate each scenario's share of the full file listing
console.print("\n[bold blue]📊 Extraction Efficiency Summary[/bold blue]")
total_files = len(all_files)

efficiency_table = Table(title="DateTime Filtering Efficiency")
for header, options in (
    ("Scenario", {'style': "cyan"}),
    ("Files Selected", {'justify': "right", 'style': "green"}),
    ("% of Total", {'justify': "right", 'style': "yellow"}),
    ("Data Reduction", {'justify': "right", 'style': "magenta"}),
):
    efficiency_table.add_column(header, **options)

for scenario, count in scenario_results.items():
    percentage = (count / total_files) * 100
    reduction = 100 - percentage
    row_label = scenario.replace('_', ' ').title()

    efficiency_table.add_row(row_label, str(count), f"{percentage:.1f}%", f"{reduction:.1f}%")

console.print(efficiency_table)

console.print(f"\n[green]💾 Storage Savings:[/green] Instead of downloading {total_files} files, ")
console.print("you can target specific periods and reduce data transfer by 60-95%!")

## Step 3: Creating DateTime-Filtered Archives

Now let's create actual archives with datetime filtering applied:

In [None]:
def create_datetime_filtered_archive(source_dir, archive_path, datetime_filter, description):
    """
    Create an archive with datetime filtering applied.
    This demonstrates selective temporal archiving.

    All ``*.nc`` files below ``source_dir`` are passed through
    ``filter_files_by_datetime``; the matches are written to a gzip-compressed
    tar file at ``archive_path`` under their path relative to ``source_dir``.
    A JSON metadata file describing the selection is written next to it.

    Args:
        source_dir: Directory (``Path``) searched recursively for ``*.nc`` files.
        archive_path: Destination ``Path`` of the ``.tar.gz`` archive.
        datetime_filter: Criteria dict forwarded to ``filter_files_by_datetime``
            and recorded in the metadata.
        description: Free-text description printed and stored in the metadata.

    Returns:
        tuple: ``(archive_path, metadata_path, number_of_files_archived)``.
        ``metadata_path`` is ``archive_path.with_suffix('.metadata.json')``, so
        ``name.tar.gz`` yields ``name.tar.metadata.json``.
    """
    # Imported here so this cell does not rely on an earlier cell having
    # imported tarfile.
    import tarfile

    all_files = list(source_dir.rglob('*.nc'))
    filtered_files = filter_files_by_datetime(all_files, datetime_filter)

    console.print(f"[blue]Creating datetime-filtered archive: {archive_path.name}[/blue]")
    console.print(f"[dim]{description}[/dim]")
    console.print(f"[dim]Filter criteria: {datetime_filter}[/dim]")

    # Create the archive, recording one metadata entry per member as it is added
    archived_files = []

    with tarfile.open(archive_path, "w:gz") as tar:
        for item in filtered_files:
            file_path = item['file']
            rel_path = file_path.relative_to(source_dir)

            tar.add(file_path, arcname=str(rel_path))

            archived_files.append({
                'path': str(rel_path),
                'size': file_path.stat().st_size,
                'datetime_info': item['datetime_info']
            })

    # An empty source directory would otherwise raise ZeroDivisionError here;
    # report a ratio of 0.0 and still produce the (empty) archive and metadata.
    selection_ratio = len(filtered_files) / len(all_files) if all_files else 0.0

    # Create metadata
    archive_metadata = {
        'metadata_version': '1.0',
        'created_at': datetime.now().isoformat(),
        'description': description,
        'datetime_filter': datetime_filter,
        'temporal_selection': {
            'total_files_considered': len(all_files),
            'files_selected': len(filtered_files),
            'selection_ratio': selection_ratio,
            'temporal_coverage': extract_temporal_coverage(filtered_files)
        },
        'archive_contents': archived_files
    }

    metadata_path = archive_path.with_suffix('.metadata.json')
    metadata_path.write_text(json.dumps(archive_metadata, indent=2))

    return archive_path, metadata_path, len(filtered_files)

def extract_temporal_coverage(filtered_files):
    """
    Summarise which years, months, seasons and temporal types occur in a selection.

    Each element of ``filtered_files`` must provide a ``'datetime_info'``
    mapping; the optional keys ``'year'``, ``'month'``, ``'season'`` and
    ``'type'`` are collected when present.

    Returns a dict with the keys ``'years'``, ``'months'``, ``'seasons'`` and
    ``'temporal_types'``, each holding the sorted distinct values found.
    """

    # Read the input exactly once so one-shot iterables are handled too.
    infos = [entry['datetime_info'] for entry in filtered_files]

    def distinct_sorted(field):
        # Distinct values of `field` across all infos that define it.
        return sorted({info[field] for info in infos if field in info})

    return {
        'years': distinct_sorted('year'),
        'months': distinct_sorted('month'),
        'seasons': distinct_sorted('season'),
        'temporal_types': distinct_sorted('type')
    }

# Output directory that will hold every datetime-filtered archive
datetime_archives_dir = tutorial_dir / "datetime_filtered_archives"
datetime_archives_dir.mkdir(exist_ok=True)

console.print("\n[bold blue]📦 Creating DateTime-Filtered Archives[/bold blue]")
console.print("=" * 55)


def _report_archive(archive_path, file_count):
    """Print the name, file count and on-disk size (in MB) of a newly created archive."""
    size_mb = archive_path.stat().st_size / (1024*1024)
    console.print(f"  ✅ Created: {archive_path.name}")
    console.print(f"  📊 Files: {file_count}")
    console.print(f"  💾 Size: {size_mb:.1f} MB")


# Archive 1: summer (JJA) of 2023
console.print("\n[cyan]1. Creating 'Heat Wave Summer 2023' archive...[/cyan]")
heatwave_archive, heatwave_metadata, heatwave_count = create_datetime_filtered_archive(
    source_dir=dataset_dir,
    archive_path=datetime_archives_dir / "heatwave_summer_2023.tar.gz",
    datetime_filter={'year': 2023, 'month': [6, 7, 8]},
    description="Summer 2023 heat wave analysis - monthly and daily data",
)
_report_archive(heatwave_archive, heatwave_count)

# Archive 2: everything from 2023 through 2024
console.print("\n[cyan]2. Creating 'El Niño Period 2023-2024' archive...[/cyan]")
elnino_archive, elnino_metadata, elnino_count = create_datetime_filtered_archive(
    source_dir=dataset_dir,
    archive_path=datetime_archives_dir / "el_nino_2023_2024.tar.gz",
    datetime_filter={'year_range': (2023, 2024)},
    description="Complete El Niño period analysis - all frequencies and components",
)
_report_archive(elnino_archive, elnino_count)

# Archive 3: DJF season files from all years
console.print("\n[cyan]3. Creating 'Winter Seasons Multi-Year' archive...[/cyan]")
winter_archive, winter_metadata, winter_count = create_datetime_filtered_archive(
    source_dir=dataset_dir,
    archive_path=datetime_archives_dir / "winter_seasons_multiyear.tar.gz",
    datetime_filter={'season': 'DJF'},
    description="All winter seasons (DJF) for multi-year winter climate analysis",
)
_report_archive(winter_archive, winter_count)

# Archive 4: restart-type files from 2022 through 2024
console.print("\n[cyan]4. Creating 'Restart Files 2022-2024' archive...[/cyan]")
restart_archive, restart_metadata, restart_count = create_datetime_filtered_archive(
    source_dir=dataset_dir,
    archive_path=datetime_archives_dir / "restart_files_2022_2024.tar.gz",
    datetime_filter={'year_range': (2022, 2024), 'temporal_type': 'restart'},
    description="Restart files for recent years - simulation continuation capability",
)
_report_archive(restart_archive, restart_count)

# Archive 5: event-type files
console.print("\n[cyan]5. Creating 'Climate Events Collection' archive...[/cyan]")
events_archive, events_metadata, events_count = create_datetime_filtered_archive(
    source_dir=dataset_dir,
    archive_path=datetime_archives_dir / "climate_events_collection.tar.gz",
    datetime_filter={'temporal_type': 'event'},
    description="Collection of specific climate events with irregular timing",
)
_report_archive(events_archive, events_count)

## Step 4: Comparing Archive Strategies

Let's compare our datetime-filtered archives with traditional approaches:

In [None]:
# Compare datetime filtering strategies
console.print("\n[bold blue]📊 DateTime Archive Strategy Comparison[/bold blue]")
console.print("=" * 60)

# Baseline: number of NetCDF files in the dataset and their combined size in MB
original_files = len(list(dataset_dir.rglob('*.nc')))
original_size = sum(f.stat().st_size for f in dataset_dir.rglob('*.nc')) / (1024*1024)


def _selective_archive_row(name, file_count, archive_path, purpose, use_case):
    """Build one comparison entry for a selective archive, measured against the baseline."""
    return {
        'name': name,
        'files': file_count,
        'size_mb': archive_path.stat().st_size / (1024*1024),
        'purpose': purpose,
        'use_case': use_case,
        'efficiency': f"{(1 - file_count/original_files)*100:.1f}%"
    }


# First entry is the full-dataset baseline; the rest are the selective archives
archive_comparisons = [
    {
        'name': 'Complete Dataset',
        'files': original_files,
        'size_mb': original_size,
        'purpose': 'Everything (baseline)',
        'use_case': 'Full backup, comprehensive analysis',
        'efficiency': '0%'
    },
    _selective_archive_row(
        'Heat Wave Summer 2023', heatwave_count, heatwave_archive,
        'Seasonal extreme analysis', 'Heat wave research, summer climate'
    ),
    _selective_archive_row(
        'El Niño 2023-2024', elnino_count, elnino_archive,
        'Multi-year phenomenon study', 'ENSO research, teleconnections'
    ),
    _selective_archive_row(
        'Winter Seasons Multi-Year', winter_count, winter_archive,
        'Seasonal climatology', 'Winter climate patterns, snow studies'
    ),
    _selective_archive_row(
        'Restart Files 2022-2024', restart_count, restart_archive,
        'Simulation continuation', 'Model restarts, experiment extension'
    ),
    _selective_archive_row(
        'Climate Events', events_count, events_archive,
        'Extreme event catalog', 'Case studies, event attribution'
    ),
]

# Render the comparison as a table
comparison_table = Table(title="DateTime Archive Strategy Comparison")
for heading, column_options in (
    ("Archive Strategy", {'style': "cyan"}),
    ("Files", {'justify': "right", 'style': "green"}),
    ("Size (MB)", {'justify': "right", 'style': "yellow"}),
    ("Data Reduction", {'justify': "right", 'style': "magenta"}),
    ("Best Use Case", {'style': "dim"}),
):
    comparison_table.add_column(heading, **column_options)

for archive in archive_comparisons:
    comparison_table.add_row(
        archive['name'],
        str(archive['files']),
        f"{archive['size_mb']:.1f}",
        archive['efficiency'],
        archive['use_case']
    )

console.print(comparison_table)

# Storage and transfer efficiency, judged over the selective archives only
console.print("\n[bold green]💾 Storage & Transfer Efficiency[/bold green]")

selective_archives = archive_comparisons[1:]
most_efficient = min(selective_archives, key=lambda x: x['files'])
least_efficient = max(selective_archives, key=lambda x: x['files'])

console.print(f"Most targeted: {most_efficient['name']} ({most_efficient['files']} files, {most_efficient['size_mb']:.1f} MB)")
console.print(f"Most comprehensive: {least_efficient['name']} ({least_efficient['files']} files, {least_efficient['size_mb']:.1f} MB)")

total_selective_size = sum(entry['size_mb'] for entry in selective_archives)
console.print(f"\nAll selective archives combined: {total_selective_size:.1f} MB")
console.print(f"vs. Complete dataset: {original_size:.1f} MB")

if total_selective_size >= original_size:
    console.print("[yellow]Multiple selective archives use more space than original (expected for comprehensive coverage)[/yellow]")
else:
    savings = ((original_size - total_selective_size) / original_size) * 100
    console.print(f"[green]Net storage savings: {savings:.1f}%[/green]")

# Real-world impact scenarios
impact_scenarios = Panel(
    "[bold green]Real-World Impact Scenarios:[/bold green]\n\n"
    "[cyan]🚀 HPC Transfer:[/cyan] Heat wave archive (5 MB) vs full dataset (100+ MB) - 95% less transfer time\n"
    "[cyan]☁️ Cloud Storage:[/cyan] Event-based archives reduce cloud costs by storing only relevant data\n"
    "[cyan]🤝 Collaboration:[/cyan] Send El Niño data (40 MB) instead of 5-year dataset (100+ MB)\n"
    "[cyan]📱 Bandwidth Limited:[/cyan] Download winter seasons only for Arctic research\n"
    "[cyan]💿 Archival Media:[/cyan] Separate critical restarts from analysis data for different storage tiers",
    title="🌍 Climate Science Use Cases",
    border_style="green"
)

# Pass the Panel object itself to console.print: interpolating it into an
# f-string would print its object repr instead of rendering the panel.
console.print()  # blank line before the panel
console.print(impact_scenarios)

## Step 5: Examining Archive Contents

Let's examine the contents of our datetime-filtered archives to understand what was selected:

In [None]:
def examine_datetime_archive_contents(metadata_path):
    """
    Examine the contents of a datetime-filtered archive.

    Reads the JSON metadata file at ``metadata_path`` and prints the archive
    description, the filter used, the selection statistics, the temporal
    coverage and a per-component file listing (at most two sample files per
    component).

    Args:
        metadata_path: ``Path`` to a metadata JSON file containing the keys
            ``description``, ``datetime_filter``, ``temporal_selection`` and
            ``archive_contents``.
    """

    metadata = json.loads(metadata_path.read_text())

    archive_name = metadata_path.stem.replace('.metadata', '')
    # Metadata for 'name.tar.gz' is stored as 'name.tar.metadata.json', which
    # leaves a trailing '.tar' here; strip it so the heading does not end in ".Tar".
    for archive_suffix in ('.tar.gz', '.tar'):
        if archive_name.endswith(archive_suffix):
            archive_name = archive_name[:-len(archive_suffix)]
            break
    console.print(f"\n[bold cyan]📋 Archive: {archive_name.replace('_', ' ').title()}[/bold cyan]")
    console.print("=" * 55)

    # Description and filter criteria
    console.print(f"[blue]Description:[/blue] {metadata['description']}")
    console.print(f"[blue]DateTime Filter:[/blue] {metadata['datetime_filter']}")

    # Temporal selection statistics
    selection = metadata['temporal_selection']
    console.print(f"[green]Files Selected:[/green] {selection['files_selected']} of {selection['total_files_considered']}")
    console.print(f"[green]Selection Ratio:[/green] {selection['selection_ratio']:.1%}")

    # Temporal coverage; months and seasons are only shown when non-empty
    coverage = selection['temporal_coverage']
    console.print(f"[yellow]Years Covered:[/yellow] {', '.join(map(str, coverage['years']))}")
    if coverage['months']:
        month_names = [calendar.month_abbr[m] for m in coverage['months']]
        console.print(f"[yellow]Months Covered:[/yellow] {', '.join(month_names)}")
    if coverage['seasons']:
        console.print(f"[yellow]Seasons Covered:[/yellow] {', '.join(coverage['seasons'])}")
    console.print(f"[yellow]Data Types:[/yellow] {', '.join(coverage['temporal_types'])}")

    # Group files by component, taken as the filename text before the first dot
    files_by_component = {}
    for file_info in metadata['archive_contents']:
        filename = Path(file_info['path']).name
        component = filename.split('.')[0]  # Extract component (cam, clm, pop, etc.)
        files_by_component.setdefault(component, []).append(file_info)

    if files_by_component:
        console.print("\n[bold]📄 Files by Model Component:[/bold]")
        for component in sorted(files_by_component):
            files = files_by_component[component]
            total_size = sum(f['size'] for f in files) / (1024*1024)

            console.print(f"  [cyan]{component.upper()}:[/cyan] {len(files)} files ({total_size:.1f} MB)")

            # Show the first two files as a sample
            for file_info in files[:2]:
                path = Path(file_info['path'])
                dt_info = file_info['datetime_info']
                console.print(f"    • {path.name} (Type: {dt_info['type']})")

            if len(files) > 2:
                console.print(f"    [dim]... and {len(files) - 2} more files[/dim]")

# Print a contents report for each selected metadata file
console.print("\n[bold blue]🔍 DateTime Archive Contents Analysis[/bold blue]")

# Metadata files to report on, in display order
archives_to_examine = [heatwave_metadata, elnino_metadata, winter_metadata, events_metadata]

for metadata_path in archives_to_examine:
    examine_datetime_archive_contents(metadata_path)

## Advanced DateTime Patterns and Techniques

Let's explore more advanced datetime pattern matching techniques used in Earth Science:

In [None]:
console.print("\n[bold blue]🎯 Advanced DateTime Pattern Techniques[/bold blue]")
console.print("=" * 55)

# Catalogue of advanced techniques: each entry carries a display name, a short
# description, the pattern idea, an example command string and a use case.
advanced_patterns = [
    {
        'name': 'Multi-Model Ensemble',
        'description': 'Extract same time period from multiple model runs',
        'pattern': 'YYYY-MM with model variant filtering',
        'tellus_example': 'tellus archive extract ensemble_runs --date-pattern "%Y-%m" --date "2023-07" --patterns "*cam*.nc,*cesm*.nc"',
        'use_case': 'Model intercomparison studies'
    },
    {
        'name': 'Climate Indices Windows',
        'description': 'Extract data during specific climate index phases',
        'pattern': 'Conditional date ranges based on index values',
        'tellus_example': 'tellus archive extract climate_data --date-ranges "2015-10:2016-05,2023-09:2024-04" --description "El Nino events"',
        'use_case': 'Teleconnection studies, index-based analysis'
    },
    {
        'name': 'Seasonal Phase Selection',
        'description': 'Extract specific parts of seasons (early/late)',
        'pattern': 'Month sub-ranges within seasons',
        'tellus_example': 'tellus archive extract seasonal_data --date-pattern "%Y-%m" --date-list "2020-06,2020-07,2021-06,2021-07" --description "Early summer"',
        'use_case': 'Phenology studies, seasonal transition analysis'
    },
    {
        'name': 'Event Duration Matching',
        'description': 'Extract files with embedded event duration information',
        'pattern': 'Start_date_to_end_date pattern matching',
        'tellus_example': 'tellus archive extract event_data --patterns "*2023_07_*to*2023_08_*.nc" --content-types diagnostic',
        'use_case': 'Extreme event analysis, case studies'
    },
    {
        'name': 'Hindcast Time Windows',
        'description': 'Extract verification periods for forecast validation',
        'pattern': 'Forecast initialization + lead time patterns',
        'tellus_example': 'tellus archive extract hindcast_data --date-pattern "init_%Y%m%d_lead_%j" --date-range "20230601:20230831"',
        'use_case': 'Forecast verification, predictability studies'
    },
    {
        'name': 'Composite Analysis Periods',
        'description': 'Extract same calendar periods across multiple years',
        'pattern': 'Day-of-year matching across years',
        'tellus_example': 'tellus archive extract composite_data --day-of-year-range "152:244" --years "2020,2021,2022,2023"',
        'use_case': 'Climate composites, recurring pattern analysis'
    }
]

# Overview table: one row per technique (name, description, use case)
advanced_table = Table(title="Advanced DateTime Pattern Techniques")
for heading, colour in (("Technique", "cyan"), ("Description", "green"), ("Use Case", "yellow")):
    advanced_table.add_column(heading, style=colour)

for pattern in advanced_patterns:
    advanced_table.add_row(*(pattern[field] for field in ('name', 'description', 'use_case')))

console.print(advanced_table)

# Numbered list of the example command for each technique
console.print("\n[bold blue]💻 Example Commands for Advanced Techniques[/bold blue]")

for i, pattern in enumerate(advanced_patterns, start=1):
    console.print(f"\n[cyan]{i}. {pattern['name']}:[/cyan]")
    console.print(f"[dim]{pattern['tellus_example']}[/dim]")

# Best practices for datetime filtering
best_practices = Panel(
    "[bold green]DateTime Filtering Best Practices:[/bold green]\n\n"
    "[cyan]🎯 Start Specific:[/cyan] Begin with narrow time ranges, expand as needed\n"
    "[cyan]📅 Know Your Data:[/cyan] Understand file naming conventions before filtering\n"
    "[cyan]🔍 Test First:[/cyan] Use small test extractions to verify patterns work\n"
    "[cyan]📖 Document Criteria:[/cyan] Record why specific dates/periods were chosen\n"
    "[cyan]🔄 Consider Overlaps:[/cyan] Some periods may span multiple files (e.g., DJF)\n"
    "[cyan]⚡ Combine Filters:[/cyan] Use datetime + content type filtering for precision\n"
    "[cyan]🌍 Think Scientifically:[/cyan] Choose periods that match your research questions",
    title="📋 Best Practices",
    border_style="green"
)

# Pass the Panel object itself to console.print: interpolating it into an
# f-string would print its object repr instead of rendering the panel.
console.print()  # blank line before the panel
console.print(best_practices)

## Common DateTime Filtering Mistakes

Let's identify and learn from common mistakes in datetime filtering:

In [None]:
# Catalogue of frequent datetime filtering mistakes. Each entry names the
# mistake and gives an example, the underlying problem, a solution and a
# recommended approach.
datetime_mistakes = [
    {
        'mistake': '❌ Ignoring Seasonal Boundaries',
        'example': 'Extracting "Winter 2023" as Jan-Mar 2023 only',
        'problem': 'Meteorological winter (DJF) spans Dec 2022 - Feb 2023',
        'solution': 'Understand climate season definitions before filtering',
        'better_approach': 'Use --season DJF --year 2023 or --date-range "2022-12:2023-02"'
    },
    {
        'mistake': '❌ Mismatching File Frequency',
        'example': 'Looking for daily data with monthly date patterns',
        'problem': 'Daily files may be named differently than monthly files',
        'solution': 'Examine actual filenames before creating patterns',
        'better_approach': 'Verify file naming: cam.h0.YYYY-MM.nc (monthly) vs cam.h1.YYYY-MM.nc (daily)'
    },
    {
        'mistake': '❌ Over-Precise Date Filtering',
        'example': 'Filtering for exact dates in climate data',
        'problem': 'Climate data often represents periods, not exact dates',
        'solution': 'Use date ranges and understand temporal averaging',
        'better_approach': 'Filter by month/season rather than specific days for monthly data'
    },
    {
        'mistake': '❌ Forgetting Time Zone Context',
        'example': 'Assuming all model output uses the same time reference',
        'problem': 'Different models may use different time conventions (UTC, local, etc.)',
        'solution': 'Check model documentation for time reference standards',
        'better_approach': 'Include time reference info in archive metadata'
    },
    {
        'mistake': '❌ Missing Leap Year Considerations',
        'example': 'Filtering Feb 29 data without checking calendar type',
        'problem': 'Some models use no-leap calendars, others use standard calendars',
        'solution': 'Understand model calendar conventions before date filtering',
        'better_approach': 'Check NetCDF time coordinate attributes for calendar type'
    },
    {
        'mistake': '❌ Inconsistent Date Formats',
        'example': 'Mixing YYYY-MM-DD and YYYYMMDD patterns in same filter',
        'problem': 'Different date formats require different extraction patterns',
        'solution': 'Standardize on one format or use multiple patterns',
        'better_approach': 'Use flexible patterns that match multiple formats: --patterns "*2023-06*,*202306*"'
    }
]

console.print("\n[bold blue]⚠️  Common DateTime Filtering Mistakes[/bold blue]")
console.print("=" * 55)

# Print each mistake as a numbered heading followed by its labelled details
for i, mistake in enumerate(datetime_mistakes, start=1):
    console.print(f"\n[bold red]{i}. {mistake['mistake']}[/bold red]")
    for label, field in (
        ("[yellow]Example:[/yellow]", 'example'),
        ("[red]Problem:[/red]", 'problem'),
        ("[green]Solution:[/green]", 'solution'),
        ("[blue]Better Approach:[/blue]", 'better_approach'),
    ):
        console.print(f"{label} {mistake[field]}")

# Create a decision tree for datetime filtering
decision_tree = Panel(
    "[bold green]DateTime Filtering Decision Tree:[/bold green]\n\n"
    "[cyan]📊 What type of analysis?[/cyan]\n"
    "├── 🌡️ Extreme events → Use event-based patterns or specific date ranges\n"
    "├── 📈 Long-term trends → Use annual data with year ranges\n"
    "├── 🌿 Seasonal studies → Use seasonal patterns (DJF, MAM, JJA, SON)\n"
    "├── 🔄 Interannual variability → Use monthly data across multiple years\n"
    "└── ⚡ Model validation → Use specific forecast/initialization periods\n\n"
    "[cyan]🎯 What temporal resolution?[/cyan]\n"
    "├── 📅 Sub-daily → Look for hourly/3-hourly patterns\n"
    "├── 📆 Daily → Check for daily naming conventions\n"
    "├── 📊 Monthly → Use YYYY-MM patterns\n"
    "├── 🌱 Seasonal → Use season abbreviations\n"
    "└── 📈 Annual → Use YYYY patterns\n\n"
    "[cyan]💾 How much data can you handle?[/cyan]\n"
    "├── 🖥️ Local analysis → Can use broader date ranges\n"
    "├── ☁️ Cloud processing → Be selective with dates\n"
    "├── 📱 Limited bandwidth → Use very specific periods\n"
    "└── 💿 Archive storage → Consider multiple selective archives",
    title="🌳 Decision Tree",
    border_style="green"
)

# Pass the Panel object itself to console.print: interpolating it into an
# f-string would print its object repr instead of rendering the panel.
console.print()  # blank line before the panel
console.print(decision_tree)

## Cleanup and Summary

In [None]:
# Cleanup tutorial files
import shutil

console.print("\n[bold blue]🧹 Cleaning up tutorial files...[/bold blue]")

# Remove the temporary tutorial workspace. Notebook cells are often re-run, so
# a workspace that is already gone is reported instead of aborting the cell
# with FileNotFoundError. Any other removal error still propagates.
try:
    shutil.rmtree(tutorial_dir)
except FileNotFoundError:
    console.print(f"[yellow]Nothing to clean up (already removed): {tutorial_dir}[/yellow]")
else:
    console.print(f"[green]✅ Cleaned up: {tutorial_dir}[/green]")

# Tutorial summary.
# The panel text is assembled from sections: lines inside a section are joined
# with a single newline, and sections are separated by one blank line.
summary_sections = [
    [
        "[bold green]🎓 Tutorial 3 Complete - DateTime Filtering Mastery![/bold green]",
    ],
    [
        "[cyan]Key Skills Mastered:[/cyan]",
        "✅ Understanding temporal patterns in climate data",
        "✅ Extracting specific time periods efficiently",
        "✅ Creating targeted datetime-filtered archives",
        "✅ Handling multiple temporal frequencies",
        "✅ Avoiding common datetime filtering pitfalls",
    ],
    [
        "[yellow]Real-World Applications:[/yellow]",
        "• 🌡️ Heat wave studies using summer-only data",
        "• 🌊 El Niño analysis across multi-year periods",
        "• ❄️ Winter climate research with seasonal filtering",
        "• ⚡ Extreme event case studies with event-based extraction",
        "• 🔄 Model restart packages with specific temporal checkpoints",
    ],
    [
        "[blue]Data Efficiency Achieved:[/blue]",
        "• Reduced data transfer by 80-95% through targeted extraction",
        "• Faster analysis with relevant data only",
        "• Improved collaboration through focused data sharing",
    ],
    [
        "[magenta]Next: Tutorial 4 - Fragment Assembly[/magenta]",
        "Learn to combine multiple archives into complete datasets",
    ],
]

summary = Panel(
    "\n\n".join("\n".join(section) for section in summary_sections),
    title="🎉 Tutorial Summary",
    border_style="green",
)

console.print(summary)

# Quick reference commands.
# Each entry is a green "# comment" caption followed by the matching
# `tellus archive extract` command line; entries are separated by blank lines.
quick_reference = Panel(
    "[bold cyan]Quick Reference - Key DateTime Commands:[/bold cyan]\n\n"
    "[green]# Extract specific year[/green]\n"
    "tellus archive extract my_simulation --date-pattern '%Y' --date '2023'\n\n"
    "[green]# Extract date range[/green]\n"
    "tellus archive extract my_simulation --date-range '2023-06:2023-08'\n\n"
    "[green]# Extract specific months across years[/green]\n"
    "tellus archive extract my_simulation --date-pattern '%Y-%m' --date-list '2020-07,2021-07,2022-07'\n\n"
    "[green]# Extract seasonal data[/green]\n"
    "tellus archive extract my_simulation --patterns '*DJF*.nc' --content-types output\n\n"
    "[green]# Combine datetime + content filtering[/green]\n"
    "tellus archive extract my_simulation --date-range '2023-01:2023-12' --content-types output,diagnostic",
    title="📖 Quick Reference",
    border_style="blue"
)

# Emit the leading blank line separately and pass the Panel object itself to
# the console. Interpolating the Panel into an f-string would convert it with
# str(), printing the object's default repr instead of the rendered panel.
console.print()
console.print(quick_reference)

# Closing pointer to the next notebook: heading, short teaser, then the file name.
closing_messages = (
    "\n[bold blue]📚 Ready for Next Tutorial?[/bold blue]",
    "Tutorial 4 will teach you fragment assembly - how to intelligently combine multiple archive pieces into complete datasets. This is especially powerful for reconstructing long-term simulations from temporal or thematic fragments.",
    "\n[dim]Continue to: archive-tutorial-04-fragment-assembly.ipynb[/dim]",
)
for message in closing_messages:
    console.print(message)