In [2]:
! pip install --quiet cfgrib xarray pandas pyarrow

In [None]:
import os

# VastDB connection settings for this notebook.
# Each value can be overridden through the VASTDB_ENDPOINT / VASTDB_ACCESS_KEY /
# VASTDB_SECRET_KEY environment variables (the same names the interactive
# configuration helper further down reads).
# NOTE(review): the fallback literals below are credentials committed to the
# notebook. They are kept only so existing runs keep working; rotate the keys
# and delete the fallbacks once the environment variables are in place.
endpoint = os.environ.get('VASTDB_ENDPOINT') or 'http://172.200.202.2:80'
access_key = os.environ.get('VASTDB_ACCESS_KEY') or 'O6UZCRYPS9OPWUD1DYF8'
secret_key = os.environ.get('VASTDB_SECRET_KEY') or 'HnF34+7JkYFusmbU++Cv0/YLUiEpWGBRZTjmUsuu'



In [26]:
#!/usr/bin/env python
# coding: utf-8

import os
import sys
import time
import logging
import requests
import tempfile
import shutil
from pathlib import Path
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re

# Data processing imports
import cfgrib
import xarray as xr
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import vastdb

# Configure logging
# basicConfig sets up the root logger: INFO and above, each record prefixed
# with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every function and method in this cell.
logger = logging.getLogger(__name__)

class GRIB2DataIngester:
    """GRIB2 Data Discovery and Bulk Ingestion System for VastDB"""
    
    def __init__(self, vastdb_config: Dict[str, str], temp_dir: Optional[str] = None):
        self.vastdb_config = vastdb_config
        self.temp_dir = Path(temp_dir) if temp_dir else Path(tempfile.mkdtemp())
        self.temp_dir.mkdir(exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'GRIB2-Bulk-Ingester/1.0'})
        self.vastdb_session = None
        
        # Statistics tracking
        self.stats = {
            'files_discovered': 0,
            'files_downloaded': 0,
            'files_processed': 0,
            'files_ingested': 0,
            'bytes_downloaded': 0,
            'bytes_processed': 0,
            'start_time': None,
            'errors': []
        }
    
    def connect_to_vastdb(self):
        """Connect to VastDB"""
        try:
            self.vastdb_session = vastdb.connect(
                endpoint=self.vastdb_config['endpoint'],
                access=self.vastdb_config['access_key'],
                secret=self.vastdb_config['secret_key']
            )
            logger.info("✓ Connected to VastDB")
            return True
        except Exception as e:
            logger.error(f"✗ Failed to connect to VastDB: {e}")
            return False
    
    def discover_grib2_sources(self) -> List[Dict]:
        """Discover available GRIB2 data sources"""
        sources = [
            {
                'name': 'GEM Global 66km', 
                'url': 'https://dd.weather.gc.ca/model_gem_global/66km/grib2/lat_lon',
                'description': 'Lower resolution global model (more stable)',
                'estimated_file_size_mb': 0.8
            },
            {
                'name': 'GEM Global 15km',
                'url': 'https://dd.weather.gc.ca/model_gem_global/15km/grib2/lat_lon',
                'description': 'High resolution global model',
                'estimated_file_size_mb': 2.5
            },
            {
                'name': 'HRDPS Continental 2.5km',
                'url': 'https://dd.weather.gc.ca/model_hrdps/continental/2.5km',
                'description': 'Very high resolution continental model',
                'estimated_file_size_mb': 8.0
            },
            {
                'name': 'Sample GRIB2 Files',
                'url': 'https://dd.weather.gc.ca/model_gem_global/25km/grib2/lat_lon',
                'description': 'Alternative global model data',
                'estimated_file_size_mb': 1.5
            }
        ]
        
        logger.info("🔍 Discovering available GRIB2 data sources...")
        available_sources = []
        
        for source in sources:
            try:
                logger.info(f"Checking {source['name']}...")
                
                # More lenient check - just see if we can access the URL
                response = self.session.get(source['url'], timeout=15)
                if response.status_code == 200:
                    # Quick check for actual files
                    file_count = self._quick_file_count(source['url'])
                    
                    # Be more generous - even 1 file is enough
                    if file_count >= 1:
                        source['available'] = True
                        source['estimated_files'] = max(file_count, 10)  # Minimum assumption
                        source['estimated_total_size_gb'] = (source['estimated_files'] * source['estimated_file_size_mb']) / 1024
                        available_sources.append(source)
                        logger.info(f"✓ {source['name']}: ~{source['estimated_files']} files (~{source['estimated_total_size_gb']:.1f} GB)")
                    else:
                        # Even if no files found, add with minimal estimate
                        source['available'] = True
                        source['estimated_files'] = 5
                        source['estimated_total_size_gb'] = (5 * source['estimated_file_size_mb']) / 1024
                        available_sources.append(source)
                        logger.info(f"⚠️ {source['name']}: Directory accessible, assuming ~5 files")
                else:
                    logger.info(f"✗ {source['name']}: HTTP {response.status_code}")
            except Exception as e:
                logger.info(f"✗ {source['name']}: Error - {e}")
        
        # Fallback: If no sources found, create a test source with known working file
        if not available_sources:
            logger.info("No sources found via directory browsing, trying direct file access...")
            fallback_source = {
                'name': 'Direct GRIB2 File',
                'url': 'https://dd.weather.gc.ca/model_gem_global/15km/grib2/lat_lon',
                'description': 'Direct file access method',
                'estimated_file_size_mb': 2.5,
                'available': True,
                'estimated_files': 1,
                'estimated_total_size_gb': 0.0025,
                'direct_files': [
                    'https://dd.weather.gc.ca/model_gem_global/15km/grib2/lat_lon/00/000/CMC_glb_ABSV_ISBL_200_latlon.15x.15_2024121100_P000.grib2'
                ]
            }
            
            # Test if we can access the known file
            try:
                test_response = self.session.head(fallback_source['direct_files'][0], timeout=10)
                if test_response.status_code == 200:
                    available_sources.append(fallback_source)
                    logger.info("✓ Found working direct file access")
            except:
                pass
        
        return available_sources
    
    def _quick_file_count(self, base_url: str, max_dirs: int = 3) -> int:
        """Quick estimation of available files"""
        try:
            # Check a few directories to estimate total files
            dirs = self._get_directories(base_url)
            total_files = 0
            
            for i, dir_name in enumerate(dirs[:max_dirs]):
                dir_url = f"{base_url}/{dir_name}/"
                subdirs = self._get_directories(dir_url)
                
                for j, subdir in enumerate(subdirs[:2]):  # Check first 2 subdirs
                    subdir_url = f"{dir_url}{subdir}/"
                    files = self._get_grib2_files(subdir_url, limit=100)
                    total_files += len(files)
                    
                    if total_files > 500:  # Cap estimation
                        return min(total_files * (len(dirs) * len(subdirs)) // ((i+1) * (j+1)), 10000)
            
            # Extrapolate
            if total_files > 0 and len(dirs) > max_dirs:
                total_files = total_files * len(dirs) // max_dirs
            
            return min(total_files, 10000)  # Cap at reasonable number
            
        except Exception:
            return 0
    
    def _get_directories(self, url: str) -> List[str]:
        """Get directory listings"""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            dirs = []
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.endswith('/') and not href.startswith('..') and href != '/':
                    dirs.append(href.rstrip('/'))
            return dirs
        except:
            return []
    
    def _get_grib2_files(self, url: str, limit: int = 50) -> List[str]:
        """Get GRIB2 file listings"""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            files = []
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.endswith('.grib2'):
                    files.append(href)
                    if len(files) >= limit:
                        break
            return files
        except:
            return []
    
    def build_download_plan(self, sources: List[Dict], target_size_gb: float) -> Dict:
        """Build a download plan to reach target data size"""
        logger.info(f"📋 Building download plan for {target_size_gb} GB of data...")
        
        plan = {
            'target_size_gb': target_size_gb,
            'sources': [],
            'total_estimated_size_gb': 0,
            'total_estimated_files': 0
        }
        
        # Sort sources by file size (prefer smaller files for more diversity)
        sources.sort(key=lambda x: x['estimated_file_size_mb'])
        
        remaining_gb = target_size_gb
        
        for source in sources:
            if remaining_gb <= 0:
                break
            
            # Calculate how many files we need from this source
            files_needed = min(
                int((remaining_gb * 1024) / source['estimated_file_size_mb']),
                source['estimated_files'],
                2000  # Cap per source
            )
            
            if files_needed > 0:
                size_from_source = (files_needed * source['estimated_file_size_mb']) / 1024
                
                plan['sources'].append({
                    'name': source['name'],
                    'url': source['url'],
                    'files_to_download': files_needed,
                    'estimated_size_gb': size_from_source,
                    'estimated_file_size_mb': source['estimated_file_size_mb'],
                    'direct_files': source.get('direct_files', None)
                })
                
                plan['total_estimated_files'] += files_needed
                plan['total_estimated_size_gb'] += size_from_source
                remaining_gb -= size_from_source
        
        logger.info(f"📊 Plan: {plan['total_estimated_files']} files, {plan['total_estimated_size_gb']:.1f} GB from {len(plan['sources'])} sources")
        return plan
    
    def discover_files_from_source(self, source_url: str, max_files: int, direct_files: List[str] = None) -> List[str]:
        """Discover actual file URLs from a source"""
        logger.info(f"🔍 Discovering files from {source_url}...")
        
        # If we have direct files (fallback mode), use those
        if direct_files:
            logger.info(f"Using direct file list: {len(direct_files)} files")
            return direct_files[:max_files]
        
        file_urls = []
        dirs = self._get_directories(source_url)
        
        # If no directories found, try a more aggressive approach
        if not dirs:
            logger.info("No directories found, trying current date directories...")
            # Try recent dates
            today = datetime.now()
            for days_back in range(7):  # Try last 7 days
                test_date = today - timedelta(days=days_back)
                for hour in ['00', '06', '12', '18']:
                    test_dir = f"{test_date.strftime('%Y%m%d')}{hour}"
                    if test_dir not in dirs:
                        dirs.append(hour)  # Just try the hour directories
        
        for dir_name in dirs:
            if len(file_urls) >= max_files:
                break
                
            dir_url = f"{source_url}/{dir_name}/"
            subdirs = self._get_directories(dir_url)
            
            # If no subdirs, try common forecast hours
            if not subdirs:
                subdirs = ['000', '003', '006', '012', '024', '048']
            
            for subdir in subdirs:
                if len(file_urls) >= max_files:
                    break
                    
                subdir_url = f"{dir_url}{subdir}/"
                files = self._get_grib2_files(subdir_url, limit=max_files - len(file_urls))
                
                for file in files:
                    file_urls.append(f"{subdir_url}{file}")
                    if len(file_urls) >= max_files:
                        break
        
        logger.info(f"📁 Discovered {len(file_urls)} file URLs")
        self.stats['files_discovered'] += len(file_urls)
        return file_urls
    
    def download_file(self, url: str, batch_id: str = None) -> Tuple[Optional[str], Optional[Dict]]:
        """Download a single GRIB2 file and return metadata"""
        filename = Path(url).name
        local_path = self.temp_dir / filename
        download_start = datetime.utcnow()
        
        try:
            response = self.session.get(url, stream=True, timeout=120)
            response.raise_for_status()
            
            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            
            download_end = datetime.utcnow()
            file_size = local_path.stat().st_size
            download_duration = (download_end - download_start).total_seconds()
            
            self.stats['bytes_downloaded'] += file_size
            self.stats['files_downloaded'] += 1
            
            # Prepare download metadata
            metadata = {
                'source_url': url,
                'filename': filename,
                'file_size_bytes': file_size,
                'download_timestamp': download_start,
                'download_duration_seconds': download_duration,
                'batch_id': batch_id or f"batch_{int(datetime.utcnow().timestamp())}"
            }
            
            logger.debug(f"Downloaded {filename} ({file_size / 1024 / 1024:.1f} MB)")
            return str(local_path), metadata
            
        except Exception as e:
            logger.error(f"Failed to download {url}: {e}")
            self.stats['errors'].append(f"Download failed: {url} - {e}")
            return None, None
    
    def cleanup_files(self, paths: List[str]):
        """Clean up temporary files"""
        for path in paths:
            try:
                if os.path.exists(path):
                    os.remove(path)
            except Exception as e:
                logger.warning(f"Failed to remove {path}: {e}")
    
    # NOTE(review): this definition is shadowed. The class defines
    # process_grib2_to_parquet a second time further down (with extra
    # source_url/source_name parameters and a tuple return value), and the
    # later definition replaces this one when the class body is executed.
    # This version is therefore never called; it is a candidate for removal.
    def process_grib2_to_parquet(self, grib2_path: str) -> Optional[str]:
        """Process GRIB2 file to VastDB-compatible Parquet (superseded version).

        Reads the file with xarray's cfgrib engine, flattens it to a
        DataFrame, adds ``source_file`` and ``ingestion_timestamp`` columns,
        and writes a Parquet file next to the input.

        Returns:
            The Parquet path, or None when any step raises (the error is
            logged and recorded in ``stats['errors']``).
        """
        try:
            # Read GRIB2 file with explicit decode_timedelta to silence warning
            ds = xr.open_dataset(grib2_path, engine="cfgrib", 
                               backend_kwargs={'decode_timedelta': False})
            df = ds.to_dataframe().reset_index()
            
            # Add metadata
            df['source_file'] = Path(grib2_path).name
            df['ingestion_timestamp'] = datetime.utcnow()
            
            # Convert to PyArrow table
            table = pa.Table.from_pandas(df)
            
            # Fix VastDB incompatible types
            table = self._fix_vastdb_compatibility(table)
            
            # Save to parquet
            # Same path with the '.grib2' substring swapped for '.parquet'.
            parquet_path = grib2_path.replace('.grib2', '.parquet')
            pq.write_table(table, parquet_path)
            
            # bytes_processed counts the size of the written Parquet file.
            file_size = Path(parquet_path).stat().st_size
            self.stats['bytes_processed'] += file_size
            self.stats['files_processed'] += 1
            
            logger.debug(f"Processed {Path(grib2_path).name} to Parquet ({file_size / 1024 / 1024:.1f} MB)")
            return parquet_path
            
        except Exception as e:
            logger.error(f"Failed to process {grib2_path}: {e}")
            self.stats['errors'].append(f"Processing failed: {grib2_path} - {e}")
            return None
    
    def _fix_vastdb_compatibility(self, table: pa.Table) -> pa.Table:
        """Return ``table`` with every ``duration[ns]`` column cast to string.

        All other columns and their fields are carried over unchanged. The
        result is rebuilt from the column arrays with a new schema.
        """
        duration_ns = pa.duration("ns")
        source_schema = table.schema

        # First the target schema: duration fields become string fields.
        converted_fields = [
            pa.field(field.name, pa.string()) if field.type == duration_ns else field
            for field in source_schema
        ]
        target_schema = pa.schema(converted_fields)

        # Then the data, cast column by column where the type changed.
        converted_columns = [
            table.column(position).cast(pa.string())
            if field.type == duration_ns
            else table.column(position)
            for position, field in enumerate(source_schema)
        ]

        return pa.Table.from_arrays(converted_columns, schema=target_schema)
    
    def ingest_to_vastdb(self, parquet_path: str, bucket_name: str, schema_name: str, table_name: str, 
                        file_metadata: Dict = None) -> bool:
        """Ingest Parquet file to VastDB and save metadata"""
        try:
            # Read parquet
            table = pq.read_table(parquet_path)
            
            with self.vastdb_session.transaction() as tx:
                bucket = tx.bucket(bucket_name)
                schema = bucket.schema(schema_name, fail_if_missing=False) or bucket.create_schema(schema_name)
                
                # Create or get main data table
                db_table = schema.table(table_name, fail_if_missing=False)
                if not db_table:
                    db_table = schema.create_table(table_name, table.schema)
                else:
                    # Add any new columns
                    self._add_missing_columns(db_table, table.schema)
                
                # Insert data
                db_table.insert(table)
                
                # Save metadata to ingestion log table
                if file_metadata:
                    self._save_ingestion_metadata(schema, file_metadata, table.num_rows)
                
            self.stats['files_ingested'] += 1
            logger.debug(f"Ingested {Path(parquet_path).name} to VastDB")
            return True
            
        except Exception as e:
            logger.error(f"Failed to ingest {parquet_path}: {e}")
            self.stats['errors'].append(f"Ingestion failed: {parquet_path} - {e}")
            return False
    
    def _add_missing_columns(self, table, new_schema):
        """Add missing columns to existing table"""
        existing_fields = set(table.arrow_schema.names)
        new_fields = set(new_schema.names)
        
        for field_name in new_fields - existing_fields:
            field = new_schema.field(field_name)
            table.add_column(pa.schema([field]))
            logger.debug(f"Added column {field_name} to table")
    
    def _save_ingestion_metadata(self, schema, metadata: Dict, record_count: int):
        """Save ingestion metadata to a separate tracking table"""
        try:
            # Define metadata table schema
            metadata_schema = pa.schema([
                pa.field('ingestion_id', pa.string()),
                pa.field('source_url', pa.string()),
                pa.field('filename', pa.string()),
                pa.field('file_size_bytes', pa.int64()),
                pa.field('download_timestamp', pa.timestamp('us')),
                pa.field('processing_timestamp', pa.timestamp('us')),
                pa.field('ingestion_timestamp', pa.timestamp('us')),
                pa.field('record_count', pa.int64()),
                pa.field('source_name', pa.string()),
                pa.field('download_duration_seconds', pa.float64()),
                pa.field('processing_duration_seconds', pa.float64()),
                pa.field('grib2_variables', pa.string()),  # JSON string of variables
                pa.field('ingestion_batch_id', pa.string()),
                pa.field('status', pa.string())
            ])
            
            # Create or get metadata table
            metadata_table_name = f"{metadata.get('main_table_name', 'grib2_data')}_ingestion_log"
            metadata_table = schema.table(metadata_table_name, fail_if_missing=False)
            
            if not metadata_table:
                metadata_table = schema.create_table(metadata_table_name, metadata_schema)
                logger.info(f"Created ingestion metadata table: {metadata_table_name}")
            
            # Prepare metadata record
            metadata_record = {
                'ingestion_id': metadata.get('ingestion_id', f"{metadata['filename']}_{int(datetime.utcnow().timestamp())}"),
                'source_url': metadata.get('source_url', ''),
                'filename': metadata.get('filename', ''),
                'file_size_bytes': metadata.get('file_size_bytes', 0),
                'download_timestamp': metadata.get('download_timestamp'),
                'processing_timestamp': metadata.get('processing_timestamp'),
                'ingestion_timestamp': datetime.utcnow(),
                'record_count': record_count,
                'source_name': metadata.get('source_name', ''),
                'download_duration_seconds': metadata.get('download_duration_seconds', 0.0),
                'processing_duration_seconds': metadata.get('processing_duration_seconds', 0.0),
                'grib2_variables': metadata.get('grib2_variables', '[]'),  # JSON string
                'ingestion_batch_id': metadata.get('batch_id', ''),
                'status': 'SUCCESS'
            }
            
            # Convert to PyArrow table and insert
            metadata_df = pd.DataFrame([metadata_record])
            metadata_arrow_table = pa.Table.from_pandas(metadata_df, schema=metadata_schema)
            metadata_table.insert(metadata_arrow_table)
            
            logger.debug(f"Saved metadata for {metadata['filename']} to {metadata_table_name}")
            
        except Exception as e:
            logger.warning(f"Failed to save ingestion metadata: {e}")
            # Don't fail the whole ingestion if metadata save fails
    
    def process_grib2_to_parquet(self, grib2_path: str, source_url: str = None, source_name: str = None) -> Tuple[Optional[str], Optional[Dict]]:
        """Process GRIB2 file to VastDB-compatible Parquet and return metadata"""
        processing_start = datetime.utcnow()
        
        try:
            # Read GRIB2 file with explicit decode_timedelta to silence warning
            ds = xr.open_dataset(grib2_path, engine="cfgrib", 
                               backend_kwargs={'decode_timedelta': False})
            df = ds.to_dataframe().reset_index()
            
            # Extract GRIB2 variables for metadata
            grib2_variables = list(ds.data_vars.keys()) if hasattr(ds, 'data_vars') else []
            
            # Add metadata
            df['source_file'] = Path(grib2_path).name
            df['ingestion_timestamp'] = datetime.utcnow()
            
            # Convert to PyArrow table
            table = pa.Table.from_pandas(df)
            
            # Fix VastDB incompatible types
            table = self._fix_vastdb_compatibility(table)
            
            # Save to parquet
            parquet_path = grib2_path.replace('.grib2', '.parquet')
            pq.write_table(table, parquet_path)
            
            file_size = Path(parquet_path).stat().st_size
            self.stats['bytes_processed'] += file_size
            self.stats['files_processed'] += 1
            
            processing_end = datetime.utcnow()
            processing_duration = (processing_end - processing_start).total_seconds()
            
            # Prepare metadata
            metadata = {
                'filename': Path(grib2_path).name,
                'source_url': source_url or '',
                'source_name': source_name or '',
                'file_size_bytes': Path(grib2_path).stat().st_size,
                'processing_timestamp': processing_start,
                'processing_duration_seconds': processing_duration,
                'grib2_variables': str(grib2_variables),  # Convert to string for storage
                'record_count': len(df)
            }
            
            logger.debug(f"Processed {Path(grib2_path).name} to Parquet ({file_size / 1024 / 1024:.1f} MB)")
            return parquet_path, metadata
            
        except Exception as e:
            logger.error(f"Failed to process {grib2_path}: {e}")
            self.stats['errors'].append(f"Processing failed: {grib2_path} - {e}")
            return None, None
    
    def run_ingestion(self, plan: Dict, bucket_name: str, schema_name: str, table_name: str, 
                     batch_size: int = 10, max_workers: int = 3):
        """Run the complete ingestion process"""
        
        logger.info(f"🚀 Starting ingestion to {bucket_name}.{schema_name}.{table_name}")
        self.stats['start_time'] = datetime.now()
        
        # Connect to VastDB
        if not self.connect_to_vastdb():
            return False
        
        total_processed = 0
        
        try:
            for source_plan in plan['sources']:
                logger.info(f"📂 Processing source: {source_plan['name']}")
                
                # Discover files from this source
                direct_files = source_plan.get('direct_files', None)
                file_urls = self.discover_files_from_source(
                    source_plan['url'], 
                    source_plan['files_to_download'],
                    direct_files
                )
                
                if not file_urls:
                    logger.warning(f"No files found for {source_plan['name']}")
                    continue
                
                # Process in batches to avoid disk space issues
                for i in range(0, len(file_urls), batch_size):
                    batch_urls = file_urls[i:i + batch_size]
                    batch_id = f"{source_plan['name']}_batch_{i//batch_size + 1}_{int(datetime.now().timestamp())}"
                    logger.info(f"🔄 Processing batch {i//batch_size + 1}/{(len(file_urls) + batch_size - 1)//batch_size} ({len(batch_urls)} files)")
                    
                    # Download batch with metadata
                    downloaded_files = []
                    download_metadata = []
                    with ThreadPoolExecutor(max_workers=max_workers) as executor:
                        future_to_url = {executor.submit(self.download_file, url, batch_id): url for url in batch_urls}
                        
                        for future in as_completed(future_to_url):
                            result_path, metadata = future.result()
                            if result_path and metadata:
                                downloaded_files.append(result_path)
                                download_metadata.append(metadata)
                    
                    # Process and ingest batch
                    for j, grib2_path in enumerate(downloaded_files):
                        download_meta = download_metadata[j] if j < len(download_metadata) else {}
                        
                        # Convert to Parquet with metadata
                        parquet_path, processing_meta = self.process_grib2_to_parquet(
                            grib2_path, 
                            download_meta.get('source_url', ''),
                            source_plan['name']
                        )
                        
                        if parquet_path and processing_meta:
                            # Combine download and processing metadata
                            combined_metadata = {**download_meta, **processing_meta}
                            combined_metadata['main_table_name'] = table_name
                            combined_metadata['batch_id'] = batch_id
                            
                            # Ingest to VastDB with metadata
                            if self.ingest_to_vastdb(parquet_path, bucket_name, schema_name, table_name, combined_metadata):
                                total_processed += 1
                            
                            # Clean up files immediately
                            self.cleanup_files([grib2_path, parquet_path])
                        else:
                            # Clean up failed GRIB2 file
                            self.cleanup_files([grib2_path])
                    
                    # Progress update
                    elapsed = datetime.now() - self.stats['start_time']
                    gb_downloaded = self.stats['bytes_downloaded'] / 1024 / 1024 / 1024
                    
                    logger.info(f"📊 Progress: {total_processed} files ingested, {gb_downloaded:.2f} GB downloaded, {elapsed}")
        
        except KeyboardInterrupt:
            logger.info("🛑 Process interrupted by user")
        except Exception as e:
            logger.error(f"❌ Unexpected error: {e}")
        
        # Final statistics
        self._print_final_stats()
        return True
    
    def _print_final_stats(self):
        """Print final ingestion statistics"""
        elapsed = datetime.now() - self.stats['start_time']
        gb_downloaded = self.stats['bytes_downloaded'] / 1024 / 1024 / 1024
        gb_processed = self.stats['bytes_processed'] / 1024 / 1024 / 1024
        
        logger.info("=" * 60)
        logger.info("📈 FINAL INGESTION STATISTICS")
        logger.info("=" * 60)
        logger.info(f"Files discovered: {self.stats['files_discovered']}")
        logger.info(f"Files downloaded: {self.stats['files_downloaded']}")
        logger.info(f"Files processed: {self.stats['files_processed']}")
        logger.info(f"Files ingested: {self.stats['files_ingested']}")
        logger.info(f"Data downloaded: {gb_downloaded:.2f} GB")
        logger.info(f"Data processed: {gb_processed:.2f} GB")
        logger.info(f"Total time: {elapsed}")
        logger.info(f"Download rate: {gb_downloaded / elapsed.total_seconds() * 60:.1f} MB/min")
        logger.info(f"Errors: {len(self.stats['errors'])}")
        
        if self.stats['errors']:
            logger.info("Recent errors:")
            for error in self.stats['errors'][-5:]:  # Show last 5 errors
                logger.info(f"  - {error}")


def get_user_configuration(vastdb_config: Optional[Dict] = None, db_config: Optional[Dict] = None) -> Tuple[Dict, Dict, float]:
    """Prompt for a data volume and resolve connection and target settings.

    The data volume is always asked for on stdin. Connection and target
    settings are taken from the supplied dicts when they are given (truthy),
    otherwise from environment variables.

    Args:
        vastdb_config: Optional dict with keys 'endpoint', 'access_key' and
            'secret_key'. When omitted, VASTDB_ENDPOINT, VASTDB_ACCESS_KEY and
            VASTDB_SECRET_KEY are read from the environment.
        db_config: Optional dict with keys 'bucket_name', 'schema_name' and
            'table_name'. When omitted, VASTDB_BUCKET, VASTDB_SCHEMA and
            VASTDB_TABLE are read from the environment.

    Returns:
        A tuple ``(vastdb_config, db_config, target_size_gb)``. Supplied dicts
        are shallow-copied rather than returned by reference.

    Raises:
        SystemExit: With exit code 1 when a required config key or
            environment variable is missing.
    """
    print("=" * 60)
    print("🌦️  GRIB2 BULK DATA INGESTER FOR VASTDB")
    print("=" * 60)

    # Data size selection
    print("\n📊 How much data would you like to ingest?")
    size_options = {
        '1': 1.0,      # 1 GB
        '2': 10.0,     # 10 GB
        '3': 100.0,    # 100 GB
        '4': 1000.0,   # 1 TB
        '5': 10000.0,  # 10 TB
    }

    print("1. 1 GB   (Quick test)")
    print("2. 10 GB  (Small dataset)")
    print("3. 100 GB (Medium dataset)")
    print("4. 1 TB   (Large dataset)")
    print("5. 10 TB  (Very large dataset)")
    print("6. Custom amount")

    choice = input("\nSelect option (1-6): ").strip()

    if choice in size_options:
        target_size_gb = size_options[choice]
    elif choice == '6':
        raw_size = input("Enter size in GB: ").strip()
        try:
            target_size_gb = float(raw_size)
        except ValueError:
            target_size_gb = None
        # Reject non-numeric, non-positive, NaN and infinite sizes and fall
        # back to the same default used for an invalid menu choice.
        if target_size_gb is None or not (0 < target_size_gb < float('inf')):
            print("Invalid size, defaulting to 1 GB")
            target_size_gb = 1.0
    else:
        print("Invalid choice, defaulting to 1 GB")
        target_size_gb = 1.0

    # VastDB configuration from passed config or environment variables
    print("\n🗄️  Loading VastDB Configuration")

    if vastdb_config:
        print("✓ Using provided VastDB configuration")
        final_vastdb_config = vastdb_config.copy()

        # Validate required keys, mirroring the database-config check below,
        # so a bad dict fails here instead of with a bare KeyError later.
        required_vastdb_keys = ['endpoint', 'access_key', 'secret_key']
        missing_vastdb_keys = [key for key in required_vastdb_keys if key not in final_vastdb_config]

        if missing_vastdb_keys:
            print(f"\n❌ Missing required VastDB config keys: {missing_vastdb_keys}")
            sys.exit(1)
    else:
        print("Loading from environment variables...")

        # Required environment variables for VastDB connection
        vastdb_env_vars = {
            'VASTDB_ENDPOINT': 'endpoint',
            'VASTDB_ACCESS_KEY': 'access_key',
            'VASTDB_SECRET_KEY': 'secret_key',
        }

        final_vastdb_config = {}
        missing_vastdb_vars = []
        for var_name, key in vastdb_env_vars.items():
            value = os.getenv(var_name)
            if value:
                final_vastdb_config[key] = value
            else:
                # Unset and empty variables are both treated as missing.
                missing_vastdb_vars.append(var_name)

        if missing_vastdb_vars:
            print("\n❌ Missing required VastDB environment variables:")
            for var in missing_vastdb_vars:
                print(f"   - {var}")
            print("\nPlease set these environment variables or pass vastdb_config to main().")
            sys.exit(1)

    # Database configuration from passed config or environment variables
    print("\n🗃️  Loading Database Configuration")

    if db_config:
        print("✓ Using provided database configuration")
        final_db_config = db_config.copy()

        # Validate required keys
        required_db_keys = ['bucket_name', 'schema_name', 'table_name']
        missing_db_keys = [key for key in required_db_keys if key not in final_db_config]

        if missing_db_keys:
            print(f"\n❌ Missing required database config keys: {missing_db_keys}")
            sys.exit(1)
    else:
        print("Loading from environment variables...")
        db_env_vars = {
            'VASTDB_BUCKET': 'bucket_name',
            'VASTDB_SCHEMA': 'schema_name',
            'VASTDB_TABLE': 'table_name',
        }

        final_db_config = {}
        missing_db_vars = []
        for var_name, key in db_env_vars.items():
            value = os.getenv(var_name)
            if value:
                final_db_config[key] = value
            else:
                missing_db_vars.append(var_name)

        if missing_db_vars:
            print("\n❌ Missing required database environment variables:")
            for var in missing_db_vars:
                print(f"   - {var}")
            print("\nExample:")
            print("export VASTDB_BUCKET='your-bucket-name'")
            print("export VASTDB_SCHEMA='your-schema-name'")
            print("export VASTDB_TABLE='your-table-name'")
            sys.exit(1)

    print(f"✓ Endpoint: {final_vastdb_config['endpoint']}")
    print(f"✓ Target: {final_db_config['bucket_name']}.{final_db_config['schema_name']}.{final_db_config['table_name']}")
    print(f"✓ Data Volume: {target_size_gb} GB")

    return final_vastdb_config, final_db_config, target_size_gb


def test_basic_functionality(vastdb_config: Optional[Dict] = None, db_config: Optional[Dict] = None):
    """Run a single-file smoke test: download one GRIB2 file, convert, ingest.

    Steps, each reported on stdout: check the test URL with a HEAD request,
    resolve the VastDB and database settings, connect to VastDB, download the
    file into a fresh temporary directory, convert it to Parquet and ingest
    it into the configured table.

    Args:
        vastdb_config: Optional dict with keys 'endpoint', 'access_key' and
            'secret_key'. When omitted, VASTDB_ENDPOINT, VASTDB_ACCESS_KEY and
            VASTDB_SECRET_KEY are read from the environment.
        db_config: Optional dict with keys 'bucket_name', 'schema_name' and
            'table_name'. When omitted, VASTDB_BUCKET, VASTDB_SCHEMA and
            VASTDB_TABLE are read from the environment.

    Returns:
        None. Failures are printed rather than raised; the temporary download
        directory and the HTTP session are released on every exit path.
    """
    # Imported before the try block because the cleanup in ``finally`` needs
    # shutil even when an early step fails.
    import shutil
    import tempfile

    print("\n🧪 TESTING MODE - Manual GRIB2 Download")
    print("="*60)

    # Test with a known working GRIB2 file
    # NOTE(review): the URL embeds a fixed model run (2024121100); confirm the
    # server still publishes it, otherwise the HEAD check below reports a failure.
    test_url = "https://dd.weather.gc.ca/model_gem_global/15km/grib2/lat_lon/00/000/CMC_glb_ABSV_ISBL_200_latlon.15x.15_2024121100_P000.grib2"

    session = None
    temp_dir = None
    try:
        import requests

        session = requests.Session()
        session.headers.update({'User-Agent': 'GRIB2-Test/1.0'})

        print(f"Testing download from: {test_url}")

        # Cheap reachability probe before touching VastDB or the disk.
        response = session.head(test_url, timeout=10)
        if response.status_code != 200:
            print(f"❌ Test file not accessible: HTTP {response.status_code}")
            return
        print("✓ Test file is accessible")

        # Get VastDB config from parameter or environment
        if vastdb_config:
            print("✓ Using provided VastDB configuration")
            final_vastdb_config = vastdb_config.copy()

            # Validate required keys, mirroring the database-config check.
            required_vastdb_keys = ['endpoint', 'access_key', 'secret_key']
            missing_vastdb_keys = [key for key in required_vastdb_keys if key not in final_vastdb_config]

            if missing_vastdb_keys:
                print(f"❌ Missing required VastDB config keys: {missing_vastdb_keys}")
                return
        else:
            print("Loading VastDB configuration from environment variables...")
            vastdb_env_vars = {
                'VASTDB_ENDPOINT': 'endpoint',
                'VASTDB_ACCESS_KEY': 'access_key',
                'VASTDB_SECRET_KEY': 'secret_key',
            }

            final_vastdb_config = {}
            missing_vars = []
            for var_name, key in vastdb_env_vars.items():
                value = os.getenv(var_name)
                if value:
                    final_vastdb_config[key] = value
                else:
                    # Unset and empty variables are both treated as missing.
                    missing_vars.append(var_name)

            if missing_vars:
                print(f"❌ Missing environment variables: {missing_vars}")
                return

        # Get database config from parameter or environment
        if db_config:
            print("✓ Using provided database configuration")
            final_db_config = db_config.copy()

            # Validate required keys
            required_db_keys = ['bucket_name', 'schema_name', 'table_name']
            missing_db_keys = [key for key in required_db_keys if key not in final_db_config]

            if missing_db_keys:
                print(f"❌ Missing required database config keys: {missing_db_keys}")
                return
        else:
            print("Loading database configuration from environment variables...")
            db_env_vars = {
                'VASTDB_BUCKET': 'bucket_name',
                'VASTDB_SCHEMA': 'schema_name',
                'VASTDB_TABLE': 'table_name',
            }

            final_db_config = {}
            missing_db_vars = []
            for var_name, key in db_env_vars.items():
                value = os.getenv(var_name)
                if value:
                    final_db_config[key] = value
                else:
                    missing_db_vars.append(var_name)

            if missing_db_vars:
                print(f"❌ Missing database environment variables: {missing_db_vars}")
                return

        # Create test ingester
        ingester = GRIB2DataIngester(final_vastdb_config)

        if not ingester.connect_to_vastdb():
            print("❌ VastDB connection failed")
            return
        print("\n✓ VastDB connection successful")

        # Download test file
        temp_dir = Path(tempfile.mkdtemp())
        test_file = temp_dir / "test.grib2"

        print("Downloading test file...")
        # A timeout keeps a stalled server from hanging the test, and
        # raise_for_status() stops an HTTP error page from being saved and
        # then parsed as if it were GRIB2 data.
        with session.get(test_url, stream=True, timeout=30) as download:
            download.raise_for_status()
            with open(test_file, 'wb') as f:
                for chunk in download.iter_content(chunk_size=8192):
                    f.write(chunk)

        print(f"✓ Downloaded {test_file.stat().st_size / 1024 / 1024:.1f} MB")

        # Process to parquet
        parquet_file = ingester.process_grib2_to_parquet(str(test_file))
        if not parquet_file:
            # A falsy result is treated as a failed conversion; report it
            # instead of ending the test silently.
            print("❌ Conversion to Parquet failed")
            return
        print("✓ Converted to Parquet")

        # Ingest to VastDB
        success = ingester.ingest_to_vastdb(
            parquet_file,
            final_db_config['bucket_name'],
            final_db_config['schema_name'],
            final_db_config['table_name']
        )

        if success:
            print("✅ Test ingestion successful!")
            print(f"Data is now in {final_db_config['bucket_name']}.{final_db_config['schema_name']}.{final_db_config['table_name']}")
        else:
            print("❌ Test ingestion failed")

    except Exception as e:
        print(f"❌ Test failed: {e}")
    finally:
        # Cleanup runs on success, early return and error alike so the
        # downloaded file never outlives the test.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        if session is not None:
            session.close()


def main(vastdb_config: Dict = None, db_config: Dict = None, target_size_gb: float = None):
    """
    Entry point for the GRIB2 bulk ingester.

    When both config dicts are supplied the mode menu is skipped and a full
    bulk ingestion is run; otherwise the user chooses between bulk ingestion
    and the single-file test mode. If ``target_size_gb`` is also supplied the
    run is fully non-interactive; if it is missing, the remaining settings
    are collected through ``get_user_configuration`` and the user is asked to
    confirm before ingestion starts.

    Args:
        vastdb_config: Dict with keys 'endpoint', 'access_key', 'secret_key'
        db_config: Dict with keys 'bucket_name', 'schema_name', 'table_name'
        target_size_gb: Target data size in GB (if None, will prompt user)
    """
    rule = "=" * 60
    print(rule)
    print("🌦️  GRIB2 BULK DATA INGESTER FOR VASTDB")
    print(rule)

    # Both configs present: no menu, go straight to full bulk ingestion.
    if vastdb_config and db_config:
        print("✓ Configuration provided - running in programmatic mode")
        mode = '1'
    else:
        for menu_line in (
            "Choose mode:",
            "1. Full bulk ingestion (discover and download multiple files)",
            "2. Test mode (single file test)",
        ):
            print(menu_line)
        mode = input("Select mode (1-2): ").strip()

    if mode == '2':
        test_basic_functionality(vastdb_config, db_config)
        return

    try:
        # True only when nothing at all has to be asked of the user.
        fully_configured = bool(vastdb_config and db_config and target_size_gb is not None)

        if not fully_configured:
            final_vastdb_config, final_db_config, final_target_size_gb = get_user_configuration(vastdb_config, db_config)
            destination = f"{final_db_config['bucket_name']}.{final_db_config['schema_name']}.{final_db_config['table_name']}"
        else:
            final_vastdb_config = vastdb_config.copy()
            final_db_config = db_config.copy()
            final_target_size_gb = target_size_gb

            print("✓ Using provided configuration:")
            print(f"  - Endpoint: {final_vastdb_config['endpoint']}")
            destination = f"{final_db_config['bucket_name']}.{final_db_config['schema_name']}.{final_db_config['table_name']}"
            print(f"  - Target: {destination}")
            print(f"  - Data Volume: {final_target_size_gb} GB")

        ingester = GRIB2DataIngester(final_vastdb_config)

        # Step 1: find out which upstream sources are reachable.
        print("\n🔍 Discovering GRIB2 data sources...")
        available_sources = ingester.discover_grib2_sources()

        if not available_sources:
            print("❌ No available GRIB2 sources found!")
            print("💡 Try test mode (option 2) to test with a single known file")
            return

        print(f"\n✅ Found {len(available_sources)} available sources")

        # Step 2: turn the sources into a plan sized for the requested volume.
        plan = ingester.build_download_plan(available_sources, final_target_size_gb)

        if plan['total_estimated_files'] == 0:
            print("❌ Could not build a download plan!")
            return

        print("\n📋 Download Plan:")
        print(f"Target: {final_target_size_gb} GB")
        print(f"Estimated: {plan['total_estimated_files']} files, {plan['total_estimated_size_gb']:.1f} GB")

        for entry in plan['sources']:
            print(f"  - {entry['name']}: {entry['files_to_download']} files ({entry['estimated_size_gb']:.1f} GB)")

        # Step 3: ask for confirmation unless the run is fully programmatic.
        if fully_configured:
            print(f"\n🚀 Starting automatic ingestion to {destination}")
        else:
            answer = input(f"\n🚀 Start ingestion to {destination}? (y/n): ")
            if answer.lower() != 'y':
                print("Cancelled by user")
                return

        # Step 4: run the ingestion itself.
        succeeded = ingester.run_ingestion(
            plan,
            final_db_config['bucket_name'],
            final_db_config['schema_name'],
            final_db_config['table_name'],
            batch_size=5,      # Process 5 files at a time
            max_workers=2      # Conservative parallelism
        )

        if not succeeded:
            print("\n❌ Ingestion failed!")
        else:
            print("\n✅ Ingestion completed!")
            print(f"Data is now available in {destination}")

    except KeyboardInterrupt:
        print("\n🛑 Process interrupted by user")
    except Exception as e:
        print(f"\n❌ Unexpected error: {e}")
    finally:
        print("\n👋 Goodbye!")


# Example usage functions for easy integration
def run_with_config(endpoint: str, access_key: str, secret_key: str, 
                   bucket_name: str, schema_name: str, table_name: str,
                   target_size_gb: float = 1.0):
    """
    Run a bulk ingestion from plain keyword arguments.

    Packs the connection settings and the target table location into the two
    config dicts expected by ``main`` and calls it with the requested volume.

    Example:
        run_with_config(
            endpoint='https://my-vastdb.com',
            access_key='your_access_key',
            secret_key='your_secret_key',
            bucket_name='weather_data',
            schema_name='grib2',
            table_name='meteorological_data',
            target_size_gb=10.0
        )
    """
    connection_settings = dict(
        endpoint=endpoint,
        access_key=access_key,
        secret_key=secret_key,
    )
    table_location = dict(
        bucket_name=bucket_name,
        schema_name=schema_name,
        table_name=table_name,
    )

    main(
        vastdb_config=connection_settings,
        db_config=table_location,
        target_size_gb=target_size_gb,
    )


def run_test_mode(endpoint: str, access_key: str, secret_key: str,
                 bucket_name: str, schema_name: str, table_name: str):
    """
    Run the single-file test from plain keyword arguments.

    Packs the connection settings and the target table location into the two
    config dicts expected by ``test_basic_functionality`` and calls it.

    Example:
        run_test_mode(
            endpoint='https://my-vastdb.com',
            access_key='your_access_key', 
            secret_key='your_secret_key',
            bucket_name='weather_data',
            schema_name='grib2',
            table_name='test_data'
        )
    """
    connection_settings = dict(
        endpoint=endpoint,
        access_key=access_key,
        secret_key=secret_key,
    )
    table_location = dict(
        bucket_name=bucket_name,
        schema_name=schema_name,
        table_name=table_name,
    )

    test_basic_functionality(
        vastdb_config=connection_settings,
        db_config=table_location,
    )


# if __name__ == "__main__":
#     main()

In [27]:
# Small end-to-end run: ingest roughly 0.01 GB into csnowdb.grib2.grib2_data.
# endpoint, access_key and secret_key are the variables assigned in the
# connection-settings cell near the top of the notebook.
# NOTE(review): that cell hard-codes the credentials; consider supplying them
# through the VASTDB_* environment variables instead.
run_with_config(
    endpoint=endpoint,
    access_key=access_key,
    secret_key=secret_key,
    bucket_name='csnowdb',
    schema_name='grib2',
    table_name='grib2_data',
    target_size_gb=0.01
)



2025-06-04 10:28:25,233 - INFO - 🔍 Discovering available GRIB2 data sources...
2025-06-04 10:28:25,233 - INFO - Checking GEM Global 66km...


🌦️  GRIB2 BULK DATA INGESTER FOR VASTDB
✓ Configuration provided - running in programmatic mode
✓ Using provided configuration:
  - Endpoint: http://172.200.202.2:80
  - Target: csnowdb.grib2.grib2_data
  - Data Volume: 0.01 GB

🔍 Discovering GRIB2 data sources...


2025-06-04 10:28:25,479 - INFO - ✗ GEM Global 66km: HTTP 404
2025-06-04 10:28:25,480 - INFO - Checking GEM Global 15km...
2025-06-04 10:28:27,270 - INFO - ✓ GEM Global 15km: ~100 files (~0.2 GB)
2025-06-04 10:28:27,271 - INFO - Checking HRDPS Continental 2.5km...
2025-06-04 10:28:29,810 - INFO - ✓ HRDPS Continental 2.5km: ~200 files (~1.6 GB)
2025-06-04 10:28:29,811 - INFO - Checking Sample GRIB2 Files...
2025-06-04 10:28:30,012 - INFO - ✗ Sample GRIB2 Files: HTTP 404
2025-06-04 10:28:30,013 - INFO - 📋 Building download plan for 0.01 GB of data...
2025-06-04 10:28:30,013 - INFO - 📊 Plan: 4 files, 0.0 GB from 1 sources
2025-06-04 10:28:30,014 - INFO - 🚀 Starting ingestion to csnowdb.grib2.grib2_data
2025-06-04 10:28:30,020 - INFO - VAST version: (5, 2, 0, 131)
2025-06-04 10:28:30,021 - INFO - ✓ Connected to VastDB
2025-06-04 10:28:30,021 - INFO - 📂 Processing source: GEM Global 15km
2025-06-04 10:28:30,021 - INFO - 🔍 Discovering files from https://dd.weather.gc.ca/model_gem_global/15km/


✅ Found 2 available sources

📋 Download Plan:
Target: 0.01 GB
Estimated: 4 files, 0.0 GB
  - GEM Global 15km: 4 files (0.0 GB)

🚀 Starting automatic ingestion to csnowdb.grib2.grib2_data


2025-06-04 10:28:32,695 - INFO - 📁 Discovered 4 file URLs
2025-06-04 10:28:32,696 - INFO - 🔄 Processing batch 1/1 (4 files)
2025-06-04 10:28:35,887 - INFO - Created table: grib2_data
2025-06-04 10:28:39,309 - INFO - Created table: grib2_data_ingestion_log
2025-06-04 10:28:39,321 - INFO - Created ingestion metadata table: grib2_data_ingestion_log
2025-06-04 10:28:55,245 - INFO - 📊 Progress: 4 files ingested, 0.01 GB downloaded, 0:00:25.231317
2025-06-04 10:28:55,246 - INFO - 📈 FINAL INGESTION STATISTICS
2025-06-04 10:28:55,247 - INFO - Files discovered: 4
2025-06-04 10:28:55,247 - INFO - Files downloaded: 4
2025-06-04 10:28:55,247 - INFO - Files processed: 4
2025-06-04 10:28:55,248 - INFO - Files ingested: 4
2025-06-04 10:28:55,248 - INFO - Data downloaded: 0.01 GB
2025-06-04 10:28:55,249 - INFO - Data processed: 0.02 GB
2025-06-04 10:28:55,249 - INFO - Total time: 0:00:25.232018
2025-06-04 10:28:55,249 - INFO - Download rate: 0.0 MB/min
2025-06-04 10:28:55,250 - INFO - Errors: 0



✅ Ingestion completed!
Data is now available in csnowdb.grib2.grib2_data

👋 Goodbye!
