In [1]:
#!/usr/bin/env python3
"""
GCP Bucket Crawler and Catalog Generator
Crawls a GCP storage bucket to discover vector and raster data,
then generates collections, individual STAC items, and a comprehensive catalog.
"""

import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin
import os
import shutil
import shlex
import subprocess
from google.cloud import storage

In [2]:
class GCPBucketCrawler:
    """
    Crawl a Google Cloud Storage bucket and record the vector and raster objects found.

    Vectors are ``.geojson`` objects whose name contains ``vector/``; rasters are
    ``.tif`` / ``.tiff`` / ``.gtiff`` objects whose name contains ``raster/``.
    Every discovered object is stored as a plain dict in ``self.vectors`` or
    ``self.rasters``. When no GCS client can be created, or listing fails,
    the crawler falls back to a small hard-coded sample data set.
    """

    # Lower-cased file suffixes that are treated as raster data.
    _RASTER_SUFFIXES = ('.tiff', '.tif', '.gtiff')

    def __init__(self, bucket_name: str, prefix: str = "", project_id: Optional[str] = None):
        """
        Initialize the crawler with GCP bucket details.

        Args:
            bucket_name: Name of the GCP storage bucket (e.g., 'swhm_data')
            prefix: Prefix to filter objects (e.g., 'public/layers/')
            project_id: GCP project ID (optional, will use default if not provided)
        """
        self.bucket_name = bucket_name
        self.prefix = prefix
        self.project_id = project_id
        self.vectors = []
        self.rasters = []
        self.processed_items = set()  # Track processed items to prevent duplicates

        # Initialize the GCS client. Failure is not fatal: crawl_bucket()
        # checks for a missing client and switches to sample data.
        try:
            if project_id:
                self.client = storage.Client(project=project_id)
            else:
                self.client = storage.Client()
            self.bucket = self.client.bucket(bucket_name)
            print(f"Successfully connected to bucket: {bucket_name}")
        except Exception as e:
            print(f"Error initializing GCS client: {e}")
            print("Make sure you have proper authentication set up:")
            print("1. Set GOOGLE_APPLICATION_CREDENTIALS environment variable")
            print("2. Or run 'gcloud auth application-default login'")
            self.client = None
            self.bucket = None

    def crawl_bucket(self) -> Dict:
        """
        Crawl the GCP bucket to discover all vectors and rasters.

        Returns:
            Dict with keys 'vectors', 'rasters' (lists of item dicts) and
            'total_items'. If no client is available or listing raises, the
            result of ``_create_sample_data()`` is returned instead.
        """
        if not self.client or not self.bucket:
            print("No valid GCS client available, creating sample data...")
            return self._create_sample_data()

        print(f"Crawling bucket '{self.bucket_name}' with prefix '{self.prefix}'...")

        try:
            # List all blobs in the bucket with the specified prefix
            blob_count = 0
            for blob in self.bucket.list_blobs(prefix=self.prefix):
                blob_count += 1
                self._process_blob(blob)

            print(f"Processed {blob_count} objects from bucket")
            print(f"Found {len(self.vectors)} unique vectors and {len(self.rasters)} unique rasters")

        except Exception as e:
            print(f"Error crawling bucket: {e}")
            return self._create_sample_data()

        return {
            'vectors': self.vectors,
            'rasters': self.rasters,
            'total_items': len(self.vectors) + len(self.rasters)
        }

    def _process_blob(self, blob):
        """Classify a single blob as vector or raster and register it; other blobs are ignored."""
        blob_name = blob.name

        # Skip directories (blobs ending with '/')
        if blob_name.endswith('/'):
            return

        suffix = Path(blob_name).suffix.lower()

        # Check for vector files - ONLY .geojson files, NOT .json files
        if 'vector/' in blob_name and suffix == '.geojson':
            self._add_vector_item(blob)

        # Check for raster files - TIFF files including .gtiff
        elif 'raster/' in blob_name and suffix in self._RASTER_SUFFIXES:
            self._add_raster_item(blob)

    @staticmethod
    def _now_iso() -> str:
        """Return the current local time as an ISO 8601 string that carries a UTC offset."""
        # astimezone() on a naive datetime attaches the system's local offset,
        # so the resulting string is unambiguous (unlike a bare datetime.now()).
        return datetime.now().astimezone().isoformat()

    def _stac_dir_for(self, blob_name: str) -> str:
        """
        Return the directory of ``blob_name`` relative to ``self.prefix``.

        e.g. with prefix 'public/layers/':
        'public/layers/vector/PugetSoundWA/PugetSoundWA.geojson' -> 'vector/PugetSoundWA'

        Only a leading prefix is removed (never a later occurrence of the same
        text), the result always uses '/' separators and has no leading slash,
        and an object sitting directly in the prefix directory yields ''.
        If the directory does not start with the prefix it is returned unchanged.
        """
        parent = blob_name.rsplit('/', 1)[0] if '/' in blob_name else ''
        parent = parent.strip('/')
        prefix = self.prefix.strip('/')

        if prefix:
            if parent == prefix:
                return ''
            if parent.startswith(prefix + '/'):
                return parent[len(prefix) + 1:]
        return parent

    def _register_item(self, blob, item_type: str, data_format: str, target: List[Dict]) -> None:
        """
        Build the item dict for ``blob`` and append it to ``target``.

        Items are de-duplicated on ``"<item_type>:<file stem>"``; a blob whose
        key was already seen is reported and skipped.
        """
        item_name = Path(blob.name).stem

        # Create unique identifier to prevent duplicates
        item_key = f"{item_type}:{item_name}"
        if item_key in self.processed_items:
            print(f"Skipping duplicate {item_type}: {item_name}")
            return

        target.append({
            'name': item_name,
            'filename': blob.name,
            'url': f"https://storage.googleapis.com/{self.bucket_name}/{blob.name}",
            'type': item_type,
            'format': data_format,
            'size_bytes': blob.size,
            'content_type': blob.content_type,
            'created': blob.time_created.isoformat() if blob.time_created else None,
            'updated': blob.updated.isoformat() if blob.updated else None,
            'discovered_at': self._now_iso(),
            'etag': blob.etag,
            'md5_hash': blob.md5_hash,
            'stac_dir': self._stac_dir_for(blob.name)  # Directory where STAC item should be saved
        })
        self.processed_items.add(item_key)
        print(f"Found {item_type}: {item_name}")

    def _add_vector_item(self, blob):
        """Add a vector item to the collection."""
        self._register_item(blob, 'vector', 'GeoJSON', self.vectors)

    def _add_raster_item(self, blob):
        """Add a raster item to the collection."""
        self._register_item(blob, 'raster', 'GeoTIFF', self.rasters)

    def get_blob_info(self, blob_name: str) -> Optional[Dict]:
        """
        Get detailed information about a specific blob.

        Returns:
            Dict with name, size, content type, timestamps, etag, md5 hash and
            public URL, or None when there is no bucket, the blob does not
            exist, or the lookup raises.
        """
        if not self.bucket:
            return None

        try:
            # get_blob() fetches the object's metadata and returns None when the
            # object is missing. bucket.blob() followed by exists() would leave
            # size/content_type/timestamps unpopulated.
            blob = self.bucket.get_blob(blob_name)
            if blob is not None:
                return {
                    'name': blob.name,
                    'size': blob.size,
                    'content_type': blob.content_type,
                    'created': blob.time_created.isoformat() if blob.time_created else None,
                    'updated': blob.updated.isoformat() if blob.updated else None,
                    'etag': blob.etag,
                    'md5_hash': blob.md5_hash,
                    'public_url': f"https://storage.googleapis.com/{self.bucket_name}/{blob.name}"
                }
        except Exception as e:
            print(f"Error getting blob info for {blob_name}: {e}")

        return None

    def _create_sample_data(self):
        """
        Create sample data structure when bucket can't be crawled directly.

        Replaces ``self.vectors`` and ``self.rasters`` with one placeholder
        entry each and returns the same dict shape as ``crawl_bucket()``.
        """
        print("Creating sample data structure...")

        base_url = f"https://storage.googleapis.com/{self.bucket_name}"
        discovered_at = self._now_iso()

        # Sample vectors based on your structure
        sample_vectors = [
            {
                'name': 'vector1',
                'filename': f'{self.prefix}vector/vector1/vector1.geojson',
                'url': f"{base_url}/{self.prefix}vector/vector1/vector1.geojson",
                'type': 'vector',
                'format': 'GeoJSON',
                'size_bytes': None,
                'content_type': 'application/geo+json',
                'created': None,
                'updated': None,
                'discovered_at': discovered_at,
                'etag': None,
                'md5_hash': None,
                'stac_dir': 'vector/vector1'
            }
        ]

        # Sample rasters based on your structure
        sample_rasters = [
            {
                'name': 'raster1',
                'filename': f'{self.prefix}raster/raster1/raster1.tiff',
                'url': f"{base_url}/{self.prefix}raster/raster1/raster1.tiff",
                'type': 'raster',
                'format': 'GeoTIFF',
                'size_bytes': None,
                'content_type': 'image/tiff',
                'created': None,
                'updated': None,
                'discovered_at': discovered_at,
                'etag': None,
                'md5_hash': None,
                'stac_dir': 'raster/raster1'
            }
        ]

        self.vectors = sample_vectors
        self.rasters = sample_rasters

        return {
            'vectors': self.vectors,
            'rasters': self.rasters,
            'total_items': len(self.vectors) + len(self.rasters)
        }

class CatalogGenerator:
    """
    Build STAC items, the vector/raster collections and the master catalog
    from the dict produced by the crawler.

    The hrefs generated here assume this on-disk layout:
    ``catalog.json`` at the root, ``vector/collection.json`` and
    ``raster/collection.json`` one level down, and every item stored at
    ``<stac_dir>/<item id>.json``.
    """

    def __init__(self, crawler_data: Dict):
        """Initialize with data from the crawler."""
        self.data = crawler_data
        self.stac_items = []

    @staticmethod
    def _providers() -> List[Dict]:
        """Return a fresh copy of the provider block shared by all documents."""
        return [
            {
                "name": "SWHM Data",
                "roles": ["producer", "processor", "host"],
                "url": "https://storage.googleapis.com/swhm_data/"
            }
        ]

    @staticmethod
    def _dir_parts(stac_dir) -> List[str]:
        """Split a stac_dir into its non-empty path components ('/' or '\\' separated)."""
        normalized = str(stac_dir or '').replace('\\', '/')
        return [part for part in normalized.split('/') if part and part != '.']

    def _item_parent_hrefs(self, item_data: Dict, item_type: str):
        """
        Return ``(collection_href, root_href)`` for an item, relative to the item file.

        For the usual two-level stac_dir (e.g. 'vector/PugetSoundWA') this gives
        '../collection.json' and '../../catalog.json'. Those same values are
        used when the item carries no 'stac_dir' at all.
        """
        if 'stac_dir' not in item_data:
            return "../collection.json", "../../catalog.json"

        parts = self._dir_parts(item_data['stac_dir'])
        depth = len(parts)
        root_href = ("../" * depth or "./") + "catalog.json"

        if parts and parts[0] == item_type:
            # Item lives below its collection directory: climb to that directory.
            collection_href = ("../" * (depth - 1) or "./") + "collection.json"
        else:
            # Item lives elsewhere: climb to the root, then descend.
            collection_href = ("../" * depth or "./") + f"{item_type}/collection.json"
        return collection_href, root_href

    def _collection_item_href(self, stac_dir, item_id: str, collection_dir: str) -> str:
        """
        Return the href of an item relative to ``<collection_dir>/collection.json``.

        e.g. stac_dir 'vector/PugetSoundWA' in collection_dir 'vector'
        -> './PugetSoundWA/PugetSoundWA.json'
        """
        parts = self._dir_parts(stac_dir)
        filename = f"{item_id}.json"
        if parts[:1] == [collection_dir]:
            return "./" + "/".join(parts[1:] + [filename])
        # Item is not below the collection directory; go through the catalog root.
        return "../" + "/".join(parts + [filename])

    def generate_stac_item(self, item_data: Dict, item_type: str) -> Dict:
        """
        Generate a STAC item for vector or raster data.

        Args:
            item_data: Item dict from the crawler; 'name', 'url' and
                'discovered_at' are required.
            item_type: 'vector' or 'raster'; any other value yields an item
                without assets.

        Returns:
            The STAC item as a dict. Geometry and bbox are left as None.
        """
        item_id = item_data['name']
        discovered_at = item_data['discovered_at']
        collection_href, root_href = self._item_parent_hrefs(item_data, item_type)

        # Base STAC item structure
        stac_item = {
            "type": "Feature",
            "stac_version": "1.0.0",
            "id": item_id,
            "properties": {
                "title": item_id.replace('_', ' ').title(),
                "description": f"{item_type.title()} dataset: {item_id}",
                "datetime": discovered_at,
                "created": item_data.get('created') or discovered_at,
                "updated": item_data.get('updated') or discovered_at,
                "providers": self._providers()
            },
            "geometry": None,  # Would need to extract from actual data
            "bbox": None,  # Would need to calculate from geometry/bounds
            "assets": {},
            "links": [
                {
                    "rel": "self",
                    "href": f"./{item_id}.json",
                    "type": "application/json"
                },
                {
                    "rel": "parent",
                    "href": collection_href,
                    "type": "application/json"
                },
                {
                    "rel": "collection",
                    "href": collection_href,
                    "type": "application/json"
                },
                {
                    "rel": "root",
                    "href": root_href,
                    "type": "application/json"
                }
            ]
        }

        # Add assets based on type
        if item_type == 'vector':
            stac_item["assets"]["data"] = {
                "href": item_data['url'],
                "type": "application/geo+json",
                "title": "GeoJSON data",
                "description": "Vector data in GeoJSON format",
                "roles": ["data"],
                "file:size": item_data.get('size_bytes'),
                "file:checksum": item_data.get('md5_hash')
            }
        elif item_type == 'raster':
            stac_item["assets"]["data"] = {
                "href": item_data['url'],
                "type": "image/tiff; application=geotiff",
                "title": "GeoTIFF data",
                "description": "Raster data in GeoTIFF format",
                "roles": ["data"],
                "file:size": item_data.get('size_bytes'),
                "file:checksum": item_data.get('md5_hash')
            }

            # The COG asset is added for every raster and points at the same
            # file; nothing here checks that the file really is cloud-optimized.
            stac_item["assets"]["cog"] = {
                "href": item_data['url'],
                "type": "image/tiff; application=geotiff; profile=cloud-optimized",
                "title": "Cloud Optimized GeoTIFF",
                "description": "Cloud Optimized GeoTIFF for web access",
                "roles": ["data", "overview"]
            }

        # No metadata or thumbnail assets are added (no placeholders), so
        # STAC Browser is not pointed at non-existent resources.

        return stac_item

    def _build_collection(self, collection_id: str, title: str, description: str,
                          keywords: List[str], item_assets: Dict,
                          items: List[Dict], collection_dir: str) -> Dict:
        """Assemble a STAC collection dict and append one 'item' link per entry in ``items``."""
        collection = {
            "type": "Collection",
            "stac_version": "1.0.0",
            "id": collection_id,
            "title": title,
            "description": description,
            "keywords": keywords,
            "license": "proprietary",
            "extent": {
                "spatial": {
                    "bbox": [[-180, -90, 180, 90]]  # Global bbox - update with actual bounds
                },
                "temporal": {
                    "interval": [[None, None]]
                }
            },
            "providers": self._providers(),
            "links": [
                {
                    "rel": "self",
                    "href": "./collection.json",
                    "type": "application/json"
                },
                {
                    "rel": "parent",
                    "href": "../catalog.json",
                    "type": "application/json"
                },
                {
                    "rel": "root",
                    "href": "../catalog.json",
                    "type": "application/json"
                }
            ],
            "item_assets": item_assets
        }

        # Add links to individual items - crawler already handles deduplication.
        # The href is relative to this collection file, which sits in
        # <collection_dir>/, so items below that directory are reached with
        # './<sub dir>/<id>.json' (a leading '../' would leave the directory).
        for entry in items:
            item_id = entry['name']
            collection["links"].append({
                "rel": "item",
                "href": self._collection_item_href(entry['stac_dir'], item_id, collection_dir),
                "type": "application/json",
                "title": item_id.replace('_', ' ').title()
            })

        return collection

    def generate_vector_collection(self) -> Dict:
        """Generate a vector collection with links to the individual STAC items."""
        return self._build_collection(
            collection_id="swhm-vector",
            title="SWHM Vector Collection",
            description="Collection of vector datasets from SWHM data bucket",
            keywords=["vector", "geojson", "swhm"],
            item_assets={
                "data": {
                    "type": "application/geo+json",
                    "title": "GeoJSON data",
                    "roles": ["data"]
                }
            },
            items=self.data['vectors'],
            collection_dir="vector",
        )

    def generate_raster_collection(self) -> Dict:
        """Generate a raster collection with links to the individual STAC items."""
        return self._build_collection(
            collection_id="swhm-raster",
            title="SWHM Raster Collection",
            description="Collection of raster datasets from SWHM data bucket",
            keywords=["raster", "geotiff", "swhm"],
            item_assets={
                "data": {
                    "type": "image/tiff; application=geotiff",
                    "title": "GeoTIFF data",
                    "roles": ["data"]
                },
                "cog": {
                    "type": "image/tiff; application=geotiff; profile=cloud-optimized",
                    "title": "Cloud Optimized GeoTIFF",
                    "roles": ["data", "overview"]
                }
            },
            items=self.data['rasters'],
            collection_dir="raster",
        )

    def generate_master_catalog(self, vector_collection: Dict, raster_collection: Dict) -> Dict:
        """
        Generate the master catalog containing all collections.

        The two collection arguments are accepted for interface compatibility;
        the catalog links to the collections by fixed relative path and does
        not read them.
        """
        # One offset-aware timestamp so 'created' and 'updated' agree exactly.
        generated_at = datetime.now().astimezone().isoformat()

        catalog = {
            "type": "Catalog",
            "stac_version": "1.0.0",
            "id": "swhm-data-catalog",
            "title": "SWHM Data Catalog",
            "description": "Master catalog for SWHM vector and raster datasets",
            "created": generated_at,
            "updated": generated_at,
            "keywords": ["swhm", "vector", "raster", "geospatial"],
            "providers": self._providers(),
            "links": [
                {
                    "rel": "self",
                    "href": "./catalog.json",
                    "type": "application/json",
                    "title": "SWHM Data Catalog"
                },
                {
                    "rel": "child",
                    "href": "./vector/collection.json",
                    "type": "application/json",
                    "title": "Vector Collection"
                },
                {
                    "rel": "child",
                    "href": "./raster/collection.json",
                    "type": "application/json",
                    "title": "Raster Collection"
                }
            ],
            "conformsTo": [
                "https://api.stacspec.org/v1.0.0/core",
                "https://api.stacspec.org/v1.0.0/collections"
            ]
        }

        return catalog

    def generate_all_stac_items(self) -> Dict[str, List[Dict]]:
        """
        Generate all STAC items for vectors and rasters.

        Returns:
            Dict with 'vector_items' and 'raster_items', each a list of STAC
            item dicts in the same order as the crawler data.
        """
        # Crawler already handles deduplication, so every entry becomes an item.
        return {
            'vector_items': [
                self.generate_stac_item(vector, 'vector') for vector in self.data['vectors']
            ],
            'raster_items': [
                self.generate_stac_item(raster, 'raster') for raster in self.data['rasters']
            ]
        }

def save_json(data: Dict, filepath: str):
    """
    Save data to JSON file with pretty formatting.

    Args:
        data: JSON-serializable object to write.
        filepath: Destination path; missing parent directories are created.
    """
    # dirname() is '' for a bare filename, and os.makedirs('') raises,
    # so only create directories when the path actually has one.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be fixed rather than left to the platform default.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Saved: {filepath}")

In [3]:
class GCSUploader:
    """
    Handles uploading STAC catalog files to Google Cloud Storage.
    Supports both gsutil and native GCS client approaches.
    Sets Cache-Control headers to prevent browser caching of JSON files.

    gsutil is preferred when it is found on PATH. The native client is created
    on first need: at construction time when gsutil is missing, or later when a
    gsutil upload fails and a fallback is required.
    """

    # Seconds to wait for one gsutil copy before treating it as failed.
    _GSUTIL_TIMEOUT_SECONDS = 30

    def __init__(self, bucket_name: str, project_id: Optional[str] = None):
        """
        Initialize the GCS uploader.

        Args:
            bucket_name: Name of the GCS bucket
            project_id: GCP project ID (optional)
        """
        self.bucket_name = bucket_name
        self.project_id = project_id
        # Keep the resolved path so the executable that was detected is the one run.
        self._gsutil_path = shutil.which("gsutil")
        self.use_gsutil = self._gsutil_path is not None

        self.client = None
        self.bucket = None
        self._client_init_failed = False

        if self.use_gsutil:
            print("Using gsutil for uploads")
        elif self._ensure_client():
            print("Using native GCS client for uploads")
        else:
            print("Upload functionality will be limited")

    def _ensure_client(self) -> bool:
        """
        Create the native GCS client and bucket handle if not already present.

        Returns:
            True when ``self.client`` and ``self.bucket`` are usable, False when
            initialization failed (the failure is remembered and not retried).
        """
        if self.client is not None and self.bucket is not None:
            return True
        if getattr(self, "_client_init_failed", False):
            return False

        try:
            if self.project_id:
                client = storage.Client(project=self.project_id)
            else:
                client = storage.Client()
            self.bucket = client.bucket(self.bucket_name)
            self.client = client
            return True
        except Exception as e:
            print(f"WARNING: Could not initialize GCS client: {e}")
            self._client_init_failed = True
            return False

    def upload_directory(self, root_dir: str, prefix: str = "", dry_run: bool = False) -> Dict:
        """
        Upload all STAC JSON files from a directory structure to GCS.

        Args:
            root_dir: Local directory containing STAC files
            prefix: GCS path prefix (e.g., "stac/")
            dry_run: If True, show what would be uploaded without doing it

        Returns:
            Dictionary with upload results ('uploaded', 'skipped', 'failed'
            lists and 'total_files')

        Raises:
            ValueError: If ``root_dir`` is not an existing directory.
        """
        root_path = Path(root_dir).resolve()

        if not root_path.is_dir():
            raise ValueError(f"Directory does not exist: {root_path}")

        if prefix and not prefix.endswith("/"):
            prefix += "/"

        print(f"Scanning directory: {root_path}")
        print(f"Target GCS location: gs://{self.bucket_name}/{prefix}")
        print(f"Dry run: {dry_run}")
        print("-" * 50)

        results = {
            "uploaded": [],
            "skipped": [],
            "failed": [],
            "total_files": 0
        }

        # Find all JSON files to upload; sorted so output and results are deterministic
        json_files = sorted(root_path.rglob("*.json"))
        results["total_files"] = len(json_files)

        if not json_files:
            print("No JSON files found to upload")
            return results

        print(f"Found {len(json_files)} JSON files to upload")
        print("-" * 50)

        for json_file in json_files:
            try:
                self._upload_single_file(json_file, root_path, prefix, dry_run, results)
            except Exception as e:
                print(f"ERROR: Failed to upload {json_file}: {e}")
                results["failed"].append({
                    "file": str(json_file),
                    "error": str(e)
                })

        # Print summary
        self._print_summary(results)
        return results

    def _upload_single_file(self, file_path: Path, root_path: Path, prefix: str, 
                          dry_run: bool, results: Dict):
        """
        Upload a single file to GCS and record the outcome in ``results``.

        The object name is ``prefix`` plus the file's path relative to
        ``root_path``. In dry-run mode the file is only listed under 'skipped'.
        """
        relative_path = file_path.relative_to(root_path)
        gcs_path = f"{prefix}{relative_path.as_posix()}"
        gcs_url = f"gs://{self.bucket_name}/{gcs_path}"

        print(f"📄 {relative_path}")
        print(f"   → {gcs_url}")

        if dry_run:
            print("   → DRY RUN: Would upload with Cache-Control headers")
            results["skipped"].append(str(file_path))
            return

        # Try gsutil first, fall back to native client
        success = False

        if self.use_gsutil:
            success = self._upload_with_gsutil(file_path, gcs_url)

        # The client may not exist yet when gsutil was available at start-up,
        # so create it here on demand to make the fallback effective.
        if not success and self._ensure_client():
            success = self._upload_with_client(file_path, gcs_path)

        if success:
            print("   ✅ Upload successful with Cache-Control headers")
            results["uploaded"].append(str(file_path))
        else:
            print("   ❌ Upload failed")
            results["failed"].append({
                "file": str(file_path),
                "error": "All upload methods failed"
            })

        print()

    def _upload_with_gsutil(self, file_path: Path, gcs_url: str) -> bool:
        """
        Upload using gsutil command with Cache-Control headers.

        Returns:
            True on success; False on a non-zero exit, a timeout, or any other
            error (a short message is printed in each case).
        """
        try:
            # Set Cache-Control header to prevent browser caching
            # This forces browsers to re-fetch the STAC catalog files on each request
            cmd = [
                getattr(self, "_gsutil_path", None) or "gsutil",
                "-h", "Cache-Control:no-cache, no-store, must-revalidate",
                "-h", "Content-Type:application/json",
                "cp",
                str(file_path),
                gcs_url
            ]
            subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=True,
                timeout=self._GSUTIL_TIMEOUT_SECONDS
            )
            return True
        except subprocess.CalledProcessError as e:
            print(f"   gsutil error: {e.stderr.strip()}")
            return False
        except subprocess.TimeoutExpired:
            print("   gsutil timeout")
            return False
        except Exception as e:
            print(f"   gsutil exception: {e}")
            return False

    def _upload_with_client(self, file_path: Path, gcs_path: str) -> bool:
        """
        Upload using native GCS client with Cache-Control headers.

        Returns:
            True on success, False if any step raises (the error is printed).
        """
        try:
            blob = self.bucket.blob(gcs_path)

            # Set Cache-Control header to prevent browser caching
            blob.cache_control = "no-cache, no-store, must-revalidate"
            blob.content_type = "application/json"

            blob.upload_from_filename(str(file_path))
            return True
        except Exception as e:
            print(f"   GCS client error: {e}")
            return False

    def _print_summary(self, results: Dict):
        """Print upload summary: counts, each failure, and the target bucket on success."""
        print("=" * 50)
        print("📊 UPLOAD SUMMARY")
        print("=" * 50)
        print(f"Total files found: {results['total_files']}")
        print(f"Successfully uploaded: {len(results['uploaded'])}")
        print(f"Skipped (dry run): {len(results['skipped'])}")
        print(f"Failed: {len(results['failed'])}")

        if results['failed']:
            print("\n❌ Failed uploads:")
            for failure in results['failed']:
                if isinstance(failure, dict):
                    print(f"   • {failure['file']}: {failure['error']}")
                else:
                    print(f"   • {failure}")

        if results['uploaded']:
            print("\n✅ Upload complete! Files available at:")
            print(f"   gs://{self.bucket_name}/")
            print("\n🔄 Cache-Control headers set to 'no-cache, no-store, must-revalidate'")
            print("   This ensures STAC Browser always fetches the latest catalog data")


def upload_stac_catalog(root_dir: str, bucket_name: str, prefix: str = "", 
                       dry_run: bool = False, project_id: Optional[str] = None) -> Dict:
    """
    One-call helper: build a GCSUploader for the bucket and upload a STAC
    catalog directory with cache-busting headers.

    Args:
        root_dir: Local directory containing STAC files
        bucket_name: GCS bucket name
        prefix: GCS path prefix
        dry_run: If True, show what would be uploaded without doing it
        project_id: GCP project ID (optional)

    Returns:
        Dictionary with upload results, exactly as produced by
        GCSUploader.upload_directory
    """
    return GCSUploader(bucket_name, project_id).upload_directory(
        root_dir, prefix, dry_run
    )

In [4]:
def _save_items_beside_source(items, records, kind):
    """Save each STAC item to catalog/<stac_dir>/<id>.json and report unmatched items.

    Args:
        items: STAC item dicts; each must have an 'id'.
        records: Crawl records; each must have 'name' and 'stac_dir'.
            Assumes 'name' values are hashable (strings) — TODO confirm against the crawler.
        kind: Label used in the warning message (e.g. "vector", "raster").

    Returns:
        List of item ids for which no crawl record with a matching name exists.
    """
    # Index records by name once instead of scanning the list for every item.
    # setdefault keeps the FIRST record for a duplicated name, matching the
    # first-match behaviour of the original per-item scan.
    records_by_name = {}
    for record in records:
        records_by_name.setdefault(record['name'], record)

    unmatched = []
    for item in items:
        record = records_by_name.get(item['id'])
        if record is None:
            unmatched.append(item['id'])
            continue
        save_json(item, f"catalog/{record['stac_dir']}/{item['id']}.json")

    # Surface items that could not be placed instead of dropping them silently.
    if unmatched:
        print(f"   ⚠️ Skipped {len(unmatched)} {kind} item(s) with no matching crawl record: "
              f"{', '.join(str(item_id) for item_id in unmatched)}")
    return unmatched


def main():
    """Crawl the bucket, build the STAC catalog, and save it under ./catalog.

    Steps:
        1. Crawl the configured bucket/prefix with GCPBucketCrawler.
        2. Generate STAC items, the vector/raster collections and the master
           catalog with CatalogGenerator.
        3. Save catalog, collections, per-item JSON and a crawl summary via
           save_json, using paths relative to the current working directory.
    """
    # Configuration - update these values for your specific bucket
    bucket_name = "swhm_data"  # Just the bucket name, not the full URL
    prefix = "public/layers/"  # Path prefix within the bucket
    project_id = None  # Set your GCP project ID if needed
    
    # Initialize crawler
    crawler = GCPBucketCrawler(bucket_name, prefix, project_id)
    
    # Crawl the bucket
    print("Starting bucket crawl...")
    crawl_data = crawler.crawl_bucket()
    
    print(f"Found {len(crawl_data['vectors'])} vectors and {len(crawl_data['rasters'])} rasters")
    
    # Generate collections and catalog
    generator = CatalogGenerator(crawl_data)
    
    # Generate all STAC items
    print("Generating STAC items...")
    stac_items = generator.generate_all_stac_items()
    
    # Generate collections
    vector_collection = generator.generate_vector_collection()
    raster_collection = generator.generate_raster_collection()
    
    # Generate master catalog
    master_catalog = generator.generate_master_catalog(vector_collection, raster_collection)
    
    # Save all files with corrected structure
    print("\nSaving catalog files...")
    
    # Save master catalog
    save_json(master_catalog, "catalog/catalog.json")
    
    # Save collections (singular names)
    save_json(vector_collection, "catalog/vector/collection.json")
    save_json(raster_collection, "catalog/raster/collection.json")
    
    # Save individual STAC items in same directories as source data
    print("Saving individual STAC items...")
    _save_items_beside_source(stac_items['vector_items'], crawl_data['vectors'], "vector")
    _save_items_beside_source(stac_items['raster_items'], crawl_data['rasters'], "raster")
    
    # Save summary with enhanced metadata
    summary = {
        "crawl_summary": {
            "bucket_name": bucket_name,
            "prefix": prefix,
            # NOTE: datetime.now() yields a naive local-time timestamp (no UTC offset).
            "crawl_time": datetime.now().isoformat(),
            "total_items": crawl_data['total_items'],
            "vectors_found": len(crawl_data['vectors']),
            "rasters_found": len(crawl_data['rasters']),
            "stac_items_generated": len(stac_items['vector_items']) + len(stac_items['raster_items'])
        },
        "discovered_vectors": crawl_data['vectors'],
        "discovered_rasters": crawl_data['rasters'],
        "stac_structure": {
            "catalog": "catalog/catalog.json",
            "vector_collection": "catalog/vector/collection.json",
            "raster_collection": "catalog/raster/collection.json",
            "vector_items": [f"catalog/{v['stac_dir']}/{v['name']}.json" for v in crawl_data['vectors']],
            "raster_items": [f"catalog/{r['stac_dir']}/{r['name']}.json" for r in crawl_data['rasters']]
        }
    }
    save_json(summary, "catalog/crawl_summary.json")
    
    print(f"\n✅ Catalog generation complete!")
    print(f"   - Master catalog: catalog/catalog.json")
    print(f"   - Vector collection: catalog/vector/collection.json")
    print(f"   - Raster collection: catalog/raster/collection.json")
    print(f"   - Vector items: {len(stac_items['vector_items'])} items in their respective directories")
    print(f"   - Raster items: {len(stac_items['raster_items'])} items in their respective directories")
    print(f"   - Crawl summary: catalog/crawl_summary.json")
    
    # Print directory structure
    print(f"\n📁 Generated directory structure:")
    print(f"   catalog/")
    print(f"   ├── catalog.json")
    print(f"   ├── crawl_summary.json")
    print(f"   ├── vector/")
    print(f"   │   ├── collection.json")
    print(f"   │   ├── ItemName1/")
    print(f"   │   │   └── ItemName1.json")
    print(f"   │   └── ItemName2/")
    print(f"   │       └── ItemName2.json")
    print(f"   └── raster/")
    print(f"       ├── collection.json")
    print(f"       ├── ItemName1/")
    print(f"       │   └── ItemName1.json")
    print(f"       └── ItemName2/")
    print(f"           └── ItemName2.json")
    
    # Print authentication help if needed
    print(f"\n💡 If you encountered authentication errors:")
    print(f"   1. Install: pip install google-cloud-storage")
    print(f"   2. Set up authentication:")
    print(f"      - Service account: export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'")
    print(f"      - Or use: gcloud auth application-default login")

# Run the crawl-and-generate pipeline only when this code is executed directly
# (as a script or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Successfully connected to bucket: swhm_data
Starting bucket crawl...
Crawling bucket 'swhm_data' with prefix 'public/layers/'...
Found raster: Age_of_Imperviousness
Found raster: Flow_Duration_Index
Found raster: HSPF_Land_Cover_Type
Found raster: Hydrologic_Response_Units
Found raster: Imperviousness
Found raster: Land_Cover
Found raster: Land_Use
Found raster: Population_Density
Found raster: Precipitation_mm
Found raster: Runoff_mm
Found raster: Slope
Found raster: Slope_Categories
Found raster: Soils
Found raster: Total_Copper_Concentration
Found raster: Total_Kjeldahl_Nitrogen_Concentration
Found raster: Total_Phosphorus_Concentration
Found raster: Total_Suspended_Solids_Concentration
Found raster: Total_Zinc_Concentration
Found raster: Traffic
Found raster: copper_concentration_ug_per_L
Found vector: PugetSoundWA
Found vector: cig_grid_wgs
Processed 23 objects from bucket
Found 2 unique vectors and 20 unique rasters
Found 2 vectors and 20 rasters
Generating STAC items...

Saving 

In [6]:
# Configuration
GCS_BUCKET = "swhm_data"
GCS_PREFIX = "public/layers/"
# Local directory holding the generated STAC files. Set the STAC_CATALOG_DIR
# environment variable to point elsewhere; the fallback is the original
# machine-specific path.
CATALOG_DIR = os.environ.get(
    "STAC_CATALOG_DIR",
    "/Users/christiannilsen/Documents/repos/swmh-stac-catalog/catalog",
)
# The real upload runs immediately after the dry-run preview.
# Set to False to stop after the preview and leave the bucket untouched.
PERFORM_UPLOAD = True

# Example usage with new refactored uploader
print("=== UPLOADING STAC CATALOG TO GCS ===\n")

# Upload with dry run first to see what would be uploaded
print("1. Dry run to preview uploads:")
results = upload_stac_catalog(
    root_dir=CATALOG_DIR,
    bucket_name=GCS_BUCKET,
    prefix=GCS_PREFIX,
    dry_run=True
)

if PERFORM_UPLOAD:
    print(f"\n2. Actual upload:")
    # `results` is replaced by the outcome of the real upload.
    results = upload_stac_catalog(
        root_dir=CATALOG_DIR,
        bucket_name=GCS_BUCKET,
        prefix=GCS_PREFIX,
        dry_run=False
    )

=== UPLOADING STAC CATALOG TO GCS ===

1. Dry run to preview uploads:
Using gsutil for uploads
Scanning directory: /Users/christiannilsen/Documents/repos/swmh-stac-catalog/catalog
Target GCS location: gs://swhm_data/public/layers/
Dry run: True
--------------------------------------------------
Found 26 JSON files to upload
--------------------------------------------------
📄 scripts/ipynb/catalog/crawl_summary.json
   → gs://swhm_data/public/layers/scripts/ipynb/catalog/crawl_summary.json
   → DRY RUN: Would upload with Cache-Control headers
📄 scripts/ipynb/catalog/catalog.json
   → gs://swhm_data/public/layers/scripts/ipynb/catalog/catalog.json
   → DRY RUN: Would upload with Cache-Control headers
📄 scripts/ipynb/catalog/raster/collection.json
   → gs://swhm_data/public/layers/scripts/ipynb/catalog/raster/collection.json
   → DRY RUN: Would upload with Cache-Control headers
📄 scripts/ipynb/catalog/vector/collection.json
   → gs://swhm_data/public/layers/scripts/ipynb/catalog/vector/c

KeyboardInterrupt: 