In [1]:
import os
from datetime import datetime
import pystac
import rasterio
from rio_stac.stac import create_stac_item # Import rio-stac
from google.cloud import storage
from urllib.parse import urlparse
from pathlib import Path
import tempfile
import shlex
import subprocess
import json
import geopandas as gpd

In [2]:

# --- Configuration ---

# Catalog-level metadata.
CATALOG_ID = "swhm-catalog"
CATALOG_TITLE = "Stormwater Heatmap Catalog"
CATALOG_DESCRIPTION = "A STAC catalog for COG imagery stored in GCS, created with rio-stac."

# Source data: the GCS bucket and the object-name prefix under which the COGs live.
# GCS_PREFIX is concatenated directly with further path parts, so keep its
# trailing slash (or use "" for the bucket root).
GCS_BUCKET = "swhm_data"
GCS_PREFIX = "public/layers/"

# Public HTTPS locations, used to build the links written into the STAC JSON.
# A custom domain could be substituted for the storage.googleapis.com host.
ROOT_CATALOG_URL = f"https://storage.googleapis.com/{GCS_BUCKET}"
CATALOG_JSON_DEST = f"{ROOT_CATALOG_URL}/{GCS_PREFIX}"

# Local directory that receives the generated STAC files.
OUTPUT_DIR = "../../stac_catalog"

# Storage client and a handle on the source bucket.
client = storage.Client(project="swhm-prod")
bucket = client.bucket(GCS_BUCKET)
# --- End of Configuration ---



In [3]:

def list_blobs_with_prefix(
    bucket_name: str,
    prefix: str,
    file_extension: str = '.tif',
    delimiter: str = None
) -> list[storage.blob.Blob]:
    """
    List the blobs under a prefix in a GCS bucket that have a given extension.

    Matching is case-insensitive: both the blob name and ``file_extension``
    are lower-cased before comparison, so ``'.TIF'`` and ``'.tif'`` behave
    the same. Names ending in ``.tiff`` are always accepted in addition to
    ``file_extension``.

    The listing is performed with the notebook-level ``client``.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Only blobs whose name starts with this prefix are listed.
        file_extension: Extension (including the dot) to keep.
        delimiter: Optional delimiter passed through to the listing call.
            When given, any sub-prefixes reported by the listing are printed.

    Returns:
        The matching Blob objects, in listing order.
    """
    print(f"Fetching blobs from bucket '{bucket_name}' with prefix '{prefix}'...")

    blobs = client.list_blobs(
        bucket_name,
        prefix=prefix,
        delimiter=delimiter
    )

    # Blob names are lower-cased for the comparison, so the requested extension
    # has to be lower-cased too; str.endswith accepts a tuple of suffixes.
    suffixes = (file_extension.lower(), '.tiff')
    matching_blobs = [blob for blob in blobs if blob.name.lower().endswith(suffixes)]

    if delimiter:
        # Read after the listing has been fully iterated above.
        # NOTE(review): `prefixes` is expected to be filled in by the client
        # library while it pages through results -- confirm against its docs.
        prefixes = getattr(blobs, 'prefixes', None)
        if prefixes:
            print("Sub-prefixes found:")
            for p in prefixes:
                print(f"  {p}")

    return matching_blobs

In [4]:
# Collect every GeoTIFF blob under the configured bucket/prefix.
blobs = list_blobs_with_prefix(bucket_name=GCS_BUCKET, prefix=GCS_PREFIX)

Fetching blobs from bucket 'swhm_data' with prefix 'public/layers/'...


In [5]:
# Display the blobs returned by the listing call.
blobs

[<Blob: swhm_data, public/layers/raster/Age_of_Imperviousness/Age_of_Imperviousness.tif, 1751472552998768>,
 <Blob: swhm_data, public/layers/raster/Flow_Duration_Index/Flow_Duration_Index.tif, 1751472553396022>,
 <Blob: swhm_data, public/layers/raster/HSPF_Land_Cover_Type/HSPF_Land_Cover_Type.tif, 1751472553874294>,
 <Blob: swhm_data, public/layers/raster/Hydrologic_Response_Units/Hydrologic_Response_Units.tif, 1751472554321731>,
 <Blob: swhm_data, public/layers/raster/Imperviousness/Imperviousness.tif, 1751472554814663>,
 <Blob: swhm_data, public/layers/raster/Land_Cover/Land_Cover.tif, 1751472555411437>,
 <Blob: swhm_data, public/layers/raster/Land_Use/Land_Use.tif, 1751472555877676>,
 <Blob: swhm_data, public/layers/raster/Population_Density/Population_Density.tif, 1751472556255188>,
 <Blob: swhm_data, public/layers/raster/Precipitation_mm/Precipitation_mm.tif, 1751472556599150>,
 <Blob: swhm_data, public/layers/raster/Runoff_mm/Runoff_mm.tif, 1751472556952871>,
 <Blob: swhm_data, p

In [6]:
# Create the root catalog, normalize its hrefs against OUTPUT_DIR and save it.
catalog = pystac.Catalog(
    id="raster-catalog",
    description="Tutorial catalog.",
)
catalog.normalize_and_save(root_href=OUTPUT_DIR, catalog_type="ABSOLUTE_PUBLISHED")

In [7]:
# Collection

# Define collection metadata
collection_id = "raster"
description = "A collection of COGs for my project"
license = "CC-BY-4.0"  # or "proprietary", "public-domain", etc.

# Start of the temporal extent. This is kept under its own name on purpose:
# assigning it to `datetime` would replace the `datetime` class imported at the
# top of the notebook, and re-running this cell would then fail on
# `datetime.now()`.
collection_start = datetime.now()

collection = pystac.Collection(
    id=collection_id,
    description=description,
    extent=pystac.Extent(
        spatial=pystac.SpatialExtent([[-180.0, -90.0, 180.0, 90.0]]),  # update with real extent
        temporal=pystac.TemporalExtent([[collection_start, None]])  # update with real range
    ),
    license=license,
    title="My STAC Collection",
    keywords=["COG", "satellite", "example"],
)

# Link to the full text of the license declared above.
collection.add_link(pystac.Link(rel="license", target="https://creativecommons.org/licenses/by/4.0/"))

# Optionally save to file


In [8]:
# Preview, for the first raster only, the URL obtained by swapping the
# extension of the blob's public URL for ".json".
blob = blobs[0]
item_url = blob.public_url
self_href = f"{os.path.splitext(item_url)[0]}.json"
self_href

'https://storage.googleapis.com/swhm_data/public/layers/raster/Age_of_Imperviousness/Age_of_Imperviousness.json'

In [9]:
# Properties stamped onto every item.
# NOTE(review): these values describe Sentinel-2 imagery -- confirm they really
# apply to the layers in this bucket.
props  = {
    'gsd': 10,  # Ground Sample Distance in meters
    'platform': 'sentinel-2a',
    'constellation': 'sentinel-2'
}

for blob in blobs:
    # Same case-insensitive extension test as list_blobs_with_prefix, so a blob
    # that was listed is never silently skipped here.
    if not blob.name.lower().endswith((".tif", ".tiff")):
        continue

    # The public URL is used both to read the raster and as the asset href.
    item_url = blob.public_url
    # Item id = file name without directory or extension.
    item_id = os.path.splitext(os.path.basename(blob.name))[0]

    # The blob's creation time stands in for the acquisition time.
    acquisition_datetime = blob.time_created

    stac_item = create_stac_item(
        source=item_url,
        input_datetime=acquisition_datetime,
        id=item_id,
        asset_name="image",
        # Each item gets its own copy of the shared properties. Passing the
        # same dict to every call would let per-raster fields (e.g. the
        # projection info requested by with_proj=True) leak between items.
        # NOTE(review): rio-stac is understood to update the dict it is handed;
        # copying is harmless either way.
        properties=dict(props),
        asset_href=item_url,
        asset_media_type="image/tiff; application=geotiff; profile=cloud-optimized",
        with_proj=True,
        with_raster=False,
    )

    collection.add_item(stac_item)

In [10]:
# Give the collection a self href ("collection.json"); the hrefs are normalized
# against the public root URL in a later cell.
collection.set_self_href("collection.json")

In [11]:
# Echo the local output directory set in the configuration cell.
OUTPUT_DIR

'../../stac_catalog'

In [None]:

# Rewrite the hrefs of the collection and its items relative to the public
# location of the collection. The root is derived from the configuration rather
# than repeated as a literal; with the current settings it resolves to
# https://storage.googleapis.com/swhm_data/public/layers/raster
collection.normalize_hrefs(
    root_href=f"{CATALOG_JSON_DEST}{collection_id}"
)



In [22]:
# Save the collection and its items locally under OUTPUT_DIR/<collection_id>.
output_path = f"{OUTPUT_DIR}/{collection_id}"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Use the CatalogType enum member: the string "ABSOLUTE PUBLISHED" (with a
# space) is not a valid catalog type, so the save would not be treated as
# ABSOLUTE_PUBLISHED.
collection.save(dest_href=output_path, catalog_type=pystac.CatalogType.ABSOLUTE_PUBLISHED)

# Upload to Google Cloud Storage

In [22]:
def upload_stac_assets(root_dir, bucket, prefix):
    """
    Find STAC JSON files under a directory and copy them to Google Cloud Storage.

    The directory tree under ``root_dir`` is walked, and every JSON file that
    has the same name as its parent directory (e.g. ``asset_a/asset_a.json``)
    is copied with ``gsutil cp`` to ``gs://<bucket>/<prefix><relative path>``,
    preserving the directory structure relative to ``root_dir``. Other files
    are ignored.

    A failed upload is reported and the scan continues. If the ``gsutil``
    executable cannot be found, an error is printed and the function returns
    immediately.

    Args:
        root_dir (str): Directory to search (absolute or relative path).
        bucket (str): The name of the GCS bucket to upload to.
        prefix (str): The prefix (sub-folder) within the GCS bucket. A trailing
            slash is added if missing; an empty prefix targets the bucket root.

    Returns:
        None
    """
    # Treat the prefix as a folder: without the separator it would be glued
    # onto the first path component of every uploaded file.
    if prefix and not prefix.endswith("/"):
        prefix = f"{prefix}/"

    print(f"Starting scan in: {root_dir}")
    print(f"Uploading to: gs://{bucket}/{prefix}")
    print("-" * 30)

    for dirpath, _, filenames in os.walk(root_dir):
        # Only a file named after its own directory counts as an asset,
        # e.g. directory 'asset_a' -> file 'asset_a.json'.
        expected_filename = f"{os.path.basename(dirpath)}.json"
        if expected_filename not in filenames:
            continue

        local_file_path = os.path.join(dirpath, expected_filename)

        # The path relative to root_dir becomes the object name suffix. The
        # destination is a gs:// URL, so convert OS-specific separators to '/'.
        relative_path = os.path.relpath(local_file_path, root_dir).replace(os.sep, "/")
        gcs_destination = f"gs://{bucket}/{prefix}{relative_path}"

        print(f"Found matching asset: {local_file_path}")
        print(f"  -> Uploading to: {gcs_destination}")

        try:
            # Argument list (no shell), so paths are never shell-interpreted.
            gc_cmd = [
                "gsutil",
                "cp",
                local_file_path,
                gcs_destination
            ]

            # Shell-quoted rendering of the command, for logging only.
            bash_command = ' '.join(shlex.quote(arg) for arg in gc_cmd)
            print(f"  -> Executing: {bash_command}")

            # check=True raises CalledProcessError on a non-zero exit status.
            subprocess.run(gc_cmd, capture_output=True, text=True, check=True)

            print("  -> Upload successful!")

        except FileNotFoundError:
            print("  -> ERROR: 'gsutil' command not found.")
            print("     Please ensure the Google Cloud SDK is installed and in your PATH.")
            # Nothing else can succeed without gsutil, so stop scanning.
            return
        except subprocess.CalledProcessError as e:
            # Report the failure and carry on with the remaining assets.
            print(f"  -> ERROR: Upload failed for {local_file_path}")
            print(f"  -> gsutil stderr: {e.stderr}")

        print("-" * 30)





In [24]:
# Upload the locally generated item JSON files for the raster collection.
upload_stac_assets(
    root_dir='../../stac_catalog/raster',
    bucket=GCS_BUCKET,
    prefix=f"{GCS_PREFIX}raster/",
)

Starting scan in: ../../stac_catalog/raster
Uploading to: gs://swhm_data/public/layers/raster/
------------------------------
Found matching asset: ../../stac_catalog/raster/Traffic/Traffic.json
  -> Uploading to: gs://swhm_data/public/layers/raster/Traffic/Traffic.json
  -> Executing: gsutil cp ../../stac_catalog/raster/Traffic/Traffic.json gs://swhm_data/public/layers/raster/Traffic/Traffic.json
  -> Upload successful!
------------------------------
Found matching asset: ../../stac_catalog/raster/Slope/Slope.json
  -> Uploading to: gs://swhm_data/public/layers/raster/Slope/Slope.json
  -> Executing: gsutil cp ../../stac_catalog/raster/Slope/Slope.json gs://swhm_data/public/layers/raster/Slope/Slope.json
  -> Upload successful!
------------------------------
Found matching asset: ../../stac_catalog/raster/Land_Use/Land_Use.json
  -> Uploading to: gs://swhm_data/public/layers/raster/Land_Use/Land_Use.json
  -> Executing: gsutil cp ../../stac_catalog/raster/Land_Use/Land_Use.json gs://s

# Vector Data

## Vector Collection

# Scratch