In [73]:
import os
import datetime
import pystac
import rasterio
from rio_stac.stac import create_stac_item # Import rio-stac
from google.cloud import storage
from urllib.parse import urlparse
from pathlib import Path
import tempfile
import shlex
import subprocess
import json

In [74]:

# --- Source imagery ---------------------------------------------------------
# Bucket that holds the COGs, and the object-name prefix ("folder") to scan.
# Leave the prefix empty to scan from the bucket root.
GCS_BUCKET = "swhm_data"
GCS_PREFIX = "public/layers/"

# --- Public location --------------------------------------------------------
# Public-facing base URL of the bucket, used to build accessible links in the
# STAC catalog. For GCS this is normally
# "https://storage.googleapis.com/{BUCKET_NAME}/{FILE_PATH}"; a custom domain
# can be substituted here.
ROOT_CATALOG_URL = "https://storage.googleapis.com/{}".format(GCS_BUCKET)

# --- Local output -----------------------------------------------------------
# Directory on the local machine that receives the generated STAC catalog.
OUTPUT_DIR = "../../stac_catalog"

# --- Catalog metadata -------------------------------------------------------
CATALOG_ID, CATALOG_TITLE = "swhm-catalog", "Stormwater Heatmap Catalog"
CATALOG_DESCRIPTION = (
    "A STAC catalog for COG imagery stored in GCS, created with rio-stac."
)

# Storage client for the "swhm-prod" project. list_blobs_with_prefix() reads
# this module-level name directly rather than taking it as a parameter.
client = storage.Client(project="swhm-prod")
# Bucket handle for GCS_BUCKET, obtained from the client above.
bucket = client.bucket(GCS_BUCKET)
# --- End of Configuration ---



In [75]:

def list_blobs_with_prefix(
    bucket_name: str,
    prefix: str,
    file_extension: str = '.tif',
    delimiter: str = None
) -> list[storage.blob.Blob]:
    """
    Return the blobs under ``prefix`` in ``bucket_name`` whose names end with
    ``file_extension``.

    Matching is case-insensitive on both the blob name and the requested
    extension. When the requested extension is ``.tif`` (the default), the
    ``.tiff`` spelling is accepted as well; any other extension is matched
    exactly as given, so asking for e.g. ``.json`` no longer returns TIFFs.

    Uses the module-level ``client`` for the listing request.

    Args:
        bucket_name: Name of the GCS bucket to list.
        prefix: Object-name prefix to restrict the listing to.
        file_extension: Suffix a blob name must end with (case-insensitive).
        delimiter: Optional delimiter passed through to ``list_blobs``. When
            set, any sub-prefixes reported by the listing are printed.

    Returns:
        The matching ``Blob`` objects, in listing order.
    """
    blobs = client.list_blobs(
        bucket_name,
        prefix=prefix,
        delimiter=delimiter
    )

    print(f"Fetching blobs from bucket '{bucket_name}' with prefix '{prefix}'...")

    # Names are lower-cased before comparison, so the requested extension must
    # be lower-cased too or an argument such as '.TIF' could never match.
    wanted = file_extension.lower()
    suffixes = ('.tif', '.tiff') if wanted == '.tif' else (wanted,)

    matching_blobs = [
        blob for blob in blobs if blob.name.lower().endswith(suffixes)
    ]

    if delimiter:
        # Read `prefixes` only after the listing has been iterated above; the
        # client library fills it in as result pages are fetched.
        prefixes = getattr(blobs, 'prefixes', None)
        if prefixes:
            print("Sub-prefixes found:")
            for p in prefixes:
                print(f"  {p}")

    return matching_blobs

In [76]:
# Collect every GeoTIFF blob under the configured bucket/prefix.
blobs = list_blobs_with_prefix(bucket_name=GCS_BUCKET, prefix=GCS_PREFIX)

Fetching blobs from bucket 'swhm_data' with prefix 'public/layers/'...


In [77]:
# Show the matched Blob objects as this cell's output.
blobs

[<Blob: swhm_data, public/layers/Age_of_Imperviousness/Age_of_Imperviousness.tif, 1751410190043610>,
 <Blob: swhm_data, public/layers/Flow_Duration_Index/Flow_Duration_Index.tif, 1751410190448045>,
 <Blob: swhm_data, public/layers/HSPF_Land_Cover_Type/HSPF_Land_Cover_Type.tif, 1751410190823108>,
 <Blob: swhm_data, public/layers/Hydrologic_Response_Units/Hydrologic_Response_Units.tif, 1751410191248083>,
 <Blob: swhm_data, public/layers/Imperviousness/Imperviousness.tif, 1751410191737902>,
 <Blob: swhm_data, public/layers/Land_Cover/Land_Cover.tif, 1751410192311633>,
 <Blob: swhm_data, public/layers/Land_Use/Land_Use.tif, 1751410192753101>,
 <Blob: swhm_data, public/layers/Population_Density/Population_Density.tif, 1751410193107658>,
 <Blob: swhm_data, public/layers/Precipitation_mm/Precipitation_mm.tif, 1751410193460039>,
 <Blob: swhm_data, public/layers/Runoff_mm/Runoff_mm.tif, 1751410193821228>,
 <Blob: swhm_data, public/layers/Slope/Slope.tif, 1751410194184174>,
 <Blob: swhm_data, pu

In [78]:

# for blob in blobs:
#     blob_name = blob.name
#     item_id = Path(blob_name).stem
#     public_href = f"https://storage.googleapis.com/{GCS_BUCKET}/{blob_name}"

#     output_json = f"{OUTPUT_DIR}/{item_id}.json"

#     # Build rio stac command
#     cmd = [
#         "rio", "stac",
#         "--id", item_id,
#         "--asset-name", "image",
#         "--asset-href", public_href,
#         "--asset-mediatype", "COG",
#         "--with-proj",
#         public_href,
#         "-o", output_json
#     ]

#     # Run command and print equivalent shell command for debugging
#     bash_command = ' '.join(shlex.quote(arg) for arg in cmd)
#     print("1.", bash_command)
#     subprocess.run(cmd, capture_output=True, text=True, check=True)


In [79]:

    # # Define the destination path in the bucket
    # destination_blob_name = blob_name.replace('.tif', '.json')
    
    # # Get a reference to the blob object in GCS
    # gcs_blob = bucket.blob(destination_blob_name)

    # print(f"Uploading {output_json} to gs://{GCS_BUCKET}/{destination_blob_name}")
    
    # # Upload the local JSON file to GCS
    # gcs_blob.upload_from_filename(output_json)
    
    # print("Upload successful.")
    # # # Upload resulting JSON to GCS
    # # json_name = f"{item_id}.json"
    # # gc_cmd = [
    # #     "gsutil", "cp",
    # #    output_json,
    # #     f"gs://{GCS_BUCKET}/{blob_name.replace('.tif', '.json')}"
    # # ]

    # # bash_command = ' '.join(shlex.quote(arg) for arg in gc_cmd)
    # # print("2.", bash_command)
    # # # Uncomment to actually upload
    # # subprocess.run(gc_cmd, capture_output=True, text=True, check=True)

In [80]:
# Root catalog. Identity and description come from the configuration cell
# (CATALOG_ID / CATALOG_TITLE / CATALOG_DESCRIPTION) instead of placeholder
# tutorial values, so the published catalog carries the intended metadata.
catalog = pystac.Catalog(
    id=CATALOG_ID,
    description=CATALOG_DESCRIPTION,
    title=CATALOG_TITLE,
)


In [81]:

# Build one STAC item per GeoTIFF blob and attach it to the catalog.
# NOTE: re-running this cell without re-creating `catalog` adds every item a
# second time.
for blob in blobs:
    # `blobs` was filtered case-insensitively, so match the same way here; a
    # case-sensitive check would silently drop objects such as "X.TIF".
    if not blob.name.lower().endswith((".tif", ".tiff")):
        continue

    # Item id is the file name without directory or extension.
    # NOTE(review): two blobs with the same base name in different folders
    # would produce the same id - confirm this cannot happen in the bucket.
    item_id = os.path.splitext(os.path.basename(blob.name))[0]

    # The asset href written into the item is the gs:// URI of the source.
    # NOTE(review): replace with blob.public_url if consumers need an HTTPS
    # link rather than a gs:// one.
    source_tif_gcs_uri = f"gs://{GCS_BUCKET}/{blob.name}"
    public_href = source_tif_gcs_uri

    # The blob's creation time stands in for the acquisition datetime.
    acquisition_datetime = blob.time_created

    # The raster itself is opened through the blob's public URL.
    stac_item = create_stac_item(
        blob.public_url,
        input_datetime=acquisition_datetime,
        id=item_id,
        asset_name="image",
        asset_href=public_href,
        asset_media_type="image/vnd.stac.geotiff; cloud-optimized=true",
        with_proj=True,
        with_raster=False,
    )

    # Pre-create the per-item output directory (<OUTPUT_DIR>/<item_id>).
    output_dir = f"{OUTPUT_DIR}/{item_id}"
    os.makedirs(output_dir, exist_ok=True)

    catalog.add_item(stac_item)

In [82]:
# Root every href at the public location of the layers prefix. The base comes
# from ROOT_CATALOG_URL so that pointing the configuration at a custom domain
# is honoured here instead of being overridden by a hard-coded host.
catalog.normalize_hrefs(root_href=f"{ROOT_CATALOG_URL}/{GCS_PREFIX}")

# Write the catalog to OUTPUT_DIR as a self-contained catalog.
catalog.save(catalog_type=pystac.CatalogType.SELF_CONTAINED, dest_href=OUTPUT_DIR)


In [85]:
# Save again to the same directory, this time as an ABSOLUTE_PUBLISHED catalog.
# NOTE(review): this presumably replaces the SELF_CONTAINED files written by
# the previous cell - confirm only one of the two saves is needed.
catalog.save(
    catalog_type=pystac.CatalogType.ABSOLUTE_PUBLISHED,
    dest_href=OUTPUT_DIR,
)

# Scratch

In [88]:
# Copy the root catalog.json into the bucket, under the layers prefix.
gc_cmd = ["gsutil", "cp"]
gc_cmd.append(f'{OUTPUT_DIR}/catalog.json')
gc_cmd.append(f"gs://{GCS_BUCKET}/{GCS_PREFIX}catalog.json")

# Echo a shell-quoted, copy-pasteable form of the command before running it.
bash_command = shlex.join(gc_cmd)
print("2.", bash_command)

# Run the upload. check=True raises CalledProcessError on a non-zero exit;
# the CompletedProcess is left as the cell's displayed value.
subprocess.run(gc_cmd, capture_output=True, text=True, check=True)

2. gsutil cp ../../stac_catalog/catalog.json gs://swhm_data/public/layers/catalog.json


CompletedProcess(args=['gsutil', 'cp', '../../stac_catalog/catalog.json', 'gs://swhm_data/public/layers/catalog.json'], returncode=0, stdout='', stderr='Copying file://../../stac_catalog/catalog.json [Content-Type=application/json]...\n/ [0 files][    0.0 B/  4.2 KiB]                                                \n/ [1 files][  4.2 KiB/  4.2 KiB]                                                \nOperation completed over 1 objects/4.2 KiB.                                      \n')

In [89]:
def upload_stac_assets(root_dir, bucket, prefix):
    """
    Upload STAC item JSON files found under ``root_dir`` to Google Cloud Storage.

    Walks ``root_dir`` recursively and selects every JSON file that has the
    same name as its parent directory (e.g. ``asset_a/asset_a.json``). Each
    match is copied with ``gsutil cp`` to
    ``gs://{bucket}/{prefix}{relative path}``, so the directory layout below
    ``root_dir`` is reproduced under the prefix.

    A failed copy is reported and the scan continues with the next file. If
    the ``gsutil`` executable cannot be found, the scan stops immediately.

    Args:
        root_dir (str): Directory to search in.
        bucket (str): The name of the GCS bucket to upload to.
        prefix (str): The prefix (sub-folder) within the GCS bucket. A
            trailing "/" is appended when missing; an empty or ``None``
            prefix uploads relative to the bucket root.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Normalise the prefix once. Without a trailing slash, "public/layers" and
    # "Traffic/Traffic.json" would fuse into "public/layersTraffic/Traffic.json".
    prefix = prefix or ""
    if prefix and not prefix.endswith("/"):
        prefix = f"{prefix}/"

    print(f"Starting scan in: {root_dir}")
    print(f"Uploading to: gs://{bucket}/{prefix}")
    print("-" * 30)

    # os.walk yields (directory path, sub-directory names, file names); the
    # sub-directory names are not needed here.
    for dirpath, _, filenames in os.walk(root_dir):
        # An item file is named after the directory that contains it,
        # e.g. directory 'asset_a' -> file 'asset_a.json'.
        expected_filename = f"{os.path.basename(dirpath)}.json"
        if expected_filename not in filenames:
            continue

        local_file_path = os.path.join(dirpath, expected_filename)

        # Path relative to root_dir, used to mirror the layout in GCS. Object
        # names always use "/", so convert the OS separator (e.g. "\\" on
        # Windows) before building the destination URI.
        relative_path = os.path.relpath(local_file_path, root_dir).replace(os.sep, "/")
        gcs_destination = f"gs://{bucket}/{prefix}{relative_path}"

        print(f"Found matching asset: {local_file_path}")
        print(f"  -> Uploading to: {gcs_destination}")

        try:
            # Argument list (no shell) so paths with spaces or metacharacters
            # are passed through verbatim.
            gc_cmd = [
                "gsutil",
                "cp",
                local_file_path,
                gcs_destination
            ]

            # Shell-quoted representation, for logging only.
            bash_command = ' '.join(shlex.quote(arg) for arg in gc_cmd)
            print(f"  -> Executing: {bash_command}")

            # check=True raises CalledProcessError if gsutil exits non-zero.
            subprocess.run(gc_cmd, capture_output=True, text=True, check=True)

            print("  -> Upload successful!")

        except FileNotFoundError:
            print("  -> ERROR: 'gsutil' command not found.")
            print("     Please ensure the Google Cloud SDK is installed and in your PATH.")
            # Nothing else can succeed without gsutil, so stop scanning.
            return
        except subprocess.CalledProcessError as e:
            # gsutil ran but reported an error; log it and keep going.
            print(f"  -> ERROR: Upload failed for {local_file_path}")
            print(f"  -> gsutil stderr: {e.stderr}")

        print("-" * 30)





In [90]:
# Resolve the output directory to an absolute path, then push every item JSON
# found beneath it to the configured bucket/prefix.
SEARCH_DIR = os.path.abspath(OUTPUT_DIR)
upload_stac_assets(root_dir=SEARCH_DIR, bucket=GCS_BUCKET, prefix=GCS_PREFIX)

Starting scan in: /Users/christiannilsen/Documents/repos/swmh-stac-catalog/catalog/stac_catalog
Uploading to: gs://swhm_data/public/layers/
------------------------------
Found matching asset: /Users/christiannilsen/Documents/repos/swmh-stac-catalog/catalog/stac_catalog/Traffic/Traffic.json
  -> Uploading to: gs://swhm_data/public/layers/Traffic/Traffic.json
  -> Executing: gsutil cp /Users/christiannilsen/Documents/repos/swmh-stac-catalog/catalog/stac_catalog/Traffic/Traffic.json gs://swhm_data/public/layers/Traffic/Traffic.json
  -> Upload successful!
------------------------------
Found matching asset: /Users/christiannilsen/Documents/repos/swmh-stac-catalog/catalog/stac_catalog/test-catalog/Traffic/Traffic.json
  -> Uploading to: gs://swhm_data/public/layers/test-catalog/Traffic/Traffic.json
  -> Executing: gsutil cp /Users/christiannilsen/Documents/repos/swmh-stac-catalog/catalog/stac_catalog/test-catalog/Traffic/Traffic.json gs://swhm_data/public/layers/test-catalog/Traffic/Traff