In [2]:
import os
import sys
import boto3
import yaml
from botocore import UNSIGNED
from botocore.config import Config

from datetime import datetime, timedelta
from datetime import UTC
import io
import xarray as xr
import numpy as np

In [8]:
def read_config(config:os.PathLike)->dict:
    required_keys = ['bucket_name','prefix','product','nc_storage_dir','nc_filename_template']
    try:
        with open(config,'r') as f:
            cfg = yaml.safe_load(f)
        for key in required_keys:
            if key not in cfg:
                print(f"Error: key '{key}' not found in config.")
    except FileNotFoundError:
        raise
    return cfg

In [4]:
def transform_data(data,clip=None):
    ds = xr.open_dataset(data, engine='h5netcdf')
    # Original dimension
    nrows = ds.sizes["Rows"]   # atau "row"
    ncols = ds.sizes["Columns"] # atau "column"
    
    # metadata
    lat_min = float(ds.geospatial_lat_min)
    lat_max = float(ds.geospatial_lat_max)
    lon_min = float(ds.geospatial_lon_min)
    lon_max = float(ds.geospatial_lon_max)
    
    # generate grids coordinates
    lats = np.linspace(lat_max, lat_min, nrows)   # north → south
    lons = np.linspace(lon_min, lon_max, ncols)
    
    # Assign koordinat
    ds = ds.assign_coords(
        lat=("Rows", lats),
        lon=("Columns", lons)
    )
    
    # Ganti dimensi: row→lat, column→lon
    ds = ds.swap_dims({"Rows": "lat", "Columns": "lon"})
    
    # Tambahkan atribut CF
    ds["lat"].attrs = {
        "standard_name": "latitude",
        "long_name": "latitude",
        "units": "degrees_north"
    }
    ds["lon"].attrs = {
        "standard_name": "longitude",
        "long_name": "longitude",
        "units": "degrees_east"
    }
    
    if clip:
        north, south, west, east = clip
        ds_clip = ds['RRQPE'].sel(
        lon=slice(west, east),
        lat=slice(north, south) )
        ds_clip.attrs = ds.attrs.copy()

        ds_clip.attrs['geospatial_lat_min'] = float(ds_clip.lat.min())
        ds_clip.attrs['geospatial_lat_max'] = float(ds_clip.lat.max())
        ds_clip.attrs['geospatial_lon_min'] = float(ds_clip.lon.min())
        ds_clip.attrs['geospatial_lon_max'] = float(ds_clip.lon.max())
        return ds_clip
    
    else:
        return ds

In [5]:
def get_latest_file(bucket, prefixes, substring="GLB-5"):
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
    
    for prefix in prefixes:
        print(f"Looking for data at {prefix}")
        response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
        contents = response.get("Contents", [])
        if not contents:
            continue  # coba prefix berikutnya
        
        # filter berdasarkan substring
        filtered = [obj for obj in contents if substring in obj["Key"]]
        if filtered:
            # ambil objek terbaru tanpa sort full list
            latest = max(filtered, key=lambda x: x["LastModified"])
            return latest
    return None

In [6]:
def download_scampr(config:str|os.PathLike):
    now = datetime.now(UTC)
    now_1= now-timedelta(hours=1)
    print(f"Initializing download at {now:%m-%d %H:%M}")
    
    cfg = read_config(config)
    bucket_name = cfg.get('bucket_name')
    clip = cfg.get('clip')
    prefix = cfg.get('prefix').format(datestring=now.strftime('%Y/%m/%d/%H'))
    prefix_1 = cfg.get('prefix').format(datestring=now_1.strftime('%Y/%m/%d/%H'))
    prefixes = [prefix, prefix_1] 
    latest_obj = get_latest_file(bucket_name, prefixes)
    local_dir = cfg.get('local_storage_dir')
    
    if latest_obj:
        print(f"Latest data is {latest_obj['Key']}")
        #check if data is downloaded
        filename_aws = os.path.basename(latest_obj["Key"])
        timestamp = filename_aws.split("_")[3][1:]
        
        if clip:
            filename_check = cfg.get('filename_template').format(datestring=timestamp)
            local_file = os.path.join(local_dir,filename_check)
        else:
            local_file = os.path.join(local_dir,filename_aws)
        
        if not os.path.isfile(local_file):
            s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
            obj = s3.get_object(Bucket=bucket_name, Key=latest_obj["Key"])
            data = io.BytesIO(obj["Body"].read())
        elif os.path.isfile(local_file):
            #check file size
            file_size = os.path.getsize(local_file)
            # if file size less than 700KB, re-download
            if file_size < 700*1024:
                print(f"File size {file_size} bytes is less than 700KB, re-downloading...")
                s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
                obj = s3.get_object(Bucket=bucket_name, Key=latest_obj["Key"])
                data = io.BytesIO(obj["Body"].read())
            else:
                print(f"File already exists: {local_file}, skipping download.")
        else:
            print(f"File already exists: {local_file}, skipping download.")
                    
    else:
        print("No matching files found")
        data = None
        
    print("Transforming data to xarray dataset")
    ds = transform_data(data,clip)
    
    print("Saving to netcdf")
    file_datestring = datetime.strptime(ds.attrs['time_coverage_start'], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y%m%d%H%M000")
    
    filename = cfg.get('filename_template').format(datestring=file_datestring)
    output_file = os.path.join(local_dir,filename)
    ds.to_netcdf(output_file)

In [7]:
download_scampr("D:\Projects\scampr-nowcasting\config.yaml")

  download_scampr("D:\Projects\scampr-nowcasting\config.yaml")


Initializing download at 10-08 04:03
Error: key 'local_storage_dir' not found in config.
Error: key 'filename_template' not found in config.
Looking for data at BLEND/RainRate-Blend-INST/2025/10/08/04
Looking for data at BLEND/RainRate-Blend-INST/2025/10/08/03
Latest data is BLEND/RainRate-Blend-INST/2025/10/08/03/RRQPE-INST-GLB-5_v1r1_blend_s202510080340000_e202510080349599_c202510080359186.nc


  download_scampr("D:\Projects\scampr-nowcasting\config.yaml")


AttributeError: 'NoneType' object has no attribute 'format'

In [17]:
now = datetime.now(UTC)

In [18]:
s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
bucket_name = "noaa-enterprise-rainrate-pds"
prefix = f"BLEND/RainRate-Blend-INST/{now:%Y/%m/%d/%H/}"
prefix_1 = f"BLEND/RainRate-Blend-INST/{(now - timedelta(hours=1)):%Y/%m/%d/%H/}"

In [19]:
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
if 'Contents' not in response:
    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix_1)

objects = response.get("Contents", [])
filtered = [obj for obj in objects if "GLB-5" in obj["Key"]]
filtered = sorted(filtered, key=lambda x: x["LastModified"], reverse=True)


In [20]:
obj = s3_client.get_object(Bucket=bucket_name, Key=filtered[0]["Key"])
data = io.BytesIO(obj["Body"].read())

IndexError: list index out of range

In [23]:
objects

[{'Key': 'BLEND/RainRate-Blend-INST/2025/10/04/04/RRQPE-INST-GLB-2_v1r1_blend_s202510040400000_e202510040409599_c202510040413095.nc',
  'LastModified': datetime.datetime(2025, 10, 4, 4, 13, 32, tzinfo=tzutc()),
  'ETag': '"df61b43bdbfab5026dbd687435a56670"',
  'ChecksumAlgorithm': ['CRC32'],
  'ChecksumType': 'FULL_OBJECT',
  'Size': 2200848,
  'StorageClass': 'STANDARD'}]

In [7]:
ds = xr.open_dataset(data, engine='h5netcdf')

In [10]:
ds['RRQPE'].attrs

{'units': 'mm/h',
 'long_name': 'Rainfall Rate Quantitative Precipitation Estimates'}

In [8]:
# Original dimension
nrows = ds.sizes["Rows"]   # atau "row"
ncols = ds.sizes["Columns"] # atau "column"

# metadata
lat_min = float(ds.geospatial_lat_min)
lat_max = float(ds.geospatial_lat_max)
lon_min = float(ds.geospatial_lon_min)
lon_max = float(ds.geospatial_lon_max)

# generate grids coordinates
lats = np.linspace(lat_max, lat_min, nrows)   # north → south
lons = np.linspace(lon_min, lon_max, ncols)

# Assign koordinat
ds = ds.assign_coords(
    lat=("Rows", lats),
    lon=("Columns", lons)
)

# Ganti dimensi: row→lat, column→lon
ds = ds.swap_dims({"Rows": "lat", "Columns": "lon"})

# Tambahkan atribut CF
ds["lat"].attrs = {
    "standard_name": "latitude",
    "long_name": "latitude",
    "units": "degrees_north"
}
ds["lon"].attrs = {
    "standard_name": "longitude",
    "long_name": "longitude",
    "units": "degrees_east"
}

In [23]:
ds_clip = ds['RRQPE'].sel(
    lon=slice(lon_min, lon_max),
    lat=slice(25, -25)  # ingat: lat dari utara ke selatan
)

In [25]:
ds_clip.attrs = ds.attrs.copy()

In [27]:
ds_clip.attrs['geospatial_lat_min'] = float(ds_clip.lat.min())
ds_clip.attrs['geospatial_lat_max'] = float(ds_clip.lat.max())
ds_clip.attrs['geospatial_lon_min'] = float(ds_clip.lon.min())
ds_clip.attrs['geospatial_lon_max'] = float(ds_clip.lon.max())
start_time = datetime.strptime(ds_clip.attrs['time_coverage_start'], "%Y-%m-%dT%H:%M:%SZ")

In [33]:
ds_clip.to_netcdf(f"scampr_indonesia_{start_time:%Y%m%d%H%M}.nc")

In [16]:
ds_clip

NameError: name 'ds_clip' is not defined

In [34]:
lon_max

165

In [35]:
lon_min

70