In [1]:
# 1. Setup and Imports
# Import necessary libraries (xarray, numpy, matplotlib, etc.)
# Set up any configuration or authentication needed for data access

import gcsfs
import fsspec
import warnings
import numpy as np
import xarray as xr
import planetary_computer
from datetime import datetime
import pystac_client as pystac
warnings.filterwarnings('ignore')

AUTHOR    = 'Savannah L. Ferretti'
EMAIL     = 'savannah.ferretti@uci.edu'
SAVEDIR   = '/ocean/projects/atm200007p/sferrett/Repos/monsoon-pr/data/raw'
YEARS     = [2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020]
MONTHS    = [6,7,8]
LATRANGE  = (5.,25.) 
LONRANGE  = (60.,90.)
LEVRANGE  = (500.,1000.)

In [4]:
# 2. Data Download and Opening
## 2.1 ERA5 Data
# Function to download and open ERA5 data
# Open ERA5 dataset for surface pressure, temperature, and specific humidity

## 2.2 IMERG Data
# Function to download and open IMERG data
# Open IMERG dataset

## 2.3 GPCP Data
# Function to download and open GPCP data
# Open GPCP dataset

def get_era5():
    store = 'gs://gcp-public-data-arco-era5/ar/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2/'
    ds    = xr.open_zarr(store,decode_times=True)
    ds    = ds.rename({'latitude':'lat','longitude':'lon','level':'lev'})    
    return ds

def get_imerg():
    store   = 'https://planetarycomputer.microsoft.com/api/stac/v1'
    catalog = pystac.Client.open(store,modifier=planetary_computer.sign_inplace)
    asset   = catalog.get_collection('gpm-imerg-hhr').assets['zarr-abfs']
    fs      = fsspec.get_mapper(asset.href,**asset.extra_fields['xarray:storage_options'])
    ds      = xr.open_zarr(fs,**asset.extra_fields['xarray:open_kwargs'])
    return ds

def get_gpcp():
    store = 'https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/gpcp-feedstock/gpcp.zarr'
    ds    = xr.open_dataset(store,engine='zarr',chunks={})
    ds    = ds.rename({'latitude':'lat','longitude':'lon'})    
    return ds

In [7]:
era5  = get_era5()
imerg = get_imerg()
gpcp  = get_gpcp()

In [13]:
def standardize(da):
    # Rename all dimensions at once
    dimnames = {'latitude':'lat','longitude':'lon','level':'lev'}
    da = da.rename({oldname:newname for oldname,newname in dimnames.items() if oldname in da.dims})
    # Determine dimensions
    dims = ['lev','time','lat','lon'] if 'lev' in da.dims else ['time','lat','lon']
    for dim in dims:
        if dim == 'time' and da.coords[dim].dtype.kind != 'M':
            da.coords[dim] = da.indexes[dim].to_datetimeindex()
        elif dim != 'time':
            da.coords[dim] = da.coords[dim].astype(float)
    da = da.sortby(dims).transpose(*dims)
    return da


    def standardize(da):
    # Rename dimensions all at once
    rename_dict = {'latitude': 'lat', 'longitude': 'lon', 'level': 'lev'}
    da = da.rename({k: v for k, v in rename_dict.items() if k in da.dims})

    # Determine dimensions
    dims = ['lev', 'time', 'lat', 'lon'] if 'lev' in da.dims else ['time', 'lat', 'lon']

    # Convert time to datetime if it's not already
    if 'time' in da.dims and da.coords['time'].dtype.kind != 'M':
        da = xr.decode_cf(da)

    # Convert non-time coordinates to float and sort them
    for dim in set(dims) - {'time'}:
        if dim in da.coords:
            da.coords[dim] = da.coords[dim].astype(float)
            da = da.sel({dim: sorted(da.coords[dim].values)})

    # Transpose dimensions, keeping any extra dimensions
    return da.transpose(*dims, ..., missing_dims='ignore')
T

In [14]:
standardize(imerg)

ValueError: ('time', 'lat', 'lon') must be a permuted list of Frozen({'time': 368160, 'lon': 3600, 'lat': 1800, 'latv': 2, 'lonv': 2, 'nv': 2}), unless `...` is included

In [10]:

prdata = (imerg.precipitationCal).where(imerg.precipitationCal>=0,0)*24 # mm/hr to mm/day
psdata = era5.surface_pressure/100 # Pa to hPa
qdata  = era5.specific_humidity
tdata  = era5.temperature

Unnamed: 0,Array,Chunk
Bytes,2.09 TiB,3.96 MiB
Shape,"(552264, 721, 1440)","(1, 721, 1440)"
Dask graph,552264 chunks in 2 graph layers,552264 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.09 TiB 3.96 MiB Shape (552264, 721, 1440) (1, 721, 1440) Dask graph 552264 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  721  552264,

Unnamed: 0,Array,Chunk
Bytes,2.09 TiB,3.96 MiB
Shape,"(552264, 721, 1440)","(1, 721, 1440)"
Dask graph,552264 chunks in 2 graph layers,552264 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
# 3. ERA5/IMERG Baseline Processing (hourly at 0.25 x 0.25 degrees)
## 3.1 ERA5 Processing
# Select required variables from ERA5
# Ensure hourly frequency (should already be the case)

psdata = era5.surface_pressure/100 # Pa to hPa
qdata  = era5.specific_humidity
tdata  = era5.temperature

In [None]:

## 3.2 IMERG Processing
# Change IMERG units from mm/hr to mm/day
# Resample IMERG from half-hourly to hourly
# Regrid IMERG from 0.1 x 0.1 to 0.25 x 0.25 degrees

## 3.3 Merge ERA5 and IMERG
# Merge processed ERA5 and IMERG datasets

# 4. ERA5/GPCP Baseline Processing (daily at 1 x 1 degrees)
## 4.1 ERA5 Processing
# Select required variables from ERA5
# Resample ERA5 from hourly to daily
# Regrid ERA5 from 0.25 x 0.25 to 1 x 1 degrees

## 4.2 GPCP Processing
# Ensure GPCP is at correct resolution (should already be the case)

## 4.3 Merge ERA5 and GPCP
# Merge processed ERA5 and GPCP datasets

# 5. Data Saving
## 5.1 Save ERA5/IMERG Baseline
# Save merged ERA5/IMERG dataset as netCDF

## 5.2 Save ERA5/GPCP Baseline
# Save merged ERA5/GPCP dataset as netCDF