## Import Packages

In [1]:
import gcsfs
import fsspec
import warnings
import numpy as np
import xarray as xr
import planetary_computer
from datetime import datetime
import pystac_client as pystac
warnings.filterwarnings('ignore')

## User-Defined Fields

In [13]:
# Provenance written into each output file's global 'history' attribute.
AUTHOR = 'Savannah L. Ferretti'
EMAIL = 'savannah.ferretti@uci.edu'

# Directory that receives the output NetCDF files.
SAVEDIR = '/ocean/projects/atm200007p/sferrett/Repos/monsoon-pr/data/interim'

# Temporal selection: calendar years 2000-2014 (inclusive) and months 6-8.
YEARS = list(range(2000, 2015))
MONTHS = [6, 7, 8]

# Spatial selection as (min, max) pairs; each pair is unpacked into a slice().
LATRANGE = (0.0, 30.0)
LONRANGE = (50.0, 90.0)
LEVRANGE = (500.0, 1000.0)

## Functions

In [None]:
def get_era5():
    """Open the ARCO-ERA5 Zarr store and shorten its dimension names.

    Returns:
        The dataset opened by ``xr.open_zarr`` (with time decoding enabled),
        with 'latitude', 'longitude' and 'level' renamed to 'lat', 'lon'
        and 'lev'.
    """
    store = 'gs://gcp-public-data-arco-era5/ar/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2/'
    short_names = {'latitude': 'lat', 'longitude': 'lon', 'level': 'lev'}
    return xr.open_zarr(store, decode_times=True).rename(short_names)

def get_imerg():
    """Open the 'gpm-imerg-hhr' Zarr asset from the Planetary Computer STAC API.

    Returns:
        The dataset opened by ``xr.open_zarr`` from the collection's
        'zarr-abfs' asset, using the storage options the asset advertises.
    """
    stac_url = 'https://planetarycomputer.microsoft.com/api/stac/v1'
    # sign_inplace is passed as the client's modifier so returned hrefs are signed.
    client = pystac.Client.open(stac_url, modifier=planetary_computer.sign_inplace)
    asset = client.get_collection('gpm-imerg-hhr').assets['zarr-abfs']
    storage_options = asset.extra_fields['xarray:storage_options']
    mapper = fsspec.get_mapper(asset.href, **storage_options)
    return xr.open_zarr(mapper, consolidated=True)

In [3]:
def preprocess(data,shape):
    """Standardize the dimensions and coordinates of a dataset.

    Drops every dimension not expected for ``shape``, converts a non-datetime
    time coordinate to a datetime index, casts the remaining coordinates to
    float, sorts along every kept dimension, and transposes to the canonical
    dimension order.

    Args:
        data: Object exposing ``dims``, ``coords``, ``indexes``, ``drop_dims``,
            ``sortby`` and ``transpose`` (an xarray Dataset in this notebook).
        shape: '3D' for (time, lat, lon) or '4D' for (time, lat, lon, lev).

    Returns:
        The standardized object.

    Raises:
        ValueError: If ``shape`` is neither '3D' nor '4D'.
    """
    if shape == '3D':
        dims = ['time','lat','lon']
    elif shape == '4D':
        dims = ['time','lat','lon','lev']
    else:
        # Previously an unsupported shape left `dims` unbound and failed later
        # with an UnboundLocalError; fail early with a clear message instead.
        raise ValueError(f"shape must be '3D' or '4D', got {shape!r}")
    # drop_dims returns a new object, so the coordinate assignments below do
    # not touch the caller's input.
    data = data.drop_dims(set(data.dims)-set(dims))
    for dim in dims:
        if dim == 'time':
            # dtype kind 'M' is numpy datetime64; anything else is converted.
            if data.coords[dim].dtype.kind != 'M':
                data.coords[dim] = data.indexes[dim].to_datetimeindex()
        else:
            data.coords[dim] = data.coords[dim].astype(float)
    # NOTE: longitudes are left in the source convention; no wrapping of
    # 0..360 values to -180..180 is applied.
    data = data.sortby(dims).transpose(*dims)
    return data

In [None]:
def subset(data,years=YEARS,months=MONTHS,latrange=LATRANGE,lonrange=LONRANGE,levrange=LEVRANGE):
    """Select the requested years, months, and lat/lon(/lev) ranges.

    Args:
        data: xarray object with 'time', 'lat' and 'lon' dimensions, and
            optionally 'lev'.
        years: Calendar years to keep.
        months: Calendar months to keep.
        latrange: (start, stop) pair unpacked into a label slice over 'lat'.
        lonrange: (start, stop) pair unpacked into a label slice over 'lon'.
        levrange: (start, stop) pair unpacked into a label slice over 'lev';
            only applied when 'lev' is a dimension of ``data``.

    Returns:
        The selected subset.
    """
    in_years = data['time.year'].isin(years)
    in_months = data['time.month'].isin(months)
    selected = data.sel(time=in_years & in_months)
    selected = selected.sel(lat=slice(*latrange), lon=slice(*lonrange))
    if 'lev' not in selected.dims:
        return selected
    return selected.sel(lev=slice(*levrange))

def resample(data,frequency):
    """Aggregate data onto 3- or 6-hourly time stamps.

    Time stamps are floored to the target interval and the data is grouped
    by the floored stamps.

    Args:
        data: xarray object with a datetime 'time' coordinate.
        frequency: '3-hourly mean' (mean of each 3-hour group) or
            '6-hourly' (first entry of each 6-hour group).

    Returns:
        A new, resampled object; ``data`` itself is left unmodified.

    Raises:
        ValueError: If ``frequency`` is not one of the supported strings.
    """
    if frequency == '3-hourly mean':
        window,reducer = '3H','mean'
    elif frequency == '6-hourly':
        window,reducer = '6H','first'
    else:
        # Silently returning un-resampled data would let it be labelled with
        # a frequency it does not have.
        raise ValueError(f"frequency must be '3-hourly mean' or '6-hourly', got {frequency!r}")
    # assign_coords returns a new object; assigning through data.coords[...]
    # would overwrite the time coordinate of the caller's object in place.
    floored = data.assign_coords(time=data.time.dt.floor(window))
    return getattr(floored.groupby('time'),reducer)()

def regrid(data,resolution,latrange=LATRANGE,lonrange=LONRANGE):
    """Interpolate data onto a regular lat/lon grid.

    Args:
        data: xarray object with 'lat' and 'lon' coordinates.
        resolution: Grid spacing, in the same units as the coordinates.
        latrange: (min, max) latitude bounds of the target grid.
        lonrange: (min, max) longitude bounds of the target grid.

    Returns:
        ``data`` interpolated onto the target grid; points outside the
        source grid are extrapolated.
    """
    # Use the caller-supplied ranges; the previous version ignored them and
    # always read the module-level LATRANGE/LONRANGE.
    # Adding `resolution` to the stop value makes np.arange include the upper
    # bound when the range is a multiple of the resolution.
    lats = np.arange(latrange[0],latrange[1]+resolution,resolution)
    lons = np.arange(lonrange[0],lonrange[1]+resolution,resolution)
    return data.interp(lat=lats,lon=lons,kwargs={'fill_value':'extrapolate'})

In [None]:
def dataset(data,varname,longname,units,model,frequency,author=AUTHOR,email=EMAIL):
    """Package an array as a single-variable Dataset with descriptive metadata.

    Args:
        data: Array-like xarray object exposing ``dims``, ``data`` and
            'time', 'lat', 'lon' (optionally 'lev') coordinates.
        varname: Name of the variable in the output Dataset.
        longname: Value of the variable's 'long_name' attribute.
        units: Value of the variable's 'units' attribute.
        model: Value of the global 'source' attribute.
        frequency: Value of the global 'frequency' attribute.
        author: Author name recorded in the global 'history' attribute.
        email: Author e-mail recorded in the global 'history' attribute.

    Returns:
        An ``xr.Dataset`` holding the variable, its coordinates and attributes.
    """
    has_lev = 'lev' in data.dims
    coord_names = ['time', 'lat', 'lon']
    if has_lev:
        coord_names.append('lev')
    coords = {name: data[name].data for name in coord_names}
    out = xr.Dataset({varname: (list(data.dims), data.data)}, coords)

    # Per-variable and per-coordinate metadata.
    out[varname].attrs = {'long_name': longname, 'units': units}
    out.time.attrs = {'long_name': 'Time'}
    out.lat.attrs = {'long_name': 'Latitude', 'units': '°N'}
    out.lon.attrs = {'long_name': 'Longitude', 'units': '°E'}
    if has_lev:
        out.lev.attrs = {'long_name': 'Pressure level', 'units': 'hPa'}

    # Global metadata, including a creation stamp with today's date.
    created = datetime.today().strftime("%Y-%m-%d")
    out.attrs = {
        'source': model,
        'frequency': frequency,
        'history': f'Created on {created} by {author} ({email})',
    }
    return out

def save(data,gridtype,savedir=SAVEDIR):
    """Compute a Dataset and write it to a NetCDF file.

    The file is written to ``{savedir}/OBS-{gridtype}_{varname}.nc``, where
    ``varname`` is the first key of ``data``.

    Args:
        data: Dataset to write.
        gridtype: Grid label embedded in the file name (e.g. 'HR' or 'LR').
        savedir: Output directory.

    Returns:
        Whatever ``to_netcdf`` returns for the written file.
    """
    varname = list(data.keys())[0]
    # Honour the `savedir` argument; the previous version ignored it and
    # always wrote to the module-level SAVEDIR.
    return data.compute().to_netcdf(f'{savedir}/OBS-{gridtype}_{varname}.nc')

## Import Data

In [4]:
# Open ERA5 and standardize it to (time, lat, lon, lev).
era5 = preprocess(get_era5(), shape='4D')

In [None]:
# Open IMERG and standardize it to (time, lat, lon).
imerg = preprocess(get_imerg(), shape='3D')

## Process Variables

In [None]:
# Specific humidity: subset to the study domain, take 6-hourly values, add metadata.
q = resample(subset(era5.specific_humidity), frequency='6-hourly')
q = dataset(q, varname='q', longname='Specific humidity', units='kg/kg',
            model='ERA5', frequency='6-hourly')

In [None]:
# Air temperature: subset to the study domain, take 6-hourly values, add metadata.
t = resample(subset(era5.temperature), frequency='6-hourly')
t = dataset(t, varname='t', longname='Air temperature', units='K',
            model='ERA5', frequency='6-hourly')

In [None]:
# Surface pressure, divided by 100 to match the 'hPa' unit declared below
# (presumably the source field is in Pa; confirm against the ERA5 metadata).
ps = era5.surface_pressure / 100
ps = resample(subset(ps), frequency='6-hourly')
ps = dataset(ps, varname='ps', longname='Surface pressure', units='hPa',
             model='ERA5', frequency='6-hourly')

In [None]:
# Precipitation, multiplied by 24 to match the 'mm/day' unit declared below
# (presumably the source rate is in mm/hr; confirm against the IMERG metadata).
pr = imerg.precipitationCal * 24
# Values failing `pr >= 0` are replaced by 0 before subsetting.
pr = pr.where(pr >= 0, 0)
pr = regrid(subset(pr), resolution=0.25)
pr = resample(pr, frequency='3-hourly mean')
pr = dataset(pr, varname='pr', longname='Precipitation flux', units='mm/day',
             model='IMERG V06', frequency='3-hourly mean')

## Save Variables

In [None]:
# Write q on its processed grid ('HR'), then regridded to 2.0 degrees ('LR').
save(q, 'HR')  # author's timing note: 1h 45m 54s
save(regrid(q, 2.0), 'LR')

In [None]:
# Write t on its processed grid ('HR'), then regridded to 2.0 degrees ('LR').
save(t, 'HR')  # author's timing note: 1h 49m 20s
save(regrid(t, 2.0), 'LR')

In [None]:
# Write ps on its processed grid ('HR'), then regridded to 2.0 degrees ('LR').
save(ps, 'HR')  # author's timing note: 3m 40s
save(regrid(ps, 2.0), 'LR')

In [None]:
# Write pr on its processed grid ('HR'), then regridded to 2.0 degrees ('LR').
save(pr, 'HR')  # author's timing note: 15m 14s
save(regrid(pr, 2.0), 'LR')