# Download and Save Cloud Data

This notebook preprocesses and downloads the variables needed to calculate the precipitation-buoyancy POD from two cloud stores.

## Import Necessary Packages

In [1]:
import gcsfs
import fsspec
import warnings
import numpy as np
import xarray as xr
import planetary_computer
from datetime import datetime
import pystac_client as pystac
warnings.filterwarnings('ignore')

## User-Defined Fields

Fields defining user information, the save directory, and subsetting parameters (years, months, and latitude/longitude/pressure level ranges).

In [2]:
# Provenance: written into the 'history' attribute of every output dataset
AUTHOR    = 'Savannah L. Ferretti'
EMAIL     = 'savannah.ferretti@uci.edu'
# Directory the netCDF files are written to
SAVEDIR   = '/ocean/projects/atm200007p/sferrett/Repos/monsoon-pr/data/raw'
# Temporal subset: years 2000 through 2020 (inclusive), months June-August
YEARS     = list(range(2000,2021))
MONTHS    = [6,7,8]
# Spatial subset as (low, high) bounds: latitude, longitude, and pressure level (hPa)
LATRANGE  = (5.,25.)
LONRANGE  = (60.,90.)
LEVRANGE  = (500.,1000.)
# Pandas frequency alias that timestamps are floored to during resampling
FREQUENCY = 'H'

## Import ERA5 and IMERG Data

Hourly ERA5 data is available via the [LEAP Pangeo Data Catalog](https://catalog.leap.columbia.edu/) at 0.25$^\circ$ x 0.25$^\circ$ resolution. Half-hourly IMERG V06 data is available on [Planetary Computer](https://planetarycomputer.microsoft.com/dataset/gpm-imerg) at 0.1$^\circ$ x 0.1$^\circ$ resolution. The ```get_era5()``` and ```get_imerg()``` functions lazily load each dataset as an ```xarray.Dataset```.

In [3]:
def get_era5():
    """Open the ARCO ERA5 Zarr store and shorten its spatial dimension names.

    Returns:
        xarray.Dataset with 'latitude', 'longitude', and 'level' renamed to
        'lat', 'lon', and 'lev'.
    """
    store   = 'gs://gcp-public-data-arco-era5/ar/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2/'
    renames = {'latitude':'lat','longitude':'lon','level':'lev'}
    return xr.open_zarr(store,decode_times=True).rename(renames)

def get_imerg():
    """Open the half-hourly GPM IMERG Zarr store listed in the Planetary Computer STAC catalog.

    Returns:
        xarray.Dataset opened from the 'zarr-abfs' asset of the 'gpm-imerg-hhr' collection.
    """
    # sign_inplace is passed as the client's modifier so the asset it returns is signed
    catalog = pystac.Client.open('https://planetarycomputer.microsoft.com/api/stac/v1',modifier=planetary_computer.sign_inplace)
    asset   = catalog.get_collection('gpm-imerg-hhr').assets['zarr-abfs']
    # The asset carries its own storage options for opening the store through fsspec
    mapper  = fsspec.get_mapper(asset.href,**asset.extra_fields['xarray:storage_options'])
    return xr.open_zarr(mapper,consolidated=True)

In [4]:
# Open both cloud datasets (ERA5 first, then IMERG)
era5,imerg = get_era5(),get_imerg()

## Specify Variables We Need

Only four variables are needed from these two datasets: precipitation from IMERG V06, and surface pressure, specific humidity, and temperature from ERA5. Units are converted where necessary, and unrealistic (negative) precipitation values are set to zero.

In [5]:
# Precipitation: values failing the >=0 test are replaced by 0, then the rate is scaled by 24.
# NOTE(review): any NaN (missing) samples also fail the >=0 test and become 0 - confirm this is intended.
prdata = imerg['precipitationCal']
prdata = prdata.where(prdata>=0,0)*24 # mm/hr to mm/day
# ERA5 fields: only surface pressure needs a unit conversion
psdata = era5['surface_pressure']/100 # Pa to hPa
qdata  = era5['specific_humidity']
tdata  = era5['temperature']

## Preprocess Data

The ```preprocess()``` function preprocesses each variable using the user-defined fields above. It standardizes dimensions, subsets the time and space dimensions, selects the pressure levels to keep (if applicable), resamples the data to the specified sampling frequency by keeping the first sample in each period (i.e., instantaneous values rather than a time-mean), and regrids the IMERG V06 precipitation data to the same grid as ERA5. It also records the date on which each dataset was created, along with the contact information of the user who created it.

In [6]:
def standardize(da):
    """Return a copy of `da` with uniform coordinate dtypes, sorted coordinates, and a fixed dimension order.

    The time coordinate is converted to a pandas DatetimeIndex when it is not already
    datetime64, every other coordinate is cast to float, and the result is sorted along
    and transposed to ('lev','time','lat','lon'), or ('time','lat','lon') when there is
    no 'lev' dimension.

    Unlike item assignment on `da.coords`, the input object is left unmodified, so
    re-running a cell that calls this function always starts from the same data.

    Args:
        da: xarray.DataArray with 'time', 'lat', 'lon', and optionally 'lev' dimensions.

    Returns:
        A new xarray.DataArray; `da` itself is not changed.
    """
    dims = ['lev','time','lat','lon'] if 'lev' in da.dims else ['time','lat','lon']
    newcoords = {}
    for dim in dims:
        if dim == 'time':
            # Only non-datetime64 time axes need conversion
            if da.coords[dim].dtype.kind != 'M':
                newcoords[dim] = da.indexes[dim].to_datetimeindex()
        else:
            newcoords[dim] = da.coords[dim].astype(float)
    # assign_coords returns a new object instead of mutating the caller's DataArray
    da = da.assign_coords(newcoords)
    return da.sortby(dims).transpose(*dims)

def subset(da,years=YEARS,months=MONTHS,latrange=LATRANGE,lonrange=LONRANGE,levrange=LEVRANGE):
    """Restrict `da` to the requested years/months and to the lat/lon (and pressure level) ranges.

    Args:
        da: xarray.DataArray with 'time', 'lat', 'lon', and optionally 'lev' coordinates.
        years, months: collections of calendar years and month numbers to keep.
        latrange, lonrange, levrange: (start, stop) pairs used as label slices; 'lev'
            is only sliced when `da` has that dimension.

    Returns:
        The subsetted xarray.DataArray.
    """
    inyears  = da.time.dt.year.isin(years)
    inmonths = da.time.dt.month.isin(months)
    da = da.sel(time=inyears&inmonths)
    # Label-based slices; bounds are given as (start, stop) in coordinate order
    bounds = {'lat':slice(*latrange),'lon':slice(*lonrange)}
    if 'lev' in da.dims:
        bounds['lev'] = slice(*levrange)
    return da.sel(bounds)
    
def resample(da,frequency=FREQUENCY):
    """Downsample `da` in time by keeping the first sample of each `frequency` period.

    Timestamps are floored to `frequency`, and the first value within each resulting
    group is retained (no averaging is performed). The floored time axis is attached to
    a new object, so the caller's DataArray keeps its original timestamps.

    Args:
        da: xarray.DataArray with a datetime-like 'time' coordinate.
        frequency: pandas frequency alias that timestamps are floored to.

    Returns:
        A new xarray.DataArray with one sample per distinct floored timestamp.
    """
    # assign_coords avoids overwriting the input's time coordinate in place
    floored = da.assign_coords(time=da.time.dt.floor(frequency))
    return floored.groupby('time').first()

def regrid(da,resolution,latrange=LATRANGE,lonrange=LONRANGE):
    """Interpolate `da` onto a regular lat/lon grid with spacing `resolution`.

    Args:
        da: xarray.DataArray with 'lat' and 'lon' coordinates.
        resolution: grid spacing, in the same units as the lat/lon coordinates.
        latrange, lonrange: (low, high) bounds of the target grid.

    Returns:
        The interpolated xarray.DataArray; target points outside the source
        coordinates are extrapolated.
    """
    # The stop value is padded by one step so the upper bound itself is included.
    # NOTE(review): np.arange with a non-integer step can vary in length by one element
    # for steps that are not exactly representable in binary - verify if `resolution`
    # is ever changed from 0.25.
    target = {
        'lat':np.arange(latrange[0],latrange[1]+resolution,resolution),
        'lon':np.arange(lonrange[0],lonrange[1]+resolution,resolution)}
    return da.interp(target,kwargs={'fill_value':'extrapolate'})

def dataset(da,shortname,longname,units,source,frequency=FREQUENCY,author=AUTHOR,email=EMAIL):
    """Wrap `da` in a new xarray.Dataset carrying descriptive metadata.

    Args:
        da: xarray.DataArray with 'time', 'lat', 'lon', and optionally 'lev' dimensions.
        shortname: name of the data variable in the returned Dataset.
        longname, units: stored as the variable's 'long_name' and 'units' attributes.
        source: stored as the Dataset's global 'source' attribute.
        frequency: accepted but not used by this function.
        author, email: written into the global 'history' attribute with today's date.

    Returns:
        xarray.Dataset holding the single variable `shortname` on the dimension coordinates of `da`.
    """
    dimnames = list(da.dims)
    ds = xr.Dataset(
        {shortname:(dimnames,da.data)},
        {name:da.coords[name].data for name in dimnames})
    ds[shortname].attrs = {'long_name':longname,'units':units}
    # Coordinate metadata; 'lev' is only described when the data has that dimension
    coordattrs = [
        ('time',{'long_name':'Time'}),
        ('lat',{'long_name':'Latitude','units':'°N'}),
        ('lon',{'long_name':'Longitude','units':'°E'})]
    if 'lev' in ds.dims:
        coordattrs.append(('lev',{'long_name':'Pressure level','units':'hPa'}))
    for name,attrs in coordattrs:
        getattr(ds,name).attrs = attrs
    ds.attrs = {
        'source':source,
        'history':f'Created on {datetime.today().strftime("%Y-%m-%d")} by {author} ({email})'}
    return ds

def preprocess(da,shortname,longname,units,source,years=YEARS,months=MONTHS,resolution=None,latrange=LATRANGE,lonrange=LONRANGE,levrange=LEVRANGE,frequency=FREQUENCY,author=AUTHOR,email=EMAIL):
    """Run the full preprocessing chain on one variable and return it as a labelled Dataset.

    Steps, in order: standardize() -> subset() -> resample() (only when the inferred
    time frequency differs from `frequency`) -> regrid() (only when `resolution` is
    given) -> dataset().

    Args:
        da: xarray.DataArray to preprocess.
        shortname, longname, units, source: metadata forwarded to dataset().
        years, months: temporal subset forwarded to subset().
        resolution: target grid spacing; when None (or 0) no regridding is done.
        latrange, lonrange: spatial bounds used for both subsetting and the regrid target grid.
        levrange: pressure-level bounds forwarded to subset().
        frequency: target sampling frequency (pandas alias).
        author, email: provenance forwarded to dataset().

    Returns:
        xarray.Dataset produced by dataset().
    """
    da = standardize(da)
    da = subset(da,years,months,latrange,lonrange,levrange)
    # infer_freq yields None for an irregular time axis, which also triggers resampling
    if xr.infer_freq(da.time) != frequency:
        da = resample(da,frequency)
    if resolution:
        # Build the target grid from the same bounds used for subsetting, not the module defaults
        da = regrid(da,resolution,latrange,lonrange)
    # Keywords are required here: dataset() takes `frequency` before `author`/`email`,
    # so passing them positionally would shift author into frequency and email into author.
    return dataset(da,shortname,longname,units,source,frequency=frequency,author=author,email=email)

In [7]:
# IMERG precipitation is the only field that is regridded (to 0.25 degrees)
pr = preprocess(
    prdata,
    shortname='pr',longname='Precipitation flux',units='mm/day',
    source='IMERG V06',resolution=0.25)
# ERA5 fields keep their native grid
ps = preprocess(
    psdata,
    shortname='ps',longname='Surface pressure',units='hPa',
    source='ERA5')
q = preprocess(
    qdata,
    shortname='q',longname='Specific humidity',units='kg/kg',
    source='ERA5')
t = preprocess(
    tdata,
    shortname='t',longname='Air temperature',units='K',
    source='ERA5')

In [8]:
# Report the size of each dataset in GB (bytes * 1e-9); labels are padded so the numbers align
for label,ds in (('pr:',pr),('ps:',ps),('q: ',q),('t: ',t)):
    print(f'Size of {label} {ds.nbytes*1e-9:.2f} GB')

Size of pr: 1.82 GB
Size of ps: 1.82 GB
Size of q:  29.09 GB
Size of t:  29.09 GB


## Save Variables

Save each variable Xarray.Dataset as a netCDF file to the user-defined save directory (```SAVEDIR```).

In [21]:
def save(ds,filename,savedir=SAVEDIR):
    """Write `ds` to `<savedir>/<filename>` as a netCDF file.

    Args:
        ds: xarray.Dataset to write.
        filename: name of the output file, including its extension.
        savedir: directory the file is written into.
    """
    ds.to_netcdf('{}/{}'.format(savedir,filename))

In [None]:
# Trailing comments record previously observed wall times for each write.
# Specific humidity (q) is written by the dedicated %%time cell that follows, so it is
# not saved here as well; otherwise a full run would write the same large file twice.
save(pr,'IMERG_precipitation_flux.nc') # 22min 24s
save(ps,'ERA5_surface_pressure.nc')    # 7min 3s
save(t,'ERA5_temperature.nc')          # 3h 32min 51s

In [None]:
%%time
# Write specific humidity in its own cell so %%time reports the wall time of this save alone
save(q,'ERA5_specific_humidity.nc')