# Aggregate Regional and Monthly Statistics

This notebook aggregates 1D and 2D binned statistics needed for figures.

## Import Necessary Packages

In [1]:
import warnings
import numpy as np
import xarray as xr
from numba import jit
from datetime import datetime
warnings.filterwarnings('ignore')

## User-Defined Configurations

Define the user's name/email, specify the directory where the $P$-$B_L$ data are stored, and set the directory where the binned statistics will be saved. Define subregions of interest with their respective latitude/longitude bounds, and set binning parameters for $B_L$/$\mathrm{CAPE_L}$/$\mathrm{SUBSAT_L}$, along with the precipitation threshold (in mm/day) separating precipitating from non-precipitating regimes.

In [2]:
# Author details; they are the default values written to the `history` attribute of each output dataset
AUTHOR    = 'Savannah L. Ferretti'
EMAIL     = 'savannah.ferretti@uci.edu'
# Directory the P-B_L datasets are read from, and directory the binned statistics are written to
FILEDIR   = '/global/cfs/cdirs/m4334/sferrett/monsoon-pod/data/processed'
SAVEDIR   = '/global/cfs/cdirs/m4334/sferrett/monsoon-pod/data/processed'
# Latitude/longitude bounds of each subregion of interest
REGIONS   = {
    'Eastern Arabian Sea':dict(latmin=9.,latmax=19.5,lonmin=64.,lonmax=72.),
    'Central India':dict(latmin=18.,latmax=24.,lonmin=76.,lonmax=83.),
    'Central Bay of Bengal':dict(latmin=9.,latmax=14.5,lonmin=86.5,lonmax=90.),
    'Equatorial Indian Ocean':dict(latmin=5.,latmax=10.,lonmin=62.,lonmax=67.5),
    'Konkan Coast':dict(latmin=15.,latmax=19.5,lonmin=69.,lonmax=72.5)}
# Lower bound, upper bound, and bin width for each binning variable
BINPARAMS = {
    'bl':dict(min=-0.6,max=0.1,width=0.0025),
    'cape':dict(min=-70.,max=20.,width=1.),
    'subsat':dict(min=-20.,max=70.,width=1.)}
# Precipitation threshold (mm/day) separating precipitating from non-precipitating points
PRTHRESH  = 0.25

## Load $P$-$B_L$ Datasets

Load in all three P-$B_L$ datasets.

In [7]:
def load(filename,filedir=FILEDIR):
    """Read a dataset file fully into memory and close the file afterwards.

    Args:
        filename: Name of the file inside `filedir`.
        filedir: Directory containing the file (defaults to `FILEDIR`).

    Returns:
        The dataset returned by `xr.open_dataset`, with its data loaded into memory.
    """
    filepath = f'{filedir}/{filename}'
    # Opening in a context manager releases the file handle as soon as the
    # data has been loaded; the in-memory dataset stays usable after closing.
    with xr.open_dataset(filepath) as ds:
        return ds.load()

In [8]:
# Load the three P-B_L datasets from the default input directory, in this order
hrimergprbl,lrimergprbl,lrgpcpprbl = (
    load(filename) for filename in (
        'HR_ERA5_IMERG_pr_bl_terms.nc',
        'LR_ERA5_IMERG_pr_bl_terms.nc',
        'LR_ERA5_GPCP_pr_bl_terms.nc'))

## Functions for Calculating Binned Statistics

Subset the $P$-$B_L$ dataset by the region of interest.

In [17]:
def get_region(data,key,regions=REGIONS):
    """Select the part of `data` that falls inside a named subregion.

    Args:
        data: Object with `lat` and `lon` coordinates that supports `.sel()`.
        key: Name of the subregion; must be a key of `regions`.
        regions: Mapping of subregion name to a dict with 'latmin', 'latmax',
            'lonmin', and 'lonmax' entries (defaults to `REGIONS`).

    Returns:
        The result of selecting `data` over the subregion's lat/lon ranges.
    """
    bounds   = regions[key]
    # NOTE(review): label-based slices presumably require lat/lon to be stored
    # in ascending order - confirm against the input files.
    latrange = slice(bounds['latmin'],bounds['latmax'])
    lonrange = slice(bounds['lonmin'],bounds['lonmax'])
    return data.sel(lat=latrange,lon=lonrange)

We compute binned statistics using both 1-D ($B_L$) and 2-D (joint $\text{SUBSAT}_\text{L}$-$\text{CAPE}_\text{L}$) binning schemes. For each 1-D and 2-D bin, we calculate three statistics: the total count of data points, the count of precipitating points (points exceeding ```PRTHRESH```), and the sum of precipitation values. [Numba's jit](https://numba.readthedocs.io/en/stable/user/jit.html) is used in ```fast_binned_stats()``` to accelerate the computation. The resulting statistics are then organized and stored in an ```xarray.Dataset```.

In [25]:
def get_bin_edges(key,binparams=BINPARAMS):
    """Build the evenly spaced bin values for one binning variable.

    The values start at 'min' and advance by 'width' up to and including 'max'.
    If (max-min) is not a whole number of widths, the last value is the first
    one beyond 'max'.

    Args:
        key: Name of the binning variable; must be a key of `binparams`.
        binparams: Mapping of variable name to a dict with 'min', 'max', and
            'width' entries (defaults to `BINPARAMS`).

    Returns:
        1-D NumPy array of bin values.
    """
    params = binparams[key]
    vmin,vmax,width = params['min'],params['max'],params['width']
    # np.arange with a fractional step can return one element more or fewer
    # than intended because of floating-point round-off, so the number of
    # steps is computed explicitly and snapped to the nearest integer when
    # the range is (numerically) a whole number of widths.
    ratio   = (vmax-vmin)/width
    nearest = np.round(ratio)
    nsteps  = int(nearest) if np.isclose(ratio,nearest) else int(np.ceil(ratio))
    return vmin+width*np.arange(nsteps+1)
     
@jit(nopython=True)
def fast_binned_stats(blidxs,capeidxs,subsatidxs,prdata,nblbins,ncapebins,nsubsatbins,prthresh=PRTHRESH):
    """Accumulate per-bin counts and precipitation sums in a single pass.

    Points whose precipitation value is not finite are skipped. A point is
    added to the 1-D statistics when its bl index is in range, and to the 2-D
    statistics when both its subsat and cape indices are in range.

    Args:
        blidxs, capeidxs, subsatidxs: Integer bin indices, read element by
            element in flat order alongside `prdata`.
        prdata: Precipitation values.
        nblbins, ncapebins, nsubsatbins: Number of bins along each axis.
        prthresh: A point counts as precipitating when its value exceeds this.

    Returns:
        Tuple (Q0, QE, Q1, P0, PE, P1): Q* have length `nblbins` and P* have
        shape (`nsubsatbins`, `ncapebins`). *0 holds point counts, *E holds
        counts of precipitating points, and *1 holds precipitation sums.
    """
    shape2d  = (nsubsatbins,ncapebins)
    Q0,QE,Q1 = np.zeros(nblbins),np.zeros(nblbins),np.zeros(nblbins)
    P0,PE,P1 = np.zeros(shape2d),np.zeros(shape2d),np.zeros(shape2d)
    for i in range(prdata.size):
        prval = prdata.flat[i]
        # Non-finite precipitation contributes to neither set of statistics
        if not np.isfinite(prval):
            continue
        wet = prval>prthresh
        blidx = blidxs.flat[i]
        if 0<=blidx<nblbins:
            Q0[blidx] += 1
            Q1[blidx] += prval
            if wet:
                QE[blidx] += 1
        subsatidx = subsatidxs.flat[i]
        capeidx   = capeidxs.flat[i]
        if 0<=subsatidx<nsubsatbins and 0<=capeidx<ncapebins:
            P0[subsatidx,capeidx] += 1
            P1[subsatidx,capeidx] += prval
            if wet:
                PE[subsatidx,capeidx] += 1
    return Q0,QE,Q1,P0,PE,P1

def _to_bin_indices(values,vmin,width,offset):
    """Convert values to int32 bin indices, sending non-finite values to -1 (out of range)."""
    scaled = (values-vmin)/width+offset
    # Casting NaN/inf to an integer gives a platform-dependent result, so they are
    # replaced with -1 first; fast_binned_stats() ignores negative indices.
    return np.where(np.isfinite(scaled),scaled,-1.).astype(np.int32)

def calc_binned_stats(data,binparams=BINPARAMS,prthresh=PRTHRESH,author=AUTHOR,email=EMAIL):
    """Compute 1-D (bl) and 2-D (subsat-cape) binned precipitation statistics.

    Args:
        data: Dataset-like object providing 'bl', 'cape', 'subsat', and 'pr' variables.
        binparams: Mapping of variable name to a dict with 'min', 'max', and 'width'
            entries (defaults to `BINPARAMS`).
        prthresh: Threshold above which a point counts as precipitating.
        author, email: Written to the `history` attribute of the output.

    Returns:
        xr.Dataset with Q0/QE/Q1 on the 'bl' dimension and P0/PE/P1 on the
        ('subsat','cape') dimensions, plus descriptive attributes.
    """
    bl,cape,subsat,pr = (data[var].values for var in ['bl','cape','subsat','pr'])
    blbins,capebins,subsatbins = get_bin_edges('bl',binparams),get_bin_edges('cape',binparams),get_bin_edges('subsat',binparams)
    # NOTE(review): bl uses a +0.5 offset while cape/subsat use -0.5; this is kept
    # as in the original - confirm that the asymmetry is intentional.
    blidxs     = _to_bin_indices(bl,binparams['bl']['min'],binparams['bl']['width'],0.5)
    capeidxs   = _to_bin_indices(cape,binparams['cape']['min'],binparams['cape']['width'],-0.5)
    subsatidxs = _to_bin_indices(subsat,binparams['subsat']['min'],binparams['subsat']['width'],-0.5)
    Q0,QE,Q1,P0,PE,P1 = fast_binned_stats(blidxs,capeidxs,subsatidxs,pr,blbins.size,capebins.size,subsatbins.size,prthresh)
    # The coordinates are the bin arrays computed above (the original referenced an undefined `bins`)
    ds = xr.Dataset(data_vars={'Q0':('bl',Q0),'QE':('bl',QE),'Q1':('bl',Q1),
                               'P0':(('subsat','cape'),P0),'PE':(('subsat','cape'),PE),'P1':(('subsat','cape'),P1)},
                    coords={'bl':blbins,'cape':capebins,'subsat':subsatbins})
    ds.Q0.attrs     = dict(long_name='Count of points in each bin')
    ds.QE.attrs     = dict(long_name=f'Count of precipitating ( > {prthresh} mm/day) points in each bin')
    ds.Q1.attrs     = dict(long_name='Sum of precipitation in each bin',units='mm/day')
    ds.P0.attrs     = dict(long_name='Count of points in each bin')
    ds.PE.attrs     = dict(long_name=f'Count of precipitating ( > {prthresh} mm/day) points in each bin')
    ds.P1.attrs     = dict(long_name='Sum of precipitation in each bin',units='mm/day')
    ds.bl.attrs     = dict(long_name='Average buoyancy in the lower troposphere',units='m/s²')
    ds.cape.attrs   = dict(long_name='Undilute plume buoyancy',units='K')
    ds.subsat.attrs = dict(long_name='Subsaturation in the lower free-troposphere',units='K')
    ds.attrs        = dict(history=f'Created on {datetime.today().strftime("%Y-%m-%d")} by {author} ({email})')
    return ds

## Execute Binned Statistics Calculation

We execute the aforementioned workflow by subregion. ```process_by_subregion()``` creates monthly binned statistics datasets, aggregates them by subregion, and merges them into a single ```xarray.Dataset```.

In [26]:
def process_by_subregion(ds,months=None,regions=REGIONS,binparams=BINPARAMS,prthresh=PRTHRESH,author=AUTHOR,email=EMAIL):
    """Compute binned statistics for every subregion and calendar month.

    Args:
        ds: Dataset with a `time` coordinate and the variables needed by
            `calc_binned_stats()`.
        months: Calendar months to process. When None (the default), every
            month present in `ds.time` is used.
        regions: Mapping of subregion name to lat/lon bounds (defaults to `REGIONS`).
        binparams: Binning parameters passed to `calc_binned_stats()`.
        prthresh: Precipitation threshold passed to `calc_binned_stats()`.
        author, email: Passed to `calc_binned_stats()`.

    Returns:
        Binned statistics concatenated along new 'month' and 'region' dimensions.
    """
    if months is None:
        months = np.unique(ds.time.dt.month.values)
    regionstatslist = []
    for region in regions:
        regiondata     = get_region(ds,region,regions)
        # Month of each time step; computed once per region rather than once per month
        datamonths     = regiondata.time.dt.month
        monthstatslist = []
        for month in months:
            monthdata  = regiondata.sel(time=datamonths==month)
            monthstats = calc_binned_stats(monthdata,binparams,prthresh,author,email)
            monthstatslist.append(monthstats.expand_dims({'month':[month]}))
        regionstats = xr.concat(monthstatslist,dim='month')
        regionstatslist.append(regionstats.expand_dims({'region':[region]}))
    return xr.concat(regionstatslist,dim='region')

In [28]:
# Compute region-by-month binned statistics for each P-B_L dataset, in this order
hrimergstats,lrimergstats,lrgpcpstats = (
    process_by_subregion(prbl) for prbl in (hrimergprbl,lrimergprbl,lrgpcpprbl))

## Save Statistics Datasets

Put all regions into a single Xarray.Dataset, and save as a netCDF file to the user-defined save directory (```SAVEDIR```).

In [29]:
def save(ds,filename,savedir=SAVEDIR):
    """Write `ds` to `{savedir}/{filename}` as a netCDF file.

    Args:
        ds: Object providing a `to_netcdf()` method.
        filename: Name of the output file.
        savedir: Directory the file is written to (defaults to `SAVEDIR`).
    """
    ds.to_netcdf(f'{savedir}/{filename}')

In [30]:
# Write each statistics dataset to its own file in the default save directory
for stats,filename in (
        (hrimergstats,'HR_ERA5_IMERG_binned_stats.nc'),
        (lrimergstats,'LR_ERA5_IMERG_binned_stats.nc'),
        (lrgpcpstats,'LR_ERA5_GPCP_binned_stats.nc')):
    save(stats,filename)