# Aggregate Regional and Monthly Statistics

This notebook aggregates 1D and 2D binned statistics needed for figures.

## Import Necessary Packages

In [1]:
import warnings
import numpy as np
import xarray as xr
from datetime import datetime
warnings.filterwarnings('ignore')

## User-Defined Configurations

Define the user's name/email, specify the directory where the $P$-$B_L$ data is stored, and set the directory where the binned statistics will be saved. Define subregions of interest with their respective latitude/longitude bounds, and set binning parameters for $B_L$/$\mathrm{CAPE_L}$/$\mathrm{SUBSAT_L}$, along with the precipitation threshold (in mm/day) separating precipitating from non-precipitating regimes. Specify the months for statistical analysis.

In [2]:
# Provenance written into the `history` attribute of every statistics dataset
AUTHOR    = 'Savannah L. Ferretti'
EMAIL     = 'savannah.ferretti@uci.edu'
# Input and output currently share one directory
FILEDIR   = '/global/cfs/cdirs/m4334/sferrett/monsoon-pod/data/processed'
SAVEDIR   = '/global/cfs/cdirs/m4334/sferrett/monsoon-pod/data/processed'
# Latitude/longitude bounding box of each subregion
REGIONS   = {
    'Eastern Arabian Sea':dict(latmin=9.,latmax=19.5,lonmin=64.,lonmax=72.),
    'Central India':dict(latmin=18.,latmax=24.,lonmin=76.,lonmax=83.),
    'Central Bay of Bengal':dict(latmin=9.,latmax=14.5,lonmin=86.5,lonmax=90.),
    'Equatorial Indian Ocean':dict(latmin=5.,latmax=10.,lonmin=62.,lonmax=67.5),
    'Konkan Coast':dict(latmin=15.,latmax=19.5,lonmin=69.,lonmax=72.5),
}
# Lower bound, upper bound, and bin width for each binned variable
BINPARAMS = {
    'bl':dict(min=-0.6,max=0.1,width=0.0025),
    'cape':dict(min=-70.,max=20.,width=1.),
    'subsat':dict(min=-20.,max=70.,width=1.),
}
# Precipitation threshold (mm/day) above which a sample counts as precipitating
PRTHRESH  = 0.25
# Calendar months to process
MONTHS    = [6,7,8]

## Load $P$-$B_L$ Datasets

Load in the high- and low-resolution P-$B_L$ datasets.

In [3]:
def load(filename,filedir=FILEDIR):
    """Read a netCDF file fully into memory and return it as an xarray.Dataset.

    Args:
        filename: Name of the file inside ``filedir``.
        filedir: Directory that contains the file (defaults to ``FILEDIR``).

    Returns:
        The dataset with all variables loaded into memory.
    """
    filepath = f'{filedir}/{filename}'
    # The context manager closes the underlying file once the data is in memory;
    # a bare open_dataset(...).load() would keep the file handle open afterwards
    with xr.open_dataset(filepath) as ds:
        return ds.load()

In [4]:
# High-resolution dataset
hirespbl = load(filename='ERA5_IMERG_pr_bl_terms.nc')
# Low-resolution dataset
lorespbl = load(filename='ERA5_GPCP_pr_bl_terms.nc')

## Functions for Calculating Binned Statistics

### Subset Region and Month

Subset the loaded dataset by the region of interest, and subset temporally for data within a specific month.

In [5]:
def get_region(data,key,regions=REGIONS):
    """Return the part of ``data`` inside the bounding box of a named region.

    Args:
        data: Object with ``lat`` and ``lon`` coordinates that supports ``.sel``.
        key: Name of the region to look up in ``regions``.
        regions: Mapping from region name to a dict with 'latmin', 'latmax',
            'lonmin', and 'lonmax' entries (defaults to ``REGIONS``).

    Returns:
        ``data`` selected between the region's latitude and longitude bounds.
    """
    bounds   = regions[key]
    latrange = slice(bounds['latmin'],bounds['latmax'])
    lonrange = slice(bounds['lonmin'],bounds['lonmax'])
    # NOTE(review): presumably lat/lon are stored in ascending order, otherwise a
    # min-to-max slice would select nothing - confirm against the input files
    return data.sel(lat=latrange,lon=lonrange)

def get_month(data,month):
    """Return only the time steps of ``data`` that fall in one calendar month.

    Args:
        data: Object with a datetime-like ``time`` coordinate that supports ``.sel``.
        month: Calendar month number to keep (e.g. 6 for June).

    Returns:
        ``data`` restricted to time steps whose month equals ``month``.
    """
    inmonth = (data.time.dt.month==month)
    return data.sel(time=inmonth)

### Calculate Binned Statistics

Establish 1D ($B_L$) and 2D (joint $\text{SUBSAT}_\text{L}$-$\text{CAPE}_\text{L}$) bins in which to aggregate statistics: the count of all data points, the count of precipitating data points, the sum of precipitation values, and the sum of squared precipitation values in each 1D and 2D bin.

In [6]:
def get_bin_edges(key,binparams=BINPARAMS):
    """Build the evenly spaced bin coordinate array for one binned variable.

    The array starts at the variable's 'min', advances by 'width', and ends at the
    first value that reaches 'max' (which is 'max' itself when the range is a whole
    number of widths).

    Args:
        key: Name of the variable to look up in ``binparams``.
        binparams: Mapping from variable name to a dict with 'min', 'max', and
            'width' entries (defaults to ``BINPARAMS``).

    Returns:
        1D numpy array of bin coordinates.
    """
    params   = binparams[key]
    binmin,binmax,binwidth = params['min'],params['max'],params['width']
    # Derive an integer step count first: np.arange with a fractional step and a float
    # stop can gain or lose its last element through round-off in (stop-start)/step
    ratio    = (binmax-binmin)/binwidth
    nearest  = np.round(ratio)
    if np.isclose(ratio,nearest,rtol=1e-9,atol=1e-9):
        nsteps = int(nearest)
    else:
        nsteps = int(np.ceil(ratio))
    binedges = binmin+binwidth*np.arange(nsteps+1)
    return binedges

def _flat_bin_indices(values,binmin,binwidth,offset,nbins):
    """Map sample values to integer bin indices, marking unusable samples with -1.

    The index of a sample is ``floor((value-binmin)/binwidth+offset)``. Non-finite
    samples and samples whose index falls outside ``[0,nbins)`` are returned as -1.
    """
    position = (np.asarray(values,dtype=np.float64)-binmin)/binwidth+offset
    idxs     = np.full(position.shape,-1,dtype=np.int64)
    finite   = np.isfinite(position)
    # Floor (not truncation toward zero) so samples just below the first bin are not
    # folded into bin 0; clip before casting so extreme values cannot overflow int64
    floored  = np.clip(np.floor(position[finite]),-1,nbins)
    idxs[finite] = floored.astype(np.int64)
    idxs[idxs>=nbins] = -1
    return idxs

def _accumulate(idxs,pr,prthresh,nbins):
    """Return per-bin (count, precipitating count, sum, sum of squares) of ``pr``."""
    count   = np.bincount(idxs,minlength=nbins).astype(np.float64)
    wet     = np.bincount(idxs[pr>prthresh],minlength=nbins).astype(np.float64)
    total   = np.bincount(idxs,weights=pr,minlength=nbins)
    sqtotal = np.bincount(idxs,weights=pr**2,minlength=nbins)
    return count,wet,total,sqtotal

def calc_binned_stats(data,binparams=BINPARAMS,prthresh=PRTHRESH,author=AUTHOR,email=EMAIL):
    """Aggregate precipitation statistics in 1D ``bl`` bins and 2D ``subsat``-``cape`` bins.

    Every sample with finite precipitation is assigned to a ``bl`` bin and to a joint
    (``subsat``, ``cape``) bin. Samples whose bin index falls outside the configured
    range, or whose binned variable is not finite, are left out of that binning.

    Args:
        data: Dataset with variables ``pr``, ``bl``, ``cape``, and ``subsat`` on the
            dimensions ``time``, ``lat``, and ``lon``.
        binparams: Mapping from 'bl'/'cape'/'subsat' to a dict with 'min', 'max', and
            'width' entries (defaults to ``BINPARAMS``).
        prthresh: Precipitation value above which a sample counts as precipitating.
        author: Name written into the ``history`` attribute.
        email: Email written into the ``history`` attribute.

    Returns:
        xarray.Dataset with, on the ``bl`` dimension, ``Q0`` (sample count), ``QE``
        (precipitating-sample count), ``Q1`` (precipitation sum), and ``Q2`` (sum of
        squared precipitation), and the same four statistics as ``P0``/``PE``/``P1``/``P2``
        on the (``subsat``, ``cape``) dimensions.
    """
    # Build the coordinates from the same `binparams` that drives the index calculation
    blbins      = get_bin_edges('bl',binparams)
    capebins    = get_bin_edges('cape',binparams)
    subsatbins  = get_bin_edges('subsat',binparams)
    nblbins     = blbins.size
    ncapebins   = capebins.size
    nsubsatbins = subsatbins.size
    def flatten(name):
        # A fixed dimension order keeps the flattened variables aligned sample by sample
        return data[name].transpose('time','lat','lon').values.ravel()
    pr          = flatten('pr').astype(np.float64)
    validpr     = np.isfinite(pr)
    # NOTE(review): bl uses a +0.5 offset while cape/subsat use -0.5, so a bin's
    # coordinate relates to its value range differently in 1D and 2D; the offsets are
    # kept exactly as they were - confirm that the asymmetry is intended
    blidxs      = _flat_bin_indices(flatten('bl'),binparams['bl']['min'],binparams['bl']['width'],0.5,nblbins)
    capeidxs    = _flat_bin_indices(flatten('cape'),binparams['cape']['min'],binparams['cape']['width'],-0.5,ncapebins)
    subsatidxs  = _flat_bin_indices(flatten('subsat'),binparams['subsat']['min'],binparams['subsat']['width'],-0.5,nsubsatbins)
    keep1d      = validpr&(blidxs>=0)
    Q0,QE,Q1,Q2 = _accumulate(blidxs[keep1d],pr[keep1d],prthresh,nblbins)
    keep2d      = validpr&(capeidxs>=0)&(subsatidxs>=0)
    # Row-major flat index into the (subsat, cape) grid
    flatidxs    = subsatidxs[keep2d]*ncapebins+capeidxs[keep2d]
    P0,PE,P1,P2 = (stat.reshape(nsubsatbins,ncapebins)
                   for stat in _accumulate(flatidxs,pr[keep2d],prthresh,nsubsatbins*ncapebins))
    ds = xr.Dataset(data_vars={'Q0':(('bl'),Q0),'QE':(('bl'),QE),'Q1':(('bl'),Q1),'Q2':(('bl'),Q2),
                               'P0':(('subsat','cape'),P0),'PE':(('subsat','cape'),PE),
                               'P1':(('subsat','cape'),P1),'P2':(('subsat','cape'),P2)},
                          coords={'subsat':subsatbins,'cape':capebins,'bl':blbins})
    ds.Q0.attrs     = dict(long_name='Count of points in each bin')
    ds.QE.attrs     = dict(long_name=f'Count of precipitating ( > {prthresh} mm/day) points in each bin')
    ds.Q1.attrs     = dict(long_name='Sum of precipitation in each bin',units='mm/day')
    ds.Q2.attrs     = dict(long_name='Sum of squared precipitation in each bin',units='mm²/day²')
    ds.P0.attrs     = dict(long_name='Count of points in each bin')
    ds.PE.attrs     = dict(long_name=f'Count of precipitating ( > {prthresh} mm/day) points in each bin')
    ds.P1.attrs     = dict(long_name='Sum of precipitation in each bin',units='mm/day')
    ds.P2.attrs     = dict(long_name='Sum of squared precipitation in each bin',units='mm²/day²')
    ds.cape.attrs   = dict(long_name='Undilute plume buoyancy',units='K')
    ds.subsat.attrs = dict(long_name='Subsaturation in the lower free-troposphere',units='K')
    ds.bl.attrs     = dict(long_name='Average buoyancy in the lower troposphere',units='m/s²')
    ds.attrs        = dict(history=f'Created on {datetime.today().strftime("%Y-%m-%d")} by {author} ({email})')
    return ds

## Execute Binned Statistics Calculation

Since the analysis is quite compute-intensive (for the high-resolution data), we execute the aforementioned workflow by subregion. ```process_by_subregion()``` creates monthly binned-statistics datasets for each subregion, concatenates them along a `month` dimension, and then concatenates the subregions along a `region` dimension into a single `xarray.Dataset`.

In [8]:
def process_by_subregion(ds,months=MONTHS,regions=REGIONS):
    """Compute binned statistics for every region and month and stack the results.

    Args:
        ds: Dataset to subset by region and month and pass to ``calc_binned_stats``.
        months: Calendar month numbers to process (defaults to ``MONTHS``).
        regions: Mapping from region name to its latitude/longitude bounds
            (defaults to ``REGIONS``).

    Returns:
        xarray.Dataset of binned statistics with added ``region`` and ``month``
        dimensions, labelled by region name and month number.
    """
    regionstatslist = []
    for region in regions:
        # Forward `regions` so the bounds come from the mapping being iterated rather
        # than from get_region's module-level default
        regiondata     = get_region(ds,region,regions)
        monthstatslist = []
        for month in months:
            monthdata  = get_month(regiondata,month)
            monthstats = calc_binned_stats(monthdata)
            monthstatslist.append(monthstats.expand_dims({'month':[month]}))
        regionstats = xr.concat(monthstatslist,dim='month')
        regionstatslist.append(regionstats.expand_dims({'region':[region]}))
    return xr.concat(regionstatslist,dim='region')

In [13]:
# Binned statistics for the high-resolution dataset
hiresstats = process_by_subregion(ds=hirespbl)
# Binned statistics for the low-resolution dataset
loresstats = process_by_subregion(ds=lorespbl)

## Save Statistics Datasets

Put all regions into a single `xarray.Dataset`, and save it as a netCDF file to the user-defined save directory (```SAVEDIR```).

In [11]:
def save(ds,filename,savedir=SAVEDIR):
    """Write a dataset to a netCDF file.

    Args:
        ds: Dataset to write.
        filename: Name of the output file inside ``savedir``.
        savedir: Directory to write into (defaults to ``SAVEDIR``).
    """
    ds.to_netcdf(f'{savedir}/{filename}')

In [12]:
# Write the high- and low-resolution statistics to separate netCDF files
save(ds=hiresstats,filename='ERA5_IMERG_binned_stats.nc')
save(ds=loresstats,filename='ERA5_GPCP_binned_stats.nc')