In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from fluxdataqaqc import Data, QaQc, Plot
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models.formatters import DatetimeTickFormatter
from bokeh.models import LinearAxis, Range1d
from bokeh.io import output_notebook
output_notebook()

import warnings
warnings.filterwarnings('ignore')

## Load input

In [None]:
# --- Station configuration ----------------------------------------------
station = 'US-UTW'
config_path = f'../../station_config/{station}.ini'

# Build the Data object from the station config and tag its index with a
# 30-minute frequency.
d = Data(config_path)
d.df.index.freq = '30min'

# Same data with columns renamed through d.inv_map.
df = d.df.rename(columns=d.inv_map)

# Site identifier, reused to name the output file below.
site_id = d.site_id

# Basic statistics of the energy balance inputs, with columns renamed through
# d.variables, written to "<site_id>.html".
vars_we_want = ['H', 'LE', 'Rn', 'G']
eb_inputs = df[vars_we_want].rename(columns=d.variables)
eb_inputs.describe().to_html(f'{site_id}.html')

# QaQc object: daily_frac and the max_interp_* arguments control how sub-daily
# gaps are filtered and interpolated (see "Filter days with sub-daily gaps").
q = QaQc(
    d,
    daily_frac=3/4,
    max_interp_hours=4,
    max_interp_hours_night=6,
)

# Energy Balance Ratio correction with ET gap filling; keep the resulting frame.
q.correct_data(meth='ebr', et_gap_fill=True)
ebr_gapfilled = q.df

# Bowen Ratio alternative, currently disabled:
#q.correct_data(meth='br',et_gap_fill=True)
#br_gapfilled = q.df

# Access the monthly results, then call QaQc.write()
# (presumably saves the results to disk -- confirm against the library docs).
q.monthly_df
q.write()

In [None]:
# Keep the monthly results of the QaQc object under a short name; the
# OpenET comparison plot below reads `monthly['ET']` and `monthly['ET_corr']`.
monthly = q.monthly_df

In [None]:
# Show which columns are available in the monthly table.
monthly.columns


In [None]:
import matplotlib.pyplot as plt

# OpenET export: keep the first 15 columns, index by timestamp, then resample
# to month-end ('ME') stamps with forward fill.
openet_csv = "../openet-et-4974925.csv"
field_data = pd.read_csv(openet_csv, usecols=list(range(15)))
field_data['DateTime'] = pd.to_datetime(field_data['DateTime'])
field_data = field_data.set_index('DateTime').resample('ME').ffill()

# Station ET is divided by 25.4 before plotting
# (presumably mm -> inches, matching the 'ET in/mo' axis label -- confirm units).
MM_PER_INCH = 25.4

fig, ax = plt.subplots()
ax.plot(
    field_data.index, field_data['Ensemble ET'],
    marker='.', linestyle='--', label='OpenET Ensemble',
)
ax.plot(monthly.index, monthly['ET'] / MM_PER_INCH, marker='.', label='Measured')
ax.plot(monthly.index, monthly['ET_corr'] / MM_PER_INCH, marker='.', label='Closed', alpha=0.5)

ax.grid()
ax.set_ylabel('ET in/mo')
ax.legend()
ax.set_title('US-UTW Wellington')


In [None]:
import numpy as np  # not used in this cell, but later cells rely on `np` being imported here

import matplotlib.pyplot as plt

# Months to show, and the x-axis tick for the first day of each month.
# Day-of-year positions are taken from 2020 (a leap year), as before; they are
# now computed directly as integers instead of being stacked with the labels
# into a string array and cast back.
months_used = [5, 6, 7, 8, 9]
month_firsts = pd.to_datetime([f'2020-{month:02d}-01' for month in months_used])
month_starts_doy = month_firsts.day_of_year.tolist()
month_labels = month_firsts.strftime('%b').tolist()

# Rows of the EBR-corrected results that fall in the selected months.
summer = ebr_gapfilled[ebr_gapfilled.index.month.isin(months_used)]

fig, ax = plt.subplots(2, 1, sharex=True, figsize=(10, 5))

# One line per year, plotted against day of year so the years overlay.
for year in summer.index.year.unique():
    yr_df = summer[summer.index.year == year]
    doy = yr_df.index.day_of_year
    ax[0].plot(doy, yr_df['ET'], label=year, alpha=0.5)
    ax[1].plot(doy, yr_df['ET_corr'], label=year, alpha=0.5)

titles = ('Measured ET', 'Closed ET')
for panel, title in zip(ax, titles):
    panel.grid()
    # Set the ticks and labels explicitly
    panel.set_xticks(month_starts_doy, month_labels)
    panel.set_ylabel('ET mm/day')
    panel.set_title(title)

# Both panels share the same year/colour mapping, so one legend is enough.
ax[1].legend()

plt.tight_layout()


In [None]:

# Lookup of month number -> (day of year of the 1st, abbreviated month name),
# built from the twelve month starts of 2020.
# np.column_stack mixes integers with strings, so the day-of-year values end up
# stored as text inside the dictionary.
# NOTE(review): the next cell hard-codes months 4-6 and a later cell rebuilds
# all of these names before reading them, so this cell looks like a leftover.
doy_ms = pd.date_range(start='2020-01-01', end='2020-12-01', freq='MS').day_of_year
mo_no = pd.date_range(start='2020-01-01', end='2020-12-01', freq='MS')
mo_nm = [f"{m:%b}" for m in mo_no]
mo_no_dict = dict(zip(mo_no.month, np.column_stack((doy_ms, mo_nm))))
months_used = [7,8,9]



In [None]:
import matplotlib.pyplot as plt

# April-June subset of the EBR-corrected results. Stored as `spring` so it does
# not overwrite the May-September `summer` frame built in an earlier cell.
spring_months = [4, 5, 6]
spring = ebr_gapfilled[ebr_gapfilled.index.month.isin(spring_months)]

fig, ax = plt.subplots(1, 2, sharey=True, figsize=(10, 5))

# One line per year, plotted against day of year so the years overlay.
for year in spring.index.year.unique():
    yr_df = spring[spring.index.year == year]
    doy = yr_df.index.day_of_year
    ax[0].plot(doy, yr_df['ET'], label=year)
    ax[1].plot(doy, yr_df['ET_corr'], label=year)

# Label the panels the same way as the other day-of-year figures.
for panel, title in zip(ax, ('Measured ET', 'Closed ET')):
    panel.grid()
    panel.set_xlabel('Day of year')
    panel.set_title(title)
ax[0].set_ylabel('ET mm/day')  # y axis is shared between the two panels

ax[1].legend()

In [None]:
import matplotlib.pyplot as plt

# Add a 'Br' column to the results frame: input_H divided by input_LE.
ebr_gapfilled['Br'] = ebr_gapfilled['input_H'].div(ebr_gapfilled['input_LE'])

# Draw the four energy balance series on the current axes.
eb_ax = plt.gca()
for column in ('input_H', 'input_LE', 'input_G', 'NETRAD'):
    ebr_gapfilled[column].plot(ax=eb_ax)

# Zoom to March 2024.
eb_ax.set_xlim(pd.to_datetime('2024-03-01'), pd.to_datetime('2024-04-01'))
eb_ax.grid()
eb_ax.legend()


In [None]:
# 'Br' column for March 2024, with the y axis limited to 0..1.
br_ax = ebr_gapfilled['Br'].plot(color='green')
br_ax.set_xlim(pd.to_datetime('2024-03-01'), pd.to_datetime('2024-04-01'))
br_ax.set_ylim(0, 1)

In [None]:
# Plot the input_H and input_LE columns over the whole record.
# Both calls are made without an explicit `ax`.
ebr_gapfilled['input_H'].plot()
ebr_gapfilled['input_LE'].plot()

In [None]:
import matplotlib.pyplot as plt

# 'eto' series for this station from the NLDAS parquet file, summed to daily
# totals (presumably reference ET -- confirm against the file's source).
nldas = pd.read_parquet("../Footprints/output/nldas_all_normed.parquet")
utw_nldas = nldas.loc['US-UTW', 'eto'].resample('1d').sum()

# make date labels: x-axis tick at the first day of each month shown.
# Day-of-year positions are taken from 2020 (a leap year), as before, and are
# computed directly as integers.
months_used = [5, 6, 7, 8, 9]
month_firsts = pd.to_datetime([f'2020-{month:02d}-01' for month in months_used])
month_starts_doy = month_firsts.day_of_year.tolist()
month_labels = month_firsts.strftime('%b').tolist()

# Station results and the 'eto' series side by side, limited to the months shown.
summer = pd.concat(
    [
        ebr_gapfilled[ebr_gapfilled.index.month.isin(months_used)],
        utw_nldas[utw_nldas.index.month.isin(months_used)],
    ],
    axis=1,
)

# One 'eto' value per day of year across all years.
# NOTE(review): the variable is named "avg" but the aggregation is max();
# confirm which statistic is intended.
eto = summer['eto'].copy().to_frame()
eto['doy'] = eto.index.day_of_year
avg_eto = eto.groupby('doy').max()

fig, ax = plt.subplots(2, 1, sharex=True, figsize=(10, 5))

# One line per year, plotted against day of year so the years overlay.
for year in summer.index.year.unique():
    yr_df = summer[summer.index.year == year]
    doy = yr_df.index.day_of_year
    ax[0].plot(doy, yr_df['ET'], label=year, alpha=0.5)
    ax[1].plot(doy, yr_df['ET_corr'], label=year, alpha=0.5)

titles = ('Measured ET', 'Closed ET')
for panel, title in zip(ax, titles):
    # The per-day-of-year curve does not depend on the year, so draw it once
    # per panel (after the yearly lines) and give it a legend entry.
    panel.plot(avg_eto.index, avg_eto['eto'], color='black', label='NLDAS eto (day-of-year max)')
    panel.grid()
    # Set the ticks and labels explicitly
    panel.set_xticks(month_starts_doy, month_labels)
    panel.set_ylabel('ET mm/day')
    panel.set_title(title)

ax[1].legend()

plt.tight_layout()

In [None]:
import matplotlib.pyplot as plt

# Column -> line colour, in drawing order.
line_colors = {
    'input_H': 'blue',
    'input_LE': 'green',
    'input_G': 'red',
    'NETRAD': 'yellow',
}

# Draw every series on the current axes.
june_ax = plt.gca()
for column, color in line_colors.items():
    ebr_gapfilled[column].plot(ax=june_ax, color=color)

# Zoom to 1-20 June 2024.
june_ax.set_xlim(pd.to_datetime('2024-06-01'), pd.to_datetime('2024-06-20'))
june_ax.grid()
june_ax.legend()

In [None]:
# Energy balance series for the rows whose 'ebr' value is at least 0.8.
closure_ok = ebr_gapfilled['ebr'] >= 0.8
ebr_gapfilled.loc[closure_ok, ['input_H', 'input_LE', 'input_G', 'NETRAD']].plot()


In [None]:
# 'Br' column for 1-20 June 2024, with the y axis limited to -1..1.
br_ax = ebr_gapfilled['Br'].plot(color='green')
br_ax.set_xlim(pd.to_datetime('2024-06-01'), pd.to_datetime('2024-06-20'))
br_ax.set_ylim(-1, 1)

In [None]:
import plotly.express as px

# Rows after 2024-03-01 up to and including 2024-04-01.
# `df` is reassigned here; the next cell plots from this same frame.
window_start = pd.to_datetime('2024-03-01')
window_end = pd.to_datetime('2024-04-01')
in_window = (ebr_gapfilled.index > window_start) & (ebr_gapfilled.index <= window_end)
df = ebr_gapfilled[in_window]

# Scatter of turbulent flux (H + LE) against NETRAD + G with an OLS trendline.
# NOTE(review): the variable list at the end of this notebook defines
# "energy" as Rn - G, while this cell adds G -- confirm the sign convention
# of input_G before reading closure off this plot.
fig = px.scatter(
    x=df['input_H'] + df['input_LE'],
    y=df['NETRAD'] + df['input_G'],
    trendline="ols",
)
fig.update_layout(xaxis_title='input_H + input_LE', yaxis_title='NETRAD + input_G')
fig.show()

In [None]:

# Interactive time series of LE and H for the rows currently held in `df`.
# NOTE(review): `df` is whatever the previous cell assigned; confirm that it
# has 'LE' and 'H' columns (other cells read 'input_LE' / 'input_H').
plt_vars = ['LE', 'H']
colors = ['red', 'blue']

# The index is datetime-like, so ask Bokeh for a datetime x axis.
fig = figure(x_axis_label='date', y_axis_label='energy (W/m2)', x_axis_type='datetime')

# A Bokeh line glyph takes one 1-D y sequence, so draw one line per variable
# with its own colour and legend entry instead of passing the two-column frame.
for var, color in zip(plt_vars, colors):
    fig.line(df.index, df[var], line_width=2, color=color, legend_label=f'{site_id} {var}')

show(fig)

Datetime-indexed ``pandas.DataFrame`` objects have useful features for time series analysis, such as grouping and calculating statistics by time aggregates. The example below shows how to calculate the day-of-year mean for energy balance components. It also demonstrates how to use the ``add_lines`` plotting method available to ``Data``, ``QaQc``, and ``Plot`` objects.

In [None]:
# Renamed copy of the input data (columns mapped through d.inv_map).
df = d.df.rename(columns=d.inv_map)

# Arguments for the interactive plot: variables, their colours, x column name.
vars_we_want = ['H', 'LE', 'Rn', 'G']
plt_vars = vars_we_want
colors = ['red', 'blue', 'black', 'green']
x_name = 'date'

# Mean of each energy balance component for every day of year.
doy_means = df.loc[:, vars_we_want].groupby(d.df.index.dayofyear).mean()
source = ColumnDataSource(doy_means)

# Bokeh figure; Plot.add_lines draws one line per variable onto it.
fig = figure(x_axis_label='day of year', y_axis_label='day of year mean (w/m2)')
Plot.add_lines(
    fig, doy_means, plt_vars, colors, x_name, source,
    labels=vars_we_want, x_axis_type=None,
)
show(fig)

## Visualize input data

The ``Data.plot`` method creates a series of interactive time series plots of input data. Potential plots include:

* energy balance components 
* radiation components 
* multiple soil heat flux measurements
* air temperature
* vapor pressure and vapor pressure deficit
* wind speed
* precipitation 
* latent energy
* multiple soil moisture measurements

If any of these variables are not found the plot(s) will not be added.

The most useful interactive features of plots created by ``flux-data-qaqc`` are:

* pan/zoom
* hover tooltips on var names, values, date
* linked x-axes on time series plots
* save plot option (can save specific subplot zoomed in)

Here is an example,

#### Filter days with sub-daily gaps

The ``drop_gaps`` and ``daily_frac`` keyword arguments used when creating a ``QaQc`` instance allow you to control how days with sub-daily measurement gaps will or will not be filtered out when resampling to daily frequency. 

Sub-daily gaps in energy balance variables $LE$, $H$, $Rn$, and $G$ can be linearly interpolated up to a certain gap length measured in hours, with options to control the longest length of gap to interpolate when $Rn \ge 0$ controlled by the ``QaQc`` keyword argument ``max_interp_hours`` (default 2 hours) and the longest gap to interpolate when $Rn < 0$ set by the ``max_interp_hours_night`` (default 4 hours). 

**Important:** By default the ``QaQc`` constructor will first linearly interpolate energy balance variables ($LE$, $H$, $Rn$, and $G$) according to the maximum gap lengths (``max_interp_hours`` and ``max_interp_hours_night``) and then count sub-daily gaps and drop days (set values to null) for all climate data columns (not QC flag or sub-daily gap count columns) where any of the sub-daily data are missing because by default ``drop_gaps=True`` and ``daily_frac=1.0``. In other words, if you have hourly input data for $LE$ and one hour was missing on a given day, by default that hour will be linearly interpolated before calculating the daily time series and the daily mean will be calculated after. On the other hand, if other climate variables had a single hour missing on a given day, e.g. wind direction or air temperature, this day would be filtered out by the ``QaQc`` constructor. This is important because the daily time series is what is used in all energy balance closure correction algorithms. 

The percentage of sub-daily samples to require set by the ``daily_frac`` argument and the maximum length of gaps to linearly interpolate set by ``max_interp_hours`` and ``max_interp_hours_night`` complement each other and are used in tandem. For example, if the input data is half-hourly and you only want a maximum of 4 hours to be interpolated on any given day and gap lengths to interpolate should be no more than 2 hours each then you would pass the following parameters to the ``QaQc`` constructor:

```
q = QaQc(d, daily_frac=20/24, max_interp_hours=2, max_interp_hours_night=2)
```

## Energy balance corrections 

``flux-data-qaqc`` provides routines that adjust turbulent heat fluxes (latent and sensible) to improve surface energy balance closure of eddy covariance flux station data. These routines ultimately result in a corrected daily and monthly time series of latent energy, sensible heat, and evapotranspiration with the option to gap-fill days in corrected ET with ET calculated from gridMET reference ET and fraction of reference ET.

There are two methods currently implemented:
1. Energy Balance Ratio method (default), modified from the [FLUXNET method](https://fluxnet.fluxdata.org/data/fluxnet2015-dataset/data-processing/) - use `ebr`
2. Bowen Ratio approach (forces closure) - use `br`

Detailed descriptions of both methods including ET gap-filling methods can be found in the online documentation [Closure Algorithms](https://flux-data-qaqc.readthedocs.io/en/latest/closure_explanation.html#closure-methodologies) page. A few important notes on the API of these methods and other hydro-climatic statistical variables that are calculated are shown below.

#### ET gap-filling

A few notes on the option that uses reference ET and fraction of daily reference ET to fill in large gaps in corrected ET, i.e. the keyword argument ``et_gap_fill``, as in ``QaQc.correct_data(et_gap_fill=True)`` (the form used in the code at the top of this notebook).

* The nearest [gridMET](http://www.climatologylab.org/gridmet.html) cell's time series data for precipitation and alfalfa reference ET is attempted to be downloaded if it is not found in the ``gridmet_file_path`` entry of the config.ini file. 

* If the path to a gridMET file is not found it is re-downloaded, the config file will be updated with the new path and resaved. 

* Only the overlapping time period that matches the eddy covariance time series data is attempted to be downloaded, i.e. the period in ``QaQc.df.index``. 

* When a gridMET file is downloaded it will always be saved in a subdirectory where the config file is located called "gridMET_data" and named using the ``QaQc.site_id`` and gridMET cell centroid latitude and longitude.

* Corrected latent energy ($LE_{corr}$) gaps are also backwards filled from gap-filled ET.


**Caution:** [gridMET](http://www.climatologylab.org/gridmet.html) only exists within the contiguous United States and from 1979 to present. Therefore, if your station lies outside of this region or you are analyzing eddy flux data recorded before 1979, this option will not be usable and you should always run corrections with ``et_gap_fill=False`` to avoid potential errors.

Lastly, the variables created by energy balance closure corrections are described in [Closure Algorithms](https://flux-data-qaqc.readthedocs.io/en/latest/closure_explanation.html#closure-methodologies). For reference, here is a list of all possible variables created by the Energy Balance Ratio correction algorithm:

```
rso : potential clear sky radiation (ASCE formulation)
flux : input LE + H
energy : input Rn - G
ebr_5day_clim : 5 day climatology of the filtered Energy Balance Ratio 
LE_corr : corrected latent energy
ebc_cf  : energy balance closure correction factor (inverse of ebr_corr)
ebr_corr : corrected energy balance ratio
flux_corr : LE_corr + H_corr 
ebr : input energy balance ratio
H_corr : corrected sensible heat
ET : ET calculated from input LE and average air temperature
ET_corr : ET calculated from LE_corr and avg. air temp.
gridMET_ETr : gridMET alfalfa reference ET (nearest cell)
gridMET_prcp : gridMET precipitation
ETrF : fraction of reference ET for ET_corr, i.e. ET_corr / gridMET_ETr
ETrF_filtered : filtered ETrF
ET_fill : gridMET_ETr * ETrF_filtered (to fill gaps in ET_corr, full time series)
ET_gap : True on gap days in ET_corr, False otherwise (for plotting and post-processing)
ET_fill_val : value of ET_fill on gap days (for plotting and post-processing)
```

The Bowen Ratio correction method will produce the 'br' variable which is the Bowen Ratio. 