This notebook was used to compile all of the available data from the Utah Flux Network stations. It should only need to be run once, as other notebooks are used to compile the newer data.

# Initialization

## Import Standard Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd
import sys
import pathlib
import glob

import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
#from urllib.parse import quote
#from sqlalchemy import create_engine
#import configparser


#import statsmodels.api as sm
#import pingouin as pg



## Import Micromet

In [2]:
sys.path.append("../../src/")
import micromet
import micromet.add_header_from_peer as ahp 

%matplotlib inline




## Initialize Logger

In [6]:
import logging

# Notebook-wide logger handed to the micromet classes; only WARNING and above
# are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)

# logging.getLogger() returns the same object on every call, so re-running this
# cell would stack another handler and print each message once per handler.
# Attach the stream handler only when the logger has none yet.
if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setFormatter(
        logging.Formatter(
            fmt="%(levelname)s [%(asctime)s] %(name)s – %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    logger.addHandler(ch)

## Define the root folder for the data

In [3]:
# Root folder that holds one sub-folder per site plus the compiled outputs.
# The shared-drive location is the default; set UGS_FLUX_COMPILED_DIR to run the
# notebook against a different copy of the data without editing this cell.
raw_fold = pathlib.Path(
    os.environ.get(
        "UGS_FLUX_COMPILED_DIR",
        "G:/Shared drives/UGS_Flux/Data_Downloads/compiled",
    )
)

# Run Compilation

Define the site folders and stations

In [4]:
# Site id -> folder name for every station handled by this notebook.
# Insertion order is the order in which the compile loops visit the sites.
site_folders = dict(
    [
        ("US-UTD", "Dugout_Ranch"),
        ("US-UTB", "BSF"),
        ("US-UTJ", "Bluff"),
        ("US-UTW", "Wellington"),
        ("US-UTE", "Escalante"),
        ("US-UTM", "Matheson"),
        ("US-UTP", "Phrag"),
        ("US-CdM", "Cedar_mesa"),
        ("US-UTV", "Desert_View_Myton"),
        ("US-UTN", "Juab"),
        ("US-UTG", "Green_River"),
        ("US-UTL", "Pelican_Lake"),
    ]
)

## Compile Met Statistics Tables

In [8]:
# Per-site results of the "Statistics" (TOA5) compilation, keyed by site id.
comp_edd_df = {}
outlier_report = {}

am = micromet.AmerifluxDataProcessor(logger=logger)

for key, value in site_folders.items():
    site_dir = raw_fold / key
    parent_fold = site_dir / "Statistics"

    # NOTE(review): both helpers are called for their effect on the files in
    # parent_fold (presumably header repair, going by the module name
    # add_header_from_peer) - confirm against the micromet documentation.
    ahp.scan(parent_fold, min_sim=0.3, backup=False)
    pths = micromet.fix_all_in_parent(parent_fold)

    raw_data = am.raw_file_compile(
        raw_fold, parent_fold, search_str="TOA5*Statistics*.dat"
    )
    if raw_data is None:
        # Nothing was compiled for this site; move on to the next one.
        continue

    am_data = micromet.Reformatter(drop_soil=False, logger=logger)
    am_df, report = am_data.prepare(raw_data, data_type="met")
    comp_edd_df[key] = am_df
    outlier_report[key] = report

    # One CSV per site, written next to the site's raw folders.
    am_df.to_csv(site_dir / f"{key}_metStat.csv")

# Stack all sites (the site id becomes the outer index level) and persist.
comp_edd = pd.concat(comp_edd_df)
outlier_report = pd.concat(outlier_report)
comp_edd.to_parquet(raw_fold / "comp_met_stat.parquet")
outlier_report.to_csv(raw_fold / "outlier_report_metstat.csv")


✔ All possible files have been checked.





✔ All possible files have been checked.

✔ All possible files have been checked.





✔ All possible files have been checked.

✔ All possible files have been checked.





✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.





✔ All possible files have been checked.





✔ All possible files have been checked.

✔ All possible files have been checked.





✔ All possible files have been checked.




## Compile Downloaded Eddy Data from EasyFluxWeb


In [12]:
# Per-site EasyFluxWeb downloads and their reformatting reports, keyed by site id.
easyfluxdf = {}
ef_reports = {}

for key, value in site_folders.items():
    site_dir = raw_fold / key
    # Sorted so that the file that ends up stored for a site does not depend on
    # the directory listing order.
    for file in sorted(site_dir.glob("*_Flux_AmeriFluxFormat.dat")):
        if key in easyfluxdf:
            # Results are stored per site, so a second matching file replaces
            # the first one. Make that visible instead of doing it silently.
            logger.warning(
                "%s: more than one *_Flux_AmeriFluxFormat.dat file found; "
                "%s replaces the previously loaded file",
                key,
                file.name,
            )

        am_data = micromet.Reformatter(drop_soil=False,
                                       logger=logger,
                                       )
        # Rows 0, 2 and 3 are skipped so that row 1 supplies the column names.
        # NOTE(review): presumably the TOA5 info/units/processing rows - confirm.
        df = pd.read_csv(file, skiprows=[0, 2, 3],
                         na_values=[-9999, "NAN", "NaN", "nan"])

        df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])

        am_df, report = am_data.prepare(df, data_type="eddy")
        easyfluxdf[key] = am_df
        ef_reports[key] = report

# Stack all sites; reports are concatenated side by side and transposed so
# that each site becomes one row.
easyflux = pd.concat(easyfluxdf)
ef_report = pd.concat(ef_reports, axis=1).T

easyflux.to_parquet(raw_fold / "easyflux.parquet")
ef_report.to_csv(raw_fold / "easyflux_report.csv")

## Compile Ameriflux Format dat files from Dataloggers

In [10]:

# Per-site results of the datalogger AmeriFlux-format (eddy) compilation.
comp_edd_df = {}
outlier_report = {}

am = micromet.AmerifluxDataProcessor(logger=logger)

for key, value in site_folders.items():
    site_dir = raw_fold / key
    parent_fold = site_dir / "AmeriFluxFormat"

    # NOTE(review): both helpers are called for their effect on the files in
    # parent_fold (presumably header repair, going by the module name
    # add_header_from_peer) - confirm against the micromet documentation.
    ahp.scan(parent_fold, min_sim=0.3, backup=False)
    pths = micromet.fix_all_in_parent(parent_fold)

    raw_data = am.raw_file_compile(
        raw_fold, parent_fold, search_str="*Flux_AmeriFluxFormat*.dat"
    )
    if raw_data is None:
        # Nothing was compiled for this site; move on to the next one.
        continue

    am_data = micromet.Reformatter(drop_soil=False, logger=logger)
    am_df, report = am_data.prepare(raw_data, data_type="eddy")
    comp_edd_df[key] = am_df
    outlier_report[key] = report

    # The first start stamp and last end stamp go into the per-site file name.
    timestart = am_df['TIMESTAMP_START'].values[0]
    timeend = am_df['TIMESTAMP_END'].values[-1]
    out_name = f"{key}_HH_{timestart:}_{timeend:}.csv"
    am_df.to_csv(site_dir / out_name)

# Stack all sites (the site id becomes the outer index level) and persist.
comp_edd = pd.concat(comp_edd_df)
outlier_report = pd.concat(outlier_report)
comp_edd.to_parquet(raw_fold / "comp_edd.parquet")
outlier_report.to_csv(raw_fold / "outlier_report_edd.csv")


✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.

✔ All possible files have been checked.


  new_result = trans(result).astype(dtype)



✔ All possible files have been checked.

✔ All possible files have been checked.


## Compile Met Ameriflux Format .dat files

In [11]:
# Per-site results of the met AmeriFlux-format compilation.
# These are collected in their own dict: reusing `comp_edd_df` here replaced the
# eddy results with met tables, and the later `cdf = pd.concat(comp_edd_df)`
# cell then saved met data as the eddy dataset.
comp_met_df = {}
outlier_report = {}

am = micromet.AmerifluxDataProcessor(logger=logger)

for key, value in site_folders.items():

    parent_fold = raw_fold / f"{key}" / "Statistics_Ameriflux"
    # NOTE(review): both helpers are called for their effect on the files in
    # parent_fold (presumably header repair, going by the module name
    # add_header_from_peer) - confirm against the micromet documentation.
    ahp.scan(parent_fold, min_sim=0.3, backup=False)
    pths = micromet.fix_all_in_parent(parent_fold)
    raw_data = am.raw_file_compile(raw_fold, parent_fold, search_str = "*Statistics_AmeriFlux*.dat")
    if raw_data is not None:
        am_data = micromet.Reformatter(drop_soil=False,
                                       logger=logger,
                                       )
        am_df, report = am_data.prepare(raw_data, data_type="met")
        comp_met_df[key] = am_df
        outlier_report[key] = report

        # The first start stamp and last end stamp go into the per-site file name.
        timestart = am_df['TIMESTAMP_START'].values[0]
        timeend = am_df['TIMESTAMP_END'].values[-1]

        am_df.to_csv(raw_fold / f"{key}" / f"{key}-met_HH_{timestart:}_{timeend:}.csv")

# Stack all sites (the site id becomes the outer index level) and persist.
comp_met = pd.concat(comp_met_df)
out_report_met = pd.concat(outlier_report)
comp_met.to_parquet(raw_fold / "comp_met.parquet")
out_report_met.to_csv(raw_fold / "outlier_report_met.csv")


✔ All possible files have been checked.





✔ All possible files have been checked.

✔ All possible files have been checked.


  new_result = trans(result).astype(dtype)



✔ All possible files have been checked.


  new_result = trans(result).astype(dtype)



✔ All possible files have been checked.

✔ All possible files have been checked.


  new_result = trans(result).astype(dtype)



✔ All possible files have been checked.


  new_result = trans(result).astype(dtype)



✔ All possible files have been checked.


  new_result = trans(result).astype(dtype)



✔ All possible files have been checked.





✔ All possible files have been checked.

✔ All possible files have been checked.


  new_result = trans(result).astype(dtype)



✔ All possible files have been checked.


Compile the files from each station into a single dataframe.

In [None]:
# Stack the per-site tables into one frame; the dict keys (site ids) become the
# outer index level.
# NOTE(review): `comp_edd_df` is assigned by more than one compile cell above;
# make sure it holds the eddy results when this cell runs.
cdf = pd.concat(comp_edd_df, axis=0)
cdf.index = cdf.index.set_names(['stationid', 'datetime_start'])

# Lower-case every column label in one pass instead of one in-place rename per
# column.
cdf = cdf.rename(columns=str.lower)

Save to Parquet

In [None]:
# Write the combined table `cdf` to the station_data folder (path is relative
# to the notebook's working directory); an existing file is overwritten.
cdf.to_parquet('../../station_data/all_eddy_data.parquet')

In [None]:

# Per-site met tables compiled from the original download folders.
comp_met_df = {}

# NOTE(review): machine-specific absolute path to the micromet data files; the
# other compile cells construct these classes without explicit config paths.
root_dir = "C:/Users/paulinkenbrandt/Documents/GitHub/MicroMet/src/micromet/data/"
config_path = root_dir + "reformatter_vars.yml"
var_limits_csv = root_dir + "extreme_values.csv"
am = micromet.AmerifluxDataProcessor(config_path, logger)

# Defined once, under its own name, so the notebook-wide `raw_fold` (the
# "compiled" folder used by the cells above) is not overwritten by this cell.
downloads_fold = pathlib.Path('G:/Shared drives/UGS_Flux/Data_Downloads/')

for key, value in site_folders.items():

    print(key)
    # Here the site's folder name (value) is passed, not the site id.
    raw_data = am.raw_file_compile(downloads_fold, value, search_str = "*Statistics_AmeriFlux*.dat")
    if raw_data is not None:
        am_data = micromet.Reformatter(
                                       config_path=config_path,
                                       var_limits_csv= var_limits_csv,
                                       drop_soil=False,
                                       logger=logger,
                                       )
        # prepare() is unpacked as (table, report) everywhere else in this
        # notebook; storing the whole tuple would make the later
        # pd.concat(comp_met_df) fail.
        am_df, _report = am_data.prepare(raw_data, data_type="met")
        comp_met_df[key] = am_df

        #am_df.to_csv(f"../../station_data/{key}_HH_{am_df['TIMESTAMP_START'].values[0]:}_{am_df['TIMESTAMP_END'].values[-1]:}.csv")

        



In [None]:
# Lower-case every column label of `ddf`.
# NOTE(review): read top to bottom, `ddf` is first assigned in a later cell
# (ddf = pd.concat(comp_met_df, ...)), so this cell fails on a fresh kernel -
# confirm whether it should be moved below that cell or removed.
ddf.columns = ddf.columns.str.lower()

In [None]:
# Fold legacy soil column names into their current names, using `mapping`
# (old name -> new name). When both columns exist the row-wise maximum is kept.
# NOTE(review): `mapping` is not defined anywhere in the visible notebook and
# `soildfs` is first assigned in a later cell - confirm where they come from.
for old_col, new_col in mapping.items():
    old_name = str(old_col).lower()
    new_name = str(new_col).lower()
    old_eddy = old_name + "_eddy"
    new_eddy = new_name + "_eddy"
    # Column labels as they are at the start of this iteration.
    cols_lower = soildfs.columns.str.lower()

    if old_name in cols_lower:
        if new_name in cols_lower:
            soildfs[new_name] = soildfs[[old_name, new_name]].max(axis=1)
            soildfs = soildfs.drop(old_name, axis=1)
        else:
            soildfs = soildfs.rename(columns={old_name: new_name})
    elif old_eddy in cols_lower:
        print(f"Found {old_col} eddy column")
        if new_eddy in cols_lower:
            # NOTE(review): the "<new>_eddy" column is left in place here and an
            # existing "<new>" column would be overwritten - confirm intended.
            soildfs[new_name] = soildfs[[old_eddy, new_eddy]].max(axis=1)
            soildfs = soildfs.drop(old_eddy, axis=1)
        else:
            soildfs = soildfs.rename(columns={old_eddy: new_name})
    elif new_eddy in cols_lower:
        print(f"Found {new_col} eddy column")
        if new_name in cols_lower:
            # Combine the existing column with its "_eddy" twin. The original
            # took the max over the "_eddy" column twice, which threw away the
            # values already present in the base column.
            soildfs[new_name] = soildfs[[new_name, new_eddy]].max(axis=1)
            soildfs = soildfs.drop(new_eddy, axis=1)
        else:
            soildfs = soildfs.rename(columns={new_eddy: new_name})
        


In [None]:
# Stack the per-site met tables into one frame; the dict keys (site ids) become
# the outer index level.
ddf = pd.concat(comp_met_df, axis=0)
ddf.index = ddf.index.set_names(['stationid', 'datetime_start'])

# Lower-case every column label in one pass instead of one in-place rename per
# column.
ddf = ddf.rename(columns=str.lower)

In [None]:
# Show only the rows that have a value in the vwc_2_7_1 column.
ddf[ddf['vwc_2_7_1'].notna()]

In [None]:
# Copy the first row of `ddf` (all columns) to the system clipboard.
ddf.iloc[0:1,:].to_clipboard()

In [None]:
import re

# Soil variable names taken from the reformatter, lower-cased like the frame
# columns, then split on the position codes 2_1_1 / 1_2_1 / 1_1_2.
pattern = re.compile(r"2_1_1|1_2_1|1_1_2")
all_soil_names = [name.lower() for name in am_data.MATH_SOILS_V2]
# Names that contain one of the codes (kept for inspection only) ...
matching_cols = list(filter(pattern.search, all_soil_names))
# ... and the remaining names, which are moved out of the eddy table.
soilcols = [name for name in all_soil_names if pattern.search(name) is None]

# Attach the eddy-side soil columns to the met table; on a name clash the
# eddy-side column gets the "_eddy" suffix.
soildfs = pd.merge(
    ddf,
    cdf[soilcols],
    how='left',
    on=['stationid', 'datetime_start'],
    suffixes=(None, '_eddy'),
)

# Remove the soil columns from the eddy table now that they live in soildfs.
eddy_soil_cols = [name for name in cdf.columns if name in soilcols]
cdf.drop(columns=eddy_soil_cols, inplace=True)

cdf.to_parquet('../../station_data/all_eddy_data.parquet')

soildfs.to_parquet('../../station_data/all_soil_data.parquet')

ddf.to_parquet('../../station_data/all_met_data.parquet')

In [None]:
# Reload the eddy table from the Parquet file written earlier in the notebook.
cdf = pd.read_parquet('../../station_data/all_eddy_data.parquet')


In [None]:
# Display the column labels of the eddy table.
cdf.columns

In [None]:
soildfs = pd.read_parquet('../../station_data/all_soil_data.parquet')

# The three ts_3_* columns of site US-UTD from 2024-07-01 onward, with the
# -9999 sentinel turned into NaN.
soil_t_cols = ['ts_3_1_1', 'ts_3_2_1', 'ts_3_3_1']
utd_soilt = soildfs.loc['US-UTD'][soil_t_cols].replace(-9999, np.nan)
utd_soilt = utd_soilt.loc[utd_soilt.index >= '2024-07-01']

# Draw each trace on the same axes; the second and third are moved earlier by
# 1 and 5 records respectively.
for column, steps in zip(soil_t_cols, (0, -1, -5)):
    trace = utd_soilt[column]
    if steps:
        trace = trace.shift(steps)
    trace.plot()

plt.axvline('2024-07-04 15:00', color='r')
plt.grid(True, which='minor')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.signal import correlate

def extract_seasonal(ts, period):
    """Return the seasonal part of an additive decomposition of ``ts``.

    Parameters
    ----------
    ts : series passed straight to ``seasonal_decompose``.
    period : length of one seasonal cycle, in records.
    """
    return seasonal_decompose(ts, model='additive', period=period).seasonal

def calculate_lag(seasonal1, seasonal2):
    """Estimate the lag between two series from their full cross-correlation.

    Both inputs are mean-centred and cross-correlated in ``'full'`` mode; the
    lag is the offset (in samples) at which the correlation peaks. A negative
    lag means features in ``seasonal1`` occur earlier than in ``seasonal2``.

    Parameters
    ----------
    seasonal1, seasonal2 : 1-D array-likes; they may differ in length.

    Returns
    -------
    lag : lag, in samples, with the highest correlation.
    correlation : the full cross-correlation array.
    lags : lag value for every element of ``correlation``.
    """
    centred1 = seasonal1 - np.mean(seasonal1)
    centred2 = seasonal2 - np.mean(seasonal2)
    correlation = correlate(centred1, centred2, mode='full')

    # In 'full' mode the output has len1 + len2 - 1 elements and zero lag sits
    # at index len2 - 1. Building the axis from both lengths keeps it aligned
    # with `correlation` even when the inputs are not the same length (the
    # original used len(seasonal1) for both ends).
    lags = np.arange(-(len(seasonal2) - 1), len(seasonal1))
    lag = lags[np.argmax(correlation)]
    return lag, correlation, lags

# Assumes half-hourly records (as the original lag/2 -> hours conversion did):
# 2 samples per hour, 48 per daily cycle.
samples_per_hour = 2
samples_per_day = 24 * samples_per_hour

ts1 = utd_soilt['ts_3_2_1']
ts2 = utd_soilt['ts_3_3_1']

# Extract the daily seasonal components
seasonal1 = extract_seasonal(ts1, period=samples_per_day)
seasonal2 = extract_seasonal(ts2, period=samples_per_day)

# Lag (in samples) at which the two seasonal components correlate best
lag, correlation, lags = calculate_lag(seasonal1.dropna(), seasonal2.dropna())
lag_hours = lag / samples_per_hour

# Output
print(f"Calculated lag: {lag_hours} hours")

# Plot seasonal components, cross-correlation and the aligned series
week_start = pd.to_datetime('2024-07-01')
week_end = pd.to_datetime('2024-07-08')
fig, ax = plt.subplots(3, 1, figsize=(10, 8))

seasonal1.plot(ax=ax[0], label='Seasonal Component 1')
seasonal2.plot(ax=ax[0], label='Seasonal Component 2')
ax[0].legend()
ax[0].set_title('Seasonal Components')
ax[0].set_xlim(week_start, week_end)
ax[0].grid(True)

# The axis is labelled in hours, so convert the sample lags to hours before
# plotting; -5..5 hours is the same window as the former -10..10 samples.
ax[1].plot(lags / samples_per_hour, correlation)
ax[1].set_title('Cross-Correlation')
ax[1].set_xlabel('Lag (hours)')
ax[1].set_ylabel('Correlation')
ax[1].set_xlim(-10 / samples_per_hour, 10 / samples_per_hour)
ax[1].grid(True)

ax[2].plot(seasonal1.index, seasonal1, label='Series 1')
ax[2].plot(seasonal2.index + pd.Timedelta(hours=lag_hours), seasonal2, label='Series 2 (Shifted)')
ax[2].legend()
ax[2].set_title(f'Series alignment (Lag: {lag_hours} hours)')
ax[2].set_xlim(week_start, week_end)
ax[2].grid(True)
plt.tight_layout()
plt.show()



In [None]:
cdf = pd.read_parquet('../../station_data/all_eddy_data.parquet')
ddf = pd.read_parquet('../../station_data/all_met_data.parquet')

# List, in eddy-table order, every column label that also exists in the met table.
met_columns = ddf.columns
shared_columns = [name for name in cdf.columns if name in met_columns]
for name in shared_columns:
    print(name)


In [None]:
# Copy the first ten rows of `ddf` to the system clipboard.
ddf.head(10).to_clipboard()

In [None]:
# t_si111_body for site US-UTD with the -9999 sentinel turned into NaN.
series = ddf.loc['US-UTD', 't_si111_body'].replace(-9999, np.nan)
step = series.diff()

# Raw series and its record-to-record change on the same axes.
series.plot()
step.plot()

# Keep only records whose change from the previous record is below 2, then
# rebuild a curve from the cumulative differences of what is left.
new_series = series[step < 2].diff().cumsum()
new_series.plot()

In [None]:
# configparser is imported here because the notebook's top-level import of it
# is commented out; without it this cell raises NameError on a fresh kernel.
import configparser
import urllib.parse

from sqlalchemy import create_engine

config = configparser.ConfigParser()

# ConfigParser.read() returns the list of files it managed to parse and skips
# missing ones silently; fail with a clear message instead of a later KeyError.
config_file = '../../secrets/config.ini'
if not config.read(config_file):
    raise FileNotFoundError(f"Database config not found or unreadable: {config_file}")

host = config['DEFAULT']['ip']
pw = config['DEFAULT']['pw']
user = config['DEFAULT']['login']

# Percent-encode the password so special characters survive inside the
# connection URL.
encoded_password = urllib.parse.quote_plus(pw)

def postconn_et(encoded_password, host='localhost',user='postgres',port='5432',db='groundwater', schema = 'groundwater'):
    """Create a SQLAlchemy engine for a PostgreSQL database via psycopg2.

    Parameters
    ----------
    encoded_password : password, already URL-encoded by the caller.
    host, user, port, db : pieces of the connection URL.
    schema : value placed in the connection's ``search_path`` option.

    Returns
    -------
    The object returned by ``create_engine``.
    """
    url = f"postgresql+psycopg2://{user}:{encoded_password}@{host}:{port}/{db}?gssencmode=disable"
    search_path_option = f"-csearch_path={schema}"
    return create_engine(url, connect_args={'options': search_path_option})


# Build the engine from the host and login read from config.ini; port, db and
# schema fall back to postconn_et's defaults.
engine = postconn_et(encoded_password, host=host, user=user)

In [None]:
# Upload `cdf` to table "amfluxeddy" in schema "groundwater".
# if_exists='replace' replaces any existing table of that name; rows are sent
# in batches of 2000.
cdf.to_sql(name = 'amfluxeddy',
           schema='groundwater',
           con=engine,
           if_exists='replace',
           chunksize=2000)

In [None]:
# Print every soildfs column as "amfluxmet.<column>," - one per line, ready to
# paste into a SQL column list.
qualified_names = [f"amfluxmet.{col}," for col in soildfs.columns]
for line in qualified_names:
    print(line)

In [None]:
# Upload `soildfs` to table "amfluxmet" in schema "groundwater".
# if_exists='replace' replaces any existing table of that name; rows are sent
# in batches of 2000.
soildfs.to_sql(name = 'amfluxmet',
           schema='groundwater',
           con=engine,
           if_exists='replace',
           chunksize=2000)