# Analysis of random error

In [34]:
import glob
import os

import numpy as np
import pandas as pd

In [2]:
project_path = './'

data_path   = project_path + '../data/random_error/'

graphs_path = project_path + '../graphs/'

### Functions

In [77]:
def load_EP(directory, silent=False):
    if (not silent):
        print('  - Cospectra files:')
        
    file_list = sorted(glob.glob(directory + '**/*.csv', recursive=True))
    data_list = []
    for idx, fn in enumerate(file_list):
        print('    -', fn.split('\\')[-1])
        # Read file
        temp = pd.read_csv(fn, skiprows=[0,2], na_values=-9999)
        # Append timestamp
        timestamp_fn = fn.split('\\')[-1].split('_')[0]
        temp['DateTime'] = pd.to_datetime(temp['date'] + ' ' + temp['time'], format='%Y-%m-%d %H:%M')
        # shift column 'timestamp' to first position
        col = temp.pop('DateTime')
        temp.insert(0, 'DateTime', col)
        # Append to list of dfs
        data_list.append(temp)
    # Combine all the read data
    df = pd.concat(data_list, axis=0, ignore_index=True)
    # Make the timestamp the middle of the halfhour
    df['DateTime'] = df['DateTime'] + pd.Timedelta(minutes=15)
    # Drop useless columns
    df.drop(columns=['filename','date','time'], inplace=True)
    return(df)

def add_ecosystem(df):
    df = df.copy()
    # Add ecosystem
    df['Ecosystem'] = np.nan
    df.loc[df['DateTime'] < '2019-07-16', 'Ecosystem'] = 'Desert background'
    df.loc[df['DateTime'] >= '2019-07-16', 'Ecosystem'] = 'PV field'
    # Create half-hour identifier
    df['halfhour'] = df['DateTime'].dt.strftime('%H:%M')
    # Create day identifier
    df['day'] = df['DateTime'].dt.strftime('%Y-%m-%d')
    # shift column 'timestamp' to first position
    col = df.pop('halfhour')
    df.insert(0, 'halfhour', col)
    col = df.pop('day')
    df.insert(0, 'day', col)
    col = df.pop('Ecosystem')
    df.insert(0, 'Ecosystem', col)
    col = df.pop('DateTime')
    df.insert(0, 'DateTime', col)
    return(df)

In [78]:
print('Loading data...')
df = load_EP(data_path)
df = add_ecosystem(df)

print('Done...')

Loading data...
  - Cospectra files:
    - eddypro_KeturaDesertBackgroundJuly2019randomerror_full_output_2023-02-20T224915_adv.csv
    - eddypro_KeturaSolarFieldJuly2019randomerror_full_output_2023-02-21T004106_adv.csv
Done...


In [79]:
# Show all columns with random errors
err_cols = [col for col in df.columns if 'rand_err' in col]
print(err_cols)
#display(df.columns.values)

['rand_err_Tau', 'rand_err_H', 'rand_err_LE', 'rand_err_co2_flux', 'rand_err_h2o_flux']


### Summary statistics of the random errors (per ecosystem)

#### Sensible heat flux ($H$)

In [87]:
# Find outliers

# Z-score: Remove anything more than >2 stddevs away
z = np.abs((df['rand_err_H'] - df['rand_err_H'].mean()) / df['rand_err_H'].std())
threshold = 2.0
outliers = df[z > threshold]
#display(outliers[['DateTime','Ecosystem', 'rand_err_H']])
# Corrected df
df2 = df.drop(df[z > threshold].index).copy()

# Interquartile range (iqr)
q1 = df['rand_err_H'].quantile(0.25)
q3 = df['rand_err_H'].quantile(0.75)
iqr = q3 - q1
threshold = 1.5
outliers = df[(df['rand_err_H'] < q1 - threshold*iqr) | (df['rand_err_H'] > q3 + threshold*iqr)]
#display(outliers[['DateTime','Ecosystem', 'rand_err_H']])
# Corrected df
df3 = df.drop(df[(df['rand_err_H'] < q1 - threshold*iqr) | (df['rand_err_H'] > q3 + threshold*iqr)].index).copy()

In [81]:
# Means after z-score filter
print('Means after z-score filter:')
grouped = df2.groupby('Ecosystem').agg(['mean','std','max','min'])
grouped.reset_index(inplace=True)
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

display(grouped[['Ecosystem',
                 'rand_err_H_mean','rand_err_H_std','rand_err_H_max','rand_err_H_min']])
print('---')
print()

# Means after iqr filter
print('Means after iqr filter:')
grouped = df3.groupby('Ecosystem').agg(['mean','std','max','min'])
grouped.reset_index(inplace=True)
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

display(grouped[['Ecosystem',
                 'rand_err_H_mean','rand_err_H_std','rand_err_H_max','rand_err_H_min']])

Means after z-score filter:


Unnamed: 0,Ecosystem,rand_err_H_mean,rand_err_H_std,rand_err_H_max,rand_err_H_min
0,Desert background,10.051495,7.697638,27.9135,0.460446
1,PV field,10.137385,8.057221,30.7975,0.493315


---

Means after iqr filter:


Unnamed: 0,Ecosystem,rand_err_H_mean,rand_err_H_std,rand_err_H_max,rand_err_H_min
0,Desert background,10.552308,8.335396,35.4294,0.460446
1,PV field,11.791884,9.970125,39.5063,0.493315


#### Latent heat flux ($LE$)

In [86]:
# Find outliers

# Z-score: Remove anything more than >2 stddevs away
z = np.abs((df['rand_err_LE'] - df['rand_err_LE'].mean()) / df['rand_err_LE'].std())
threshold = 2.0
outliers = df[z > threshold]
#display(outliers[['DateTime','Ecosystem', 'rand_err_LE']])
# Corrected df
df2 = df.drop(df[z > threshold].index).copy()

# Interquartile range (iqr)
q1 = df['rand_err_LE'].quantile(0.25)
q3 = df['rand_err_LE'].quantile(0.75)
iqr = q3 - q1
threshold = 1.5
outliers = df[(df['rand_err_LE'] < q1 - threshold*iqr) | (df['rand_err_LE'] > q3 + threshold*iqr)]
#display(outliers[['DateTime','Ecosystem', 'rand_err_LE']])
# Corrected df
df3 = df.drop(df[(df['rand_err_LE'] < q1 - threshold*iqr) | (df['rand_err_LE'] > q3 + threshold*iqr)].index).copy()

In [83]:
# Means after z-score filter
print('Means after z-score filter:')
grouped = df2.groupby('Ecosystem').agg(['mean','std','max','min'])
grouped.reset_index(inplace=True)
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

display(grouped[['Ecosystem',
                 'rand_err_LE_mean','rand_err_LE_std','rand_err_LE_max','rand_err_LE_min']])
print('---')
print()

# Means after iqr filter
print('Means after iqr filter:')
grouped = df3.groupby('Ecosystem').agg(['mean','std','max','min'])
grouped.reset_index(inplace=True)
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

display(grouped[['Ecosystem',
                 'rand_err_LE_mean','rand_err_LE_std','rand_err_LE_max','rand_err_LE_min']])

Means after z-score filter:


Unnamed: 0,Ecosystem,rand_err_LE_mean,rand_err_LE_std,rand_err_LE_max,rand_err_LE_min
0,Desert background,7.239659,6.656231,42.1586,0.509731
1,PV field,7.954167,8.002699,42.1866,0.246623


---

Means after iqr filter:


Unnamed: 0,Ecosystem,rand_err_LE_mean,rand_err_LE_std,rand_err_LE_max,rand_err_LE_min
0,Desert background,6.161817,4.465722,20.7893,0.509731
1,PV field,6.211508,5.002255,20.5861,0.246623


### Daily statistics of the random errors (per ecosystem and day)

#### Sensible heat flux ($H$)

In [84]:
# Find outliers

# Z-score: Remove anything more than >2 stddevs away
z = np.abs((df['rand_err_H'] - df['rand_err_H'].mean()) / df['rand_err_H'].std())
threshold = 2.0
outliers = df[z > threshold]
#display(outliers[['DateTime','Ecosystem', 'rand_err_H']])
# Corrected df
df2 = df.drop(df[z > threshold].index).copy()

# Interquartile range (iqr)
q1 = df['rand_err_H'].quantile(0.25)
q3 = df['rand_err_H'].quantile(0.75)
iqr = q3 - q1
threshold = 1.5
outliers = df[(df['rand_err_H'] < q1 - threshold*iqr) | (df['rand_err_H'] > q3 + threshold*iqr)]
#display(outliers[['DateTime','Ecosystem', 'rand_err_H']])
# Corrected df
df3 = df.drop(df[(df['rand_err_H'] < q1 - threshold*iqr) | (df['rand_err_H'] > q3 + threshold*iqr)].index).copy()

In [85]:
# Means after z-score filter
print('Means after z-score filter:')
grouped = df2.groupby(['Ecosystem', 'day']).agg(['mean','std','max','min'])
grouped.reset_index(inplace=True)
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

display(grouped[['Ecosystem', 'day',
                 'rand_err_H_mean','rand_err_H_std','rand_err_H_max','rand_err_H_min']])
print('---')
print()

# Means after iqr filter
print('Means after iqr filter:')
grouped = df3.groupby(['Ecosystem', 'day']).agg(['mean','std','max','min'])
grouped.reset_index(inplace=True)
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

display(grouped[['Ecosystem', 'day',
                 'rand_err_H_mean','rand_err_H_std','rand_err_H_max','rand_err_H_min']])

Means after z-score filter:


Unnamed: 0,Ecosystem,day,rand_err_H_mean,rand_err_H_std,rand_err_H_max,rand_err_H_min
0,Desert background,2019-07-09,8.955879,8.902695,26.0475,1.30758
1,Desert background,2019-07-10,9.269234,7.133664,25.9817,0.977407
2,Desert background,2019-07-11,9.759389,7.485843,26.9749,1.27232
3,Desert background,2019-07-12,10.07551,6.733164,27.12,1.15975
4,Desert background,2019-07-13,10.51654,8.425083,27.6565,1.43066
5,Desert background,2019-07-14,9.997609,8.082828,26.2939,0.460446
6,Desert background,2019-07-15,11.059744,8.447872,27.9135,1.08962
7,PV field,2019-07-16,10.048867,9.052662,28.5604,0.493315
8,PV field,2019-07-17,6.337982,5.550608,27.3837,0.78303
9,PV field,2019-07-18,11.235052,7.940021,26.189,1.27416


---

Means after iqr filter:


Unnamed: 0,Ecosystem,day,rand_err_H_mean,rand_err_H_std,rand_err_H_max,rand_err_H_min
0,Desert background,2019-07-09,8.955879,8.902695,26.0475,1.30758
1,Desert background,2019-07-10,10.27522,8.481922,33.0796,0.977407
2,Desert background,2019-07-11,10.204589,8.022425,31.129,1.27232
3,Desert background,2019-07-12,10.07551,6.733164,27.12,1.15975
4,Desert background,2019-07-13,11.035558,9.077554,35.4294,1.43066
5,Desert background,2019-07-14,9.997609,8.082828,26.2939,0.460446
6,Desert background,2019-07-15,12.258113,9.703896,35.2455,1.08962
7,PV field,2019-07-16,10.048867,9.052662,28.5604,0.493315
8,PV field,2019-07-17,7.125393,7.404044,37.8344,0.78303
9,PV field,2019-07-18,11.235052,7.940021,26.189,1.27416


#### Latent heat flux ($LE$)

In [88]:
# Find outliers

# Z-score: Remove anything more than >2 stddevs away
z = np.abs((df['rand_err_LE'] - df['rand_err_LE'].mean()) / df['rand_err_LE'].std())
threshold = 2.0
outliers = df[z > threshold]
#display(outliers[['DateTime','Ecosystem', 'rand_err_LE']])
# Corrected df
df2 = df.drop(df[z > threshold].index).copy()

# Interquartile range (iqr)
q1 = df['rand_err_LE'].quantile(0.25)
q3 = df['rand_err_LE'].quantile(0.75)
iqr = q3 - q1
threshold = 1.5
outliers = df[(df['rand_err_LE'] < q1 - threshold*iqr) | (df['rand_err_LE'] > q3 + threshold*iqr)]
#display(outliers[['DateTime','Ecosystem', 'rand_err_LE']])
# Corrected df
df3 = df.drop(df[(df['rand_err_LE'] < q1 - threshold*iqr) | (df['rand_err_LE'] > q3 + threshold*iqr)].index).copy()

In [89]:
# Means after z-score filter
print('Means after z-score filter:')
grouped = df2.groupby(['Ecosystem', 'day']).agg(['mean','std','max','min'])
grouped.reset_index(inplace=True)
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

display(grouped[['Ecosystem', 'day',
                 'rand_err_LE_mean','rand_err_LE_std','rand_err_LE_max','rand_err_LE_min']])
print('---')
print()

# Means after iqr filter
print('Means after iqr filter:')
grouped = df3.groupby(['Ecosystem', 'day']).agg(['mean','std','max','min'])
grouped.reset_index(inplace=True)
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

display(grouped[['Ecosystem', 'day',
                 'rand_err_LE_mean','rand_err_LE_std','rand_err_LE_max','rand_err_LE_min']])

Means after z-score filter:


Unnamed: 0,Ecosystem,day,rand_err_LE_mean,rand_err_LE_std,rand_err_LE_max,rand_err_LE_min
0,Desert background,2019-07-09,5.884488,6.08812,14.5638,1.14946
1,Desert background,2019-07-10,4.843395,4.307014,27.5838,0.609497
2,Desert background,2019-07-11,6.384707,5.448317,30.0357,0.543338
3,Desert background,2019-07-12,8.145201,8.12816,41.7663,0.70789
4,Desert background,2019-07-13,7.127818,4.750629,21.3615,0.84446
5,Desert background,2019-07-14,8.611413,7.488616,31.7304,0.509731
6,Desert background,2019-07-15,8.669446,8.467182,42.1586,0.74797
7,PV field,2019-07-16,4.848065,3.392962,11.058,0.468271
8,PV field,2019-07-17,4.470596,6.277463,28.4996,0.246623
9,PV field,2019-07-18,8.706706,7.56387,35.9018,1.24436


---

Means after iqr filter:


Unnamed: 0,Ecosystem,day,rand_err_LE_mean,rand_err_LE_std,rand_err_LE_max,rand_err_LE_min
0,Desert background,2019-07-09,5.884488,6.08812,14.5638,1.14946
1,Desert background,2019-07-10,4.338052,2.637789,11.0553,0.609497
2,Desert background,2019-07-11,5.859129,4.166935,18.0681,0.543338
3,Desert background,2019-07-12,6.443346,4.628265,20.7893,0.70789
4,Desert background,2019-07-13,6.804325,4.274888,19.4561,0.84446
5,Desert background,2019-07-14,6.97157,5.298111,18.023,0.509731
6,Desert background,2019-07-15,6.778958,5.039525,18.5771,0.74797
7,PV field,2019-07-16,4.848065,3.392962,11.058,0.468271
8,PV field,2019-07-17,3.496151,4.263231,15.9017,0.246623
9,PV field,2019-07-18,7.146908,4.608224,20.5861,1.24436
