# Analysis of random error

In [None]:
import pandas as pd
import numpy as np
import glob

In [None]:
# Base folder that all other locations are derived from
project_path = './'

# Folder with the input CSV files (handed to load_EP further down)
data_path = '{}../data/random_error/'.format(project_path)

# Folder for graphs (presumably where figures are written - not used in this part of the file)
graphs_path = '{}../graphs/'.format(project_path)

### Functions

In [None]:
def load_EP(directory, silent=False):
    """Load every CSV file found below ``directory`` into one DataFrame.

    Each file is read with its first and third line skipped, so the second
    line supplies the column names, and ``-9999`` is treated as missing.
    The ``date`` and ``time`` columns are combined into a ``DateTime`` column
    placed first; after concatenation that timestamp is shifted by 15 minutes
    and the ``filename``, ``date`` and ``time`` columns are dropped.

    Parameters
    ----------
    directory : str
        Folder that is searched recursively for ``*.csv`` files.
    silent : bool, optional
        When True nothing is printed; otherwise a header line and the name of
        every file read are printed.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise (in sorted path order) with a fresh
        integer index.

    Raises
    ------
    ValueError
        If no CSV file is found below ``directory``.
    """
    # Local import keeps this function self-contained; os is only needed here
    # for separator-independent path handling.
    import os

    if not silent:
        print('  - Cospectra files:')

    # os.path.join works with or without a trailing separator on `directory`
    pattern = os.path.join(directory, '**', '*.csv')
    file_list = sorted(glob.glob(pattern, recursive=True))
    if not file_list:
        # pd.concat would fail on an empty list with a much less helpful message
        raise ValueError('No CSV files found under {!r}'.format(directory))

    data_list = []
    for fn in file_list:
        if not silent:
            # basename handles both '/' and '\\' style paths
            print('    -', os.path.basename(fn))
        # Read file (line 0 and line 2 are not data; line 1 is the header)
        temp = pd.read_csv(fn, skiprows=[0, 2], na_values=-9999)
        # Build the timestamp and move it to the first position
        temp['DateTime'] = pd.to_datetime(temp['date'] + ' ' + temp['time'],
                                          format='%Y-%m-%d %H:%M')
        temp.insert(0, 'DateTime', temp.pop('DateTime'))
        data_list.append(temp)

    # Combine all the read data
    df = pd.concat(data_list, axis=0, ignore_index=True)
    # Make the timestamp the middle of the half hour
    df['DateTime'] = df['DateTime'] + pd.Timedelta(minutes=15)
    # Drop columns that are no longer needed
    df.drop(columns=['filename', 'date', 'time'], inplace=True)
    return df

def add_ecosystem(df):
    """Return a copy of ``df`` with ecosystem, day and half-hour labels.

    Three columns are derived from ``df['DateTime']``:

    * ``Ecosystem`` - ``'PV desert background'`` for timestamps before
      2019-07-16 and ``'PV field'`` from that date on; rows with a missing
      timestamp stay NaN.
    * ``day`` - the date formatted as ``'%Y-%m-%d'``.
    * ``halfhour`` - the time of day formatted as ``'%H:%M'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column named ``DateTime``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose first four columns are ``DateTime``,
        ``Ecosystem``, ``day`` and ``halfhour``, followed by the remaining
        columns in their original order.
    """
    df = df.copy()
    # Start from an object-dtype column: writing strings into a float column
    # (plain np.nan) is an incompatible-dtype assignment that recent pandas
    # versions warn about.
    df['Ecosystem'] = pd.Series(np.nan, index=df.index, dtype=object)
    df.loc[df['DateTime'] < '2019-07-16', 'Ecosystem'] = 'PV desert background'
    df.loc[df['DateTime'] >= '2019-07-16', 'Ecosystem'] = 'PV field'
    # Half-hour and day identifiers
    df['halfhour'] = df['DateTime'].dt.strftime('%H:%M')
    df['day'] = df['DateTime'].dt.strftime('%Y-%m-%d')
    # Move the label columns to the front. Each one is inserted at position 0,
    # so the last name handled ends up first.
    for name in ('halfhour', 'day', 'Ecosystem', 'DateTime'):
        df.insert(0, name, df.pop(name))
    return df

In [None]:
# Read all CSV files below data_path, then label every record with its
# ecosystem, day and half hour
print('Loading data...')
df = add_ecosystem(load_EP(data_path))

print('Done...')

In [None]:
# Collect and print the names of all columns that hold a random error
err_cols = list(filter(lambda name: 'rand_err' in name, df.columns))
print(err_cols)
# Uncomment to list every column name:
#display(df.columns.values)

### Some stats on the errors

#### Sensible heat flux ($H$)

In [None]:
# Flag outliers in the random error of H with two alternative rules and build
# one filtered copy of df per rule (df2: z-score, df3: interquartile range)

h_err = df['rand_err_H']

# Rule 1 - z-score: outlier if more than 2 standard deviations from the mean
z = np.abs((h_err - h_err.mean()) / h_err.std())
threshold = 2.0
z_flag = z > threshold
outliers = df.loc[z_flag]
#display(outliers[['DateTime','Ecosystem', 'rand_err_H']])
# Data without the z-score outliers
df2 = df.drop(index=outliers.index).copy()

# Rule 2 - interquartile range (iqr): outlier if below q1 - 1.5*iqr or
# above q3 + 1.5*iqr
q1, q3 = h_err.quantile(0.25), h_err.quantile(0.75)
iqr = q3 - q1
threshold = 1.5
iqr_flag = (h_err < q1 - threshold*iqr) | (h_err > q3 + threshold*iqr)
outliers = df.loc[iqr_flag]
#display(outliers[['DateTime','Ecosystem', 'rand_err_H']])
# Data without the iqr outliers
df3 = df.drop(index=outliers.index).copy()

In [None]:
# Summary statistics (mean, std, max, min) of the random error of H per
# ecosystem, once for each outlier filter.
# Only 'rand_err_H' is aggregated: running 'mean'/'std' over every column also
# hits the string columns 'day' and 'halfhour', which pandas >= 2.0 rejects
# with a TypeError instead of silently dropping them.

# Means after z-score filter
print('Means after z-score filter:')
grouped = df2.groupby('Ecosystem')['rand_err_H'].agg(['mean','std','max','min'])
# Name the columns '<variable>_<statistic>' and turn the group key into a column
grouped = grouped.add_prefix('rand_err_H_').reset_index()

display(grouped[['Ecosystem',
                 'rand_err_H_mean','rand_err_H_std','rand_err_H_max','rand_err_H_min']])
print('---')
print()

# Means after iqr filter
print('Means after iqr filter:')
grouped = df3.groupby('Ecosystem')['rand_err_H'].agg(['mean','std','max','min'])
grouped = grouped.add_prefix('rand_err_H_').reset_index()

display(grouped[['Ecosystem',
                 'rand_err_H_mean','rand_err_H_std','rand_err_H_max','rand_err_H_min']])

#### Latent heat flux ($LE$)

In [None]:
# Flag outliers in the random error of LE with two alternative rules and build
# one filtered copy of df per rule (df2: z-score, df3: interquartile range)

le_err = df['rand_err_LE']

# Rule 1 - z-score: outlier if more than 2 standard deviations from the mean
z = np.abs((le_err - le_err.mean()) / le_err.std())
threshold = 2.0
z_flag = z > threshold
outliers = df.loc[z_flag]
#display(outliers[['DateTime','Ecosystem', 'rand_err_LE']])
# Data without the z-score outliers
df2 = df.drop(index=outliers.index).copy()

# Rule 2 - interquartile range (iqr): outlier if below q1 - 1.5*iqr or
# above q3 + 1.5*iqr
q1, q3 = le_err.quantile(0.25), le_err.quantile(0.75)
iqr = q3 - q1
threshold = 1.5
iqr_flag = (le_err < q1 - threshold*iqr) | (le_err > q3 + threshold*iqr)
outliers = df.loc[iqr_flag]
#display(outliers[['DateTime','Ecosystem', 'rand_err_LE']])
# Data without the iqr outliers
df3 = df.drop(index=outliers.index).copy()

In [None]:
# Summary statistics (mean, std, max, min) of the random error of LE per
# ecosystem, once for each outlier filter.
# Only 'rand_err_LE' is aggregated: running 'mean'/'std' over every column also
# hits the string columns 'day' and 'halfhour', which pandas >= 2.0 rejects
# with a TypeError instead of silently dropping them.

# Means after z-score filter
print('Means after z-score filter:')
grouped = df2.groupby('Ecosystem')['rand_err_LE'].agg(['mean','std','max','min'])
# Name the columns '<variable>_<statistic>' and turn the group key into a column
grouped = grouped.add_prefix('rand_err_LE_').reset_index()

display(grouped[['Ecosystem',
                 'rand_err_LE_mean','rand_err_LE_std','rand_err_LE_max','rand_err_LE_min']])
print('---')
print()

# Means after iqr filter
print('Means after iqr filter:')
grouped = df3.groupby('Ecosystem')['rand_err_LE'].agg(['mean','std','max','min'])
grouped = grouped.add_prefix('rand_err_LE_').reset_index()

display(grouped[['Ecosystem',
                 'rand_err_LE_mean','rand_err_LE_std','rand_err_LE_max','rand_err_LE_min']])

### Daily

#### Sensible heat flux ($H$)

In [None]:
# Rebuild the two outlier-filtered copies of df for the random error of H
# (df2: z-score rule, df3: interquartile-range rule)

h_err = df['rand_err_H']

# Rule 1 - z-score: outlier if more than 2 standard deviations from the mean
z = np.abs((h_err - h_err.mean()) / h_err.std())
threshold = 2.0
z_flag = z > threshold
outliers = df.loc[z_flag]
#display(outliers[['DateTime','Ecosystem', 'rand_err_H']])
# Data without the z-score outliers
df2 = df.drop(index=outliers.index).copy()

# Rule 2 - interquartile range (iqr): outlier if below q1 - 1.5*iqr or
# above q3 + 1.5*iqr
q1, q3 = h_err.quantile(0.25), h_err.quantile(0.75)
iqr = q3 - q1
threshold = 1.5
iqr_flag = (h_err < q1 - threshold*iqr) | (h_err > q3 + threshold*iqr)
outliers = df.loc[iqr_flag]
#display(outliers[['DateTime','Ecosystem', 'rand_err_H']])
# Data without the iqr outliers
df3 = df.drop(index=outliers.index).copy()

In [None]:
# Daily summary statistics (mean, std, max, min) of the random error of H per
# ecosystem and day, once for each outlier filter.
# Only 'rand_err_H' is aggregated: running 'mean'/'std' over every column also
# hits the string column 'halfhour', which pandas >= 2.0 rejects with a
# TypeError instead of silently dropping it.

# Means after z-score filter
print('Means after z-score filter:')
grouped = df2.groupby(['Ecosystem', 'day'])['rand_err_H'].agg(['mean','std','max','min'])
# Name the columns '<variable>_<statistic>' and turn the group keys into columns
grouped = grouped.add_prefix('rand_err_H_').reset_index()

display(grouped[['Ecosystem', 'day',
                 'rand_err_H_mean','rand_err_H_std','rand_err_H_max','rand_err_H_min']])
print('---')
print()

# Means after iqr filter
print('Means after iqr filter:')
grouped = df3.groupby(['Ecosystem', 'day'])['rand_err_H'].agg(['mean','std','max','min'])
grouped = grouped.add_prefix('rand_err_H_').reset_index()

display(grouped[['Ecosystem', 'day',
                 'rand_err_H_mean','rand_err_H_std','rand_err_H_max','rand_err_H_min']])

#### Latent heat flux ($LE$)

In [None]:
# Rebuild the two outlier-filtered copies of df for the random error of LE
# (df2: z-score rule, df3: interquartile-range rule)

le_err = df['rand_err_LE']

# Rule 1 - z-score: outlier if more than 2 standard deviations from the mean
z = np.abs((le_err - le_err.mean()) / le_err.std())
threshold = 2.0
z_flag = z > threshold
outliers = df.loc[z_flag]
#display(outliers[['DateTime','Ecosystem', 'rand_err_LE']])
# Data without the z-score outliers
df2 = df.drop(index=outliers.index).copy()

# Rule 2 - interquartile range (iqr): outlier if below q1 - 1.5*iqr or
# above q3 + 1.5*iqr
q1, q3 = le_err.quantile(0.25), le_err.quantile(0.75)
iqr = q3 - q1
threshold = 1.5
iqr_flag = (le_err < q1 - threshold*iqr) | (le_err > q3 + threshold*iqr)
outliers = df.loc[iqr_flag]
#display(outliers[['DateTime','Ecosystem', 'rand_err_LE']])
# Data without the iqr outliers
df3 = df.drop(index=outliers.index).copy()

In [None]:
# Daily summary statistics (mean, std, max, min) of the random error of LE per
# ecosystem and day, once for each outlier filter.
# Only 'rand_err_LE' is aggregated: running 'mean'/'std' over every column also
# hits the string column 'halfhour', which pandas >= 2.0 rejects with a
# TypeError instead of silently dropping it.

# Means after z-score filter
print('Means after z-score filter:')
grouped = df2.groupby(['Ecosystem', 'day'])['rand_err_LE'].agg(['mean','std','max','min'])
# Name the columns '<variable>_<statistic>' and turn the group keys into columns
grouped = grouped.add_prefix('rand_err_LE_').reset_index()

display(grouped[['Ecosystem', 'day',
                 'rand_err_LE_mean','rand_err_LE_std','rand_err_LE_max','rand_err_LE_min']])
print('---')
print()

# Means after iqr filter
print('Means after iqr filter:')
grouped = df3.groupby(['Ecosystem', 'day'])['rand_err_LE'].agg(['mean','std','max','min'])
grouped = grouped.add_prefix('rand_err_LE_').reset_index()

display(grouped[['Ecosystem', 'day',
                 'rand_err_LE_mean','rand_err_LE_std','rand_err_LE_max','rand_err_LE_min']])