In [6]:
###### model validation
###### leave one-site out
###### loop all sites
###### created by Qing Ying (qying@umd.edu)


import os
import sys
import urllib.request
import re
import argparse
import math
import numpy as np
import pandas as pd
import netCDF4 as nc
import xarray as xr
import matplotlib.pyplot as plt
import datetime
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import pickle
from scipy import stats

    
df = pd.read_csv('flux_data.csv', index_col=0, parse_dates=True)

x_col = ['nbar1','nbar2','nbar3','nbar4','nbar5','nbar6','nbar7','sm_r_wetness','sm_s_wetness','sm_p_wetness','pa','tas','rsds','rsdl','le','h','spfh','ts1','ts2','ts3','dem','slope','spi','cti']
y_col = ['FCH4_mean']
dfs = df[['NewID','FCH4_count']+ y_col + x_col]
dfs = dfs.dropna()
dfs = dfs.loc[dfs['FCH4_count']>11]
dfs.info()

site_df = pd.read_csv('site_list.csv')
filter_df = site_df.loc[(site_df['WETLAND_CL']=='Fen') | (site_df['WETLAND_CL']=='Bog') | (site_df['WETLAND_CL']=='Wet tundra')]
print(filter_df)

models_df = pd.DataFrame()
predicts_df = pd.DataFrame()

n = 0
for row in filter_df.itertuples(index=True, name='Pandas'):
    id = row.NewID
    # val df
    dfss = dfs.loc[dfs['NewID']==id]
    # train df
    dfm = dfs.loc[dfs['NewID']!=id]
    if dfss.empty:
        continue
    else:
        n=n+1
        x = dfm[x_col]
        y = dfm[y_col]
        regressor = RandomForestRegressor(n_estimators=100, oob_score=True,  max_samples=0.8, max_features="sqrt", max_depth=10, min_impurity_decrease=0.1, min_samples_split=10, min_samples_leaf=4)
        regressor = regressor.fit(x,y)
        y_pred = regressor.predict(x)
        dfm['FCH4_pre'] = y_pred
        xhat = dfss[x_col]
        yhat = dfss[y_col]
        yhat_pred = regressor.predict(xhat)
        dfss['FCH4_pre']= yhat_pred
        tree_preds = np.stack([t.predict(xhat) for t in regressor.estimators_], axis=1)
        pred_variance = np.var(tree_preds, axis=1)
        dfss['FCH4_err']=np.sqrt(pred_variance)
        predicts_df = predicts_df.append(dfss, ignore_index=False)
        # model test
        rmse = np.sqrt(metrics.mean_squared_error(y, y_pred))
        r2 = regressor.score(x,y)
        # model validation of independent site
        mean = dfss['FCH4_mean'].mean()
        sd = dfss['FCH4_mean'].std()       
        dfss = dfss.loc[dfss['FCH4_mean'] <= mean+(3*sd)]
        yhat = dfss['FCH4_mean']
        yhat_pred = dfss['FCH4_pre']
        valrmse = np.sqrt(metrics.mean_squared_error(yhat, yhat_pred))
        valmae = metrics.mean_absolute_error(yhat, yhat_pred)
        slope, intercept, r_value, p_value, se = stats.linregress(yhat, yhat_pred)
        valr2 = r_value * r_value
        print(id, y.shape, y_pred.shape, yhat.shape, yhat_pred.shape)
        models_row = {"NewID": id,
                      "mRMSE": rmse,
                      "mR2": r2,
                      "valMAE": valmae,
                      "valRMSE": valrmse,
                      "valR2": valr2,
                      }
        models_df = models_df.append(models_row, ignore_index=True)
        pickle.dump(regressor, open('model_%s.sav'%(id),'wb'))
print(n)

predicts_df.to_csv('validation_obs_preds_error.csv',index=True, header=True)
models_df.to_csv('validation_models.csv',index=True, header=True)



<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 800 entries, 2015-04-11 to 2017-11-05
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   NewID         800 non-null    object 
 1   FCH4_count    800 non-null    int64  
 2   FCH4_mean     800 non-null    float64
 3   nbar1         800 non-null    float64
 4   nbar2         800 non-null    float64
 5   nbar3         800 non-null    float64
 6   nbar4         800 non-null    float64
 7   nbar5         800 non-null    float64
 8   nbar6         800 non-null    float64
 9   nbar7         800 non-null    float64
 10  sm_r_wetness  800 non-null    float64
 11  sm_s_wetness  800 non-null    float64
 12  sm_p_wetness  800 non-null    float64
 13  pa            800 non-null    float64
 14  tas           800 non-null    float64
 15  rsds          800 non-null    float64
 16  rsdl          800 non-null    float64
 17  le            800 non-null    float64
 18  h          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CASCB (327, 1) (327,) (473,) (473,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CASCC (473, 1) (473,) (327,) (327,)
2
