In [1]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from scipy.stats import reciprocal, expon
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# read data
# NOTE: the path below is absolute and machine-specific; point DATA_PATH at
# your local copy of output.xlsx before running.
# df= pd.read_excel("/Users/sophi/Downloads/output.xlsx")
DATA_PATH = "/Users/milesyang/Desktop/College Work/Fall23/Quantifying-wetland-carbon-emissions/data/output.xlsx"
df = pd.read_excel(DATA_PATH)

# creates an object of the target variable NEE and an object of independent variables
# (every column that is not the target). A list comprehension is used instead of a
# set difference so the predictor order follows the spreadsheet's column order and
# is identical on every run; set ordering of strings changes between Python sessions.
NEE_column = ['NEE']
predictors_columns = [col for col in df.columns if col not in NEE_column]

# normalizing (Edwin): divide each predictor by its column maximum.
# NOTE(review): the maxima come from the full data set (before the train/test
# split), and a column whose maximum is 0 would divide by zero -- confirm intended.
df[predictors_columns] = df[predictors_columns]/df[predictors_columns].max()

# set x and y (Edwin)
X = df[predictors_columns].values
y = df[NEE_column].values                  # 2-D column vector, shape (n_samples, 1)
y_search = df[NEE_column].values.ravel()   # same target flattened to 1-D

In [3]:
# preview the first five rows of the frame after predictor normalization
df.head(n=5)

Unnamed: 0,NEE,SW_IN,TA,VPD,P,SWC,WS,TS,WTD,WTDdiff,PDSI,LAI_month_max,FAPAR_month_max,NDVI,SIF_daily_8day,SIF_month
0,0.376613,0.270556,0.500683,0.135737,0.0,0.119916,0.059561,0.466343,-0.178717,-0.003815,-0.375042,0.273564,0.652632,0.110719,0.258058,0.279368
1,0.208644,0.245086,0.539503,0.13335,0.0,0.118627,0.051346,0.48375,-0.189811,-0.016949,-0.375042,0.273564,0.652632,0.044956,0.262184,0.279368
2,0.254026,0.199968,0.563597,0.105293,0.00143,0.117435,0.041178,0.527933,-0.174279,0.023729,-0.375042,0.273564,0.652632,0.100687,0.26631,0.279368
3,0.285509,0.116114,0.466244,0.054379,0.004944,0.117869,0.080252,0.523674,-0.140995,0.050847,-0.375042,0.273564,0.652632,-0.012106,0.270435,0.279368
4,0.120944,0.241135,0.509501,0.137781,0.0,0.118973,0.06722,0.519149,-0.183154,-0.064407,-0.375042,0.273564,0.652632,0.131494,0.274561,0.279368


In [4]:
# split data (Edwin): hold out 30% of the rows for testing; the fixed seed
# keeps the partition the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=40,
    test_size=0.30,
)

In [5]:
# random search (NEW!)

# Define parameter distributions for randomized search.
# Keys are prefixed with 'svr__' because make_pipeline names each step after
# its lowercased class name, so the SVR step is addressed as 'svr'.
param_dist = {'svr__C': reciprocal(1e-4, 1e4),
              'svr__gamma': expon(scale=1.0),
              'svr__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
              'svr__epsilon': [0.1, 0.2, 0.5, 1.0]}

# Create a pipeline: standardize the features, then fit the SVR
pipe = make_pipeline(StandardScaler(), SVR())

# Randomized search: 10 sampled candidates, each scored with 5-fold CV.
# random_state seeds the sampling from the distributions above so the same
# candidates (and therefore the same best parameters) are drawn on every run.
random_search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=10, cv=5,
                                   refit=True, verbose=3, random_state=40)

# Fitting the model for randomized search (1-D target, full data set)
random_search.fit(X, y_search)

# Print best parameter after tuning
print(random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END svr__C=4.147114445005886, svr__epsilon=0.5, svr__gamma=0.003268194210335767, svr__kernel=rbf;, score=-0.296 total time=   7.3s
[CV 2/5] END svr__C=4.147114445005886, svr__epsilon=0.5, svr__gamma=0.003268194210335767, svr__kernel=rbf;, score=-0.026 total time=   6.8s
[CV 3/5] END svr__C=4.147114445005886, svr__epsilon=0.5, svr__gamma=0.003268194210335767, svr__kernel=rbf;, score=0.116 total time=   5.9s
[CV 4/5] END svr__C=4.147114445005886, svr__epsilon=0.5, svr__gamma=0.003268194210335767, svr__kernel=rbf;, score=0.020 total time=   7.1s
[CV 5/5] END svr__C=4.147114445005886, svr__epsilon=0.5, svr__gamma=0.003268194210335767, svr__kernel=rbf;, score=0.221 total time=   6.2s
[CV 1/5] END svr__C=0.0003547661184967199, svr__epsilon=1.0, svr__gamma=0.02793022531047679, svr__kernel=sigmoid;, score=-0.381 total time=   5.0s
[CV 2/5] END svr__C=0.0003547661184967199, svr__epsilon=1.0, svr__gamma=0.02793022531047679, sv

In [None]:
# Assess performance after random search: report the mean CV score of the
# winning candidate and the spread of its per-fold scores
best_candidate_mean = random_search.best_score_
fold_score_stds = random_search.cv_results_['std_test_score']
best_candidate_std = fold_score_stds[random_search.best_index_]

print("Mean cross-validated score:", best_candidate_mean)
print("Standard deviation of cross-validated score:", best_candidate_std)

In [None]:
# create model (Edwin)
# SVR.fit expects a 1-D target. y_train is a column vector of shape
# (n_samples, 1), so it is flattened here to avoid scikit-learn's
# DataConversionWarning; the fitted model is unchanged.
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train.ravel())

In [None]:
# test model (Edwin)
# For a regressor, .score() returns the coefficient of determination (R^2) on
# the given data, not a classification accuracy, so it is labelled as R^2.
test_set_rsquared = regressor.score(X_test, y_test)
print('Test-set R^2:',test_set_rsquared)

In [None]:
# actual v predicted NEE
predictions = regressor.predict(X_test)

# least-squares line of predicted NEE as a function of observed NEE
reg = LinearRegression()
reg.fit(y_test.reshape((-1, 1)), predictions)
a, b = reg.coef_, reg.intercept_

# draw everything through the Axes object returned by plt.subplots
fig, ax = plt.subplots(1, 1, figsize=(9, 9))
ax.scatter(y_test, predictions)
ax.plot([-15, 15], [-15, 15], color='k')      # 1:1 reference line
ax.plot(y_test, a * y_test + b, color='r')    # fitted trend line
ax.set_xlim([-15, 15])
ax.set_ylim([-15, 15])
ax.set_xlabel("NEE")
ax.set_ylabel("NEE Estimated")