### Import Data Set

In [81]:
import plotly.express as px
import pandas as pd

data = pd.read_csv('../data/final.csv')
data.drop(columns=['Unnamed: 0'],inplace=True)
data.head()


Unnamed: 0,long,lat,year,TimePeriod,RCP,scenario,treecanopy_percent,Ann_Herb_percent,Bare_percent,Herb_percent,...,PPT_Winter,PPT_Summer,T_Winter,T_Summer,Tmax_Summer,Tmin_Winter,VWC_Winter_whole,VWC_Spring_whole,VWC_Summer_whole,VWC_Fall_whole
0,-110.047,37.604,1980,Hist,historical,sc1,0.0,0.0,78.505,4.673,...,13.79,2.69,0.965,23.159,37.05,-12.45,0.113,0.097,0.042,0.052
1,-110.047,37.604,1981,Hist,historical,sc1,0.0,0.0,78.505,4.673,...,2.25,9.39,3.334,23.271,37.55,-9.35,0.049,0.061,0.043,0.094
2,-110.047,37.604,1982,Hist,historical,sc1,0.0,0.0,78.505,4.673,...,4.12,9.5,-0.016,22.057,36.65,-16.55,0.109,0.075,0.046,0.068
3,-110.047,37.604,1983,Hist,historical,sc1,0.0,0.0,78.505,4.673,...,7.09,10.22,0.409,21.328,34.55,-15.05,0.12,0.103,0.044,0.075
4,-110.047,37.604,1984,Hist,historical,sc1,0.0,0.0,78.505,4.673,...,4.77,9.49,-1.047,21.96,35.35,-18.45,0.12,0.078,0.043,0.07


In [69]:
cleaned_data = data.drop(columns=['TimePeriod', 'RCP', 'scenario'])

In [70]:
from sklearn.model_selection import train_test_split

train_data = cleaned_data[cleaned_data['year'] < 2021]
test_data = cleaned_data[cleaned_data['year'] >= 2021]

predictors = train_data.drop(['DrySoilDays_Summer_whole', 'Evap_Summer',
                              'ExtremeShortTermDryStress_Summer_whole', 'FrostDays_Winter',
                              'NonDrySWA_Summer_whole', 'PPT_Winter', 'PPT_Summer', 'T_Winter',
                              'T_Summer', 'Tmax_Summer', 'Tmin_Winter', 'VWC_Winter_whole',
                              'VWC_Spring_whole', 'VWC_Summer_whole', 'VWC_Fall_whole'], axis=1)
targets = train_data[['DrySoilDays_Summer_whole', 'Evap_Summer',
                      'ExtremeShortTermDryStress_Summer_whole', 'FrostDays_Winter',
                      'NonDrySWA_Summer_whole', 'PPT_Winter', 'PPT_Summer', 'T_Winter',
                      'T_Summer', 'Tmax_Summer', 'Tmin_Winter', 'VWC_Winter_whole',
                      'VWC_Spring_whole', 'VWC_Summer_whole', 'VWC_Fall_whole']]

X_train, X_valid, y_train, y_valid = train_test_split(predictors, targets, test_size=0.2, random_state=42)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape


((3525, 9), (882, 9), (3525, 15), (882, 15))

### Multi-output GBM

In [71]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

gbm = GradientBoostingRegressor(random_state=42)

multioutput_gbm = MultiOutputRegressor(gbm)

multioutput_gbm.fit(X_train, y_train)

y_pred_gbm = multioutput_gbm.predict(X_valid)

rmse_gbm = np.sqrt(mean_squared_error(y_valid, y_pred_gbm))
rmse_gbm


2.4048735840891675

### Neural Network

In [72]:
from sklearn.neural_network import MLPRegressor
nn = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

multioutput_nn = MultiOutputRegressor(nn)

multioutput_nn.fit(X_train, y_train)

y_pred_nn = multioutput_nn.predict(X_valid)

rmse_nn = np.sqrt(mean_squared_error(y_valid, y_pred_nn))
rmse_nn


6.1298204920474975

### Logistic Regression

In [86]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()

multioutput_lr = MultiOutputRegressor(linear_reg)

multioutput_lr.fit(X_train, y_train)

y_pred_lr = multioutput_lr.predict(X_valid)

rmse_lr = np.sqrt(mean_squared_error(y_valid, y_pred_lr))
rmse_lr


6.006463655660495

### Random Forest

In [74]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

multioutput_rf = MultiOutputRegressor(rf)

multioutput_rf.fit(X_train, y_train)

y_pred_rf = multioutput_rf.predict(X_valid)

rmse_rf = np.sqrt(mean_squared_error(y_valid, y_pred_rf))
rmse_rf


2.2181062671685168

### SVM

In [75]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

svm_model = MultiOutputRegressor(SVR())
svm_model.fit(X_train_scaled, y_train)

svm_predictions = svm_model.predict(X_valid_scaled)
svm_rmse = np.sqrt(mean_squared_error(y_valid, svm_predictions))
svm_rmse

6.462315059025544

### KNN

In [76]:
knn_model = MultiOutputRegressor(KNeighborsRegressor())
knn_model.fit(X_train_scaled, y_train)

knn_predictions = knn_model.predict(X_valid_scaled)
knn_rmse = np.sqrt(mean_squared_error(y_valid, knn_predictions))

knn_rmse

3.4243566569531936

### Decision Tree

In [77]:
from sklearn.tree import DecisionTreeRegressor

dt_model = MultiOutputRegressor(DecisionTreeRegressor(random_state=42))
dt_model.fit(X_train_scaled, y_train)

dt_predictions = dt_model.predict(X_valid_scaled)
dt_rmse = np.sqrt(mean_squared_error(y_valid, dt_predictions))

dt_rmse

2.233301096934572

### Model Selection
It seeems that Random Forest Model has the lowest RMSE. Therefore, I decide to choose it as the most fit model to do the prediction.

### Prediction

In [78]:
data2 = pd.read_csv('../data/nearterm_cleaned.csv')

contextual_data = data2[['long','lat','year','TimePeriod', 'RCP', 'scenario']]

In [79]:
X_test = data2.drop(['TimePeriod', 'RCP', 'scenario',
                     'DrySoilDays_Summer_whole', 'Evap_Summer', 'ExtremeShortTermDryStress_Summer_whole',
                     'FrostDays_Winter', 'NonDrySWA_Summer_whole', 'PPT_Winter', 'PPT_Summer', 'T_Winter',
                     'T_Summer', 'Tmax_Summer', 'Tmin_Winter', 'VWC_Winter_whole',
                     'VWC_Spring_whole', 'VWC_Summer_whole', 'VWC_Fall_whole'], axis=1)

predictions_rf = multioutput_rf.predict(X_test)

predicted_columns = [
    'DrySoilDays_Summer_whole', 'Evap_Summer', 'ExtremeShortTermDryStress_Summer_whole',
    'FrostDays_Winter', 'NonDrySWA_Summer_whole', 'PPT_Winter', 'PPT_Summer', 'T_Winter',
    'T_Summer', 'Tmax_Summer', 'Tmin_Winter', 'VWC_Winter_whole',
    'VWC_Spring_whole', 'VWC_Summer_whole', 'VWC_Fall_whole'
]
predictions_df = pd.DataFrame(predictions_rf, columns=predicted_columns, index=X_test.index)

predictions_df

Unnamed: 0,DrySoilDays_Summer_whole,Evap_Summer,ExtremeShortTermDryStress_Summer_whole,FrostDays_Winter,NonDrySWA_Summer_whole,PPT_Winter,PPT_Summer,T_Winter,T_Summer,Tmax_Summer,Tmin_Winter,VWC_Winter_whole,VWC_Spring_whole,VWC_Summer_whole,VWC_Fall_whole
0,0.00,2.14072,35.2470,69.0,0.01133,2.6700,4.4800,2.1310,24.55000,35.950,-12.15,0.04704,0.04390,0.04214,0.09343
1,0.00,2.14072,35.2470,69.0,0.01133,2.6700,4.4800,2.1310,24.55000,35.950,-12.15,0.04704,0.04390,0.04214,0.09343
2,0.00,2.14072,35.2470,69.0,0.01133,2.6700,4.4800,2.1310,24.55000,35.950,-12.15,0.04704,0.04390,0.04214,0.09343
3,0.00,2.14072,35.2470,69.0,0.01133,2.6700,4.4800,2.1310,24.55000,35.950,-12.15,0.04704,0.04390,0.04214,0.09343
4,0.00,2.14072,35.2470,69.0,0.01133,2.6700,4.4800,2.1310,24.55000,35.950,-12.15,0.04704,0.04390,0.04214,0.09343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18075,6.73,2.64730,33.0973,72.0,0.22235,2.9548,6.1216,1.0278,22.73946,34.117,-13.84,0.09297,0.10247,0.08987,0.12108
18076,6.73,2.64730,33.0973,72.0,0.22235,2.9548,6.1216,1.0278,22.73946,34.117,-13.84,0.09297,0.10247,0.08987,0.12108
18077,6.73,2.64730,33.0973,72.0,0.22235,2.9548,6.1216,1.0278,22.73946,34.117,-13.84,0.09297,0.10247,0.08987,0.12108
18078,6.73,2.64730,33.0973,72.0,0.22235,2.9548,6.1216,1.0278,22.73946,34.117,-13.84,0.09297,0.10247,0.08987,0.12108


In [80]:
merged_predictions = pd.concat([contextual_data.reset_index(drop=True), predictions_df], axis=1)

output_path = '../data/predict_outcome.csv'
merged_predictions.to_csv(output_path, index=False)