## Random Forest Base

### Data Preparation

In [67]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import hvplot.xarray
import sys

%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

sys.path.insert(0, '../src')

from utils import df_to_xarray,read_xarray

In [2]:
# Reading Data
dir_name="../data/member_001"
chl,mld,sss,sst,u10,fg_co2,xco2,icefrac,patm,pco2=read_xarray(dir_name)

In [3]:
# Creating one singular df
data_read=xr.merge([mld.MLD,mld.MLD_socat,sst.SST,sst.SST_socat,sss.SSS,sss.SSS_socat,xco2])

In [98]:
tmp_data=data_read.to_dataframe().reset_index()


In [None]:
tmp_data=tmp_data.drop(columns=['bnds','TLONG', 'TLAT', 'time_bnds'])

In [99]:
chl_data=chl.Chl.to_dataframe().reset_index()
chl_data_socat=chl.Chl_socat.to_dataframe().reset_index()
pco2_data=pco2.pCO2.to_dataframe().reset_index()
pco2_data_socat=pco2.pCO2_socat.to_dataframe().reset_index()

In [100]:
tmp_data["Chl_socat"]=chl_data_socat["Chl_socat"]
tmp_data["Chl"]=chl_data["Chl"]
tmp_data["pCO2_socat"]=pco2_data_socat["pCO2_socat"]
tmp_data["pCO2"]=pco2_data["pCO2"]

In [101]:
features_socat = ['time','xlon', 'ylat','MLD_socat', 'SST_socat', 'SSS_socat','Chl_socat', 'XCO2','pCO2_socat']
features = ['time','xlon', 'ylat','MLD','SST','SSS','Chl','XCO2','pCO2']

# create separate dataframe for socat
combined_socat=tmp_data.loc[:,features_socat]
combined=tmp_data.loc[:,features]

In [102]:
# drop rows where pco2 or pco2_socat == NA or 0
combined_socat.dropna(subset = ["pCO2_socat"], inplace=True)
combined_socat= combined_socat[combined_socat['pCO2_socat']!=0]
combined.dropna(subset = ["pCO2"], inplace=True)
combined= combined[combined['pCO2']!=0]



In [103]:
#separating X and y
X_socat=combined_socat.iloc[:,3:-1]
X=combined.iloc[:,3:-1]
y=combined.loc[:,'pCO2']
y_socat=combined_socat.loc[:,'pCO2_socat']


### Imputation
We can save 6452246 rows through imputation.

Try Building a Custom Imputation based on lon and lat?
https://towardsdatascience.com/coding-a-custom-imputer-in-scikit-learn-31bd68e541de


Also, consider using Hurdle Model?

https://geoffruddock.com/building-a-hurdle-regression-estimator-in-scikit-learn/

#### Two Different Imputation Methods
- KNNImputer: fill in the average of the 2 nearest neighbors, takes a long time to train
- Simple Imputer: just fill in using the average

In [23]:
X_socat.describe()

Unnamed: 0,MLD_socat,SST_socat,SSS_socat,Chl_socat,XCO2
count,132903.0,132903.0,132903.0,245577.0,245577.0
mean,1.023873,0.30543,0.659811,0.365122,382.684174
std,10.095239,2.532298,4.656825,0.820338,15.27244
min,0.0,-1.82534,0.0,-0.08963,340.848541
25%,0.0,0.0,0.0,0.124559,371.56842
50%,0.0,0.0,0.0,0.178825,384.370575
75%,0.0,0.0,0.0,0.222653,394.913391
max,1193.783569,30.828823,36.89407,13.049178,407.208405


In [25]:
X.describe()

Unnamed: 0,MLD,SST,SSS,Chl,XCO2
count,10838220.0,10838220.0,10838220.0,17290470.0,17290470.0
mean,65.33868,13.19081,33.75592,0.34495,370.186
std,53.73071,11.53312,1.569614,0.8521562,18.69547
min,7.500032,-1.936021,13.76396,-0.4092084,340.8485
25%,34.66645,0.6017284,33.24879,0.1087646,354.7707
50%,55.51451,13.52542,33.85484,0.1580932,368.1608
75%,83.6946,24.82988,34.5995,0.2111136,385.4302
max,1868.325,34.07636,43.05632,14.67028,407.2084


In [174]:
#We can save this many rows through imputation.
# These rows have xCO2, pXO2 and CHL, but no MLD, SSS, SST
combined_socat.isna().sum()

time               0
xlon               0
ylat               0
MLD_socat     112674
SST_socat     112674
SSS_socat     112674
Chl_socat          0
XCO2               0
pCO2_socat         0
dtype: int64

In [26]:
# Two Different Imputation Methods

# KNNImputer
# from sklearn.impute import KNNImputer
# imp = KNNImputer(n_neighbors=2)
# X=imp.fit_transform(X)
# X_socat=imp.fit_transform(X_socat)

# SimpleImputer
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X=imp.fit_transform(X)
X_socat=imp.fit_transform(X_socat)

### Modeling - Random Forest


Uses train_test_split build into sklearn.model_selection


By default this method shuffles the data (30% = testing 70%=training/validation)
- Will test validation via 7-fold cross validation

Train  = 70%, Test   = 30%

In [27]:

X_train, X_test, y_train, y_test = train_test_split(X_socat, y_socat, test_size=0.3, random_state= 73)


In [28]:
regressor=RandomForestRegressor(n_estimators=20, random_state=42)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=20, random_state=42)

In [29]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

forest_scores = cross_val_score(regressor, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=7)
forest_rmse_scores = np.sqrt(-forest_scores)

In [30]:
display_scores(forest_rmse_scores)

Scores: [31.30501344 30.48264106 30.56466388 30.81977593 31.88391381 30.6039626
 30.76351001]
Mean: 30.917640103017195
Standard deviation: 0.46740719704756006


In [31]:
#Fine Tuning Using RandomizedSearch

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=20, high=50),
        'max_features': randint(low=1, high=6),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=7, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)

RandomizedSearchCV(cv=7, estimator=RandomForestRegressor(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x13efec250>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x13efec4f0>},
                   random_state=42, scoring='neg_mean_squared_error')

In [32]:
final_model =rnd_search.best_estimator_


### Final Result

Test Set RMSE: 30.56

Whole Grid Rmse: 42.12

In [33]:
y_pred=final_model.predict(X_test)
test_mse=mean_squared_error(y_test, y_pred,squared=True)
np.sqrt(test_mse)

30.55997229013893

In [56]:
## The whole grid
y_pred=final_model.predict(X)
final_test_rmse=np.sqrt(mean_squared_error(y, y_pred,squared=True))
error=y-y_pred

In [35]:
final_test_rmse

42.11730847320173

### Visualization of the Residual

In [104]:
combined["residual"]=np.abs(error)

In [105]:
result_data=combined[["time","xlon","ylat","residual"]]

In [106]:
cols=result_data.columns.tolist()
cols=[cols[0],cols[2],cols[1],cols[3]]
result_data=result_data[cols]
result_data.columns=['time','lat','lon','residual']

In [107]:
result_data['time'].iloc[0]

cftime.DatetimeNoLeap(1982, 2, 1, 0, 0, 0, 0, has_year_zero=True)

In [69]:
ds=df_to_xarray(result_data[:100])

# p="../data/rf_residuals.nc"
# ds.to_netcdf(path=p)

In [145]:
a=result_data[:100]



In [171]:

def tmp_df(df_in=None):
    dates = xr.cftime_range(start=f'1982-02-01', end=f'2018-12-01',freq='MS') 
    ds_skeleton = xr.Dataset({'lon':np.arange(0.5, 360, 1), 
                              'lat':np.arange(-89.5, 90, 1),
                              'time':dates})    
    # make dataframe
    skeleton = df_in.reset_index()[['time','lat','lon']]
    # Merge predictions with df_all dataframe
    df_out = skeleton.merge(df_in, how = 'left', on = ['time','lat','lon'])
    # convert to xarray dataset
    # old way to `dimt, = ds_skeleton.time.shape` ect. to get dimensions
    # then reshape  `df_out.values.reshape(dim_lat, dim_lon, dim_time)`
    # finally create a custom dataset
    df_out.set_index(['time', 'lat','lon'], inplace=True)
    ds = df_out.to_xarray()
    return ds

ds=tmp_df(result_data)
              


In [173]:
a=ds.residual.hvplot(groupby="time",width=512,height=512, widget_type='scrubber', widget_location='bottom')
a