# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [4]:
import pandas as pd
import numpy as np
import folium


In [6]:
# we load the data we have saved after wrangling and pre-processing in block I
X=pd.read_csv('../../DATA/train_cleaned.csv')
drop_columns=['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','key','pickup_datetime','pickup_date','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3']
X=X.drop(drop_columns,axis=1)
X=pd.get_dummies(X)# one hot coding
#generate labels
y=X['fare_amount']
X=X.drop(['fare_amount'],axis=1)

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is a AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.   



In [8]:
# install 
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.7.4-py2.py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 1.0 MB/s eta 0:00:01
Collecting pyaml>=16.9
  Downloading pyaml-20.4.0-py2.py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-20.4.0 scikit-optimize-0.7.4


### E 2.1 Bayesian Optimization of a Random Forest Regression Model
use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SMV will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to rum the job for up to 12h 


In [10]:
#import
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

In [58]:
#split train and test
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy()[:10000,:], y.to_numpy()[:10000]) #subsample

In [59]:
#fisrt, simple example using only RF
opt = BayesSearchCV( 
         RandomForestRegressor(),
         {
             'n_estimators':Integer(10,200, prior='log-uniform'),
             'max_depth':Integer(2,20,prior='log-uniform')
         },
         n_iter=32,
         random_state=0,
         n_jobs=4, #parallelize
         cv=5, # set to 5 folds
         scoring='neg_mean_squared_error' #eval MSE as before -> see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter for other scoring methods
     )

In [60]:
opt.fit(X_train,y_train)

BayesSearchCV(cv=5, error_score='raise',
              estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                              criterion='mse', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False...
              fit_params=None, iid=True, n_iter=32, n_jobs=4, n_points=1,
              optimizer_kwargs=None,

In [61]:
opt.best_score_

-22.211899628933683

In [62]:
opt.best_params_

OrderedDict([('max_depth', 14), ('n_estimators', 127)])

In [63]:
#opt.cv_results_

In [56]:
#now, more complicated example with several model types
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.datasets import load_digits
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


# pipeline class is used as estimator to enable
# search over different model types
pipe = Pipeline([
    ('model', SVR()) #just put one model as placeholder
])

# single categorical value of 'model' parameter is
# sets the model class
# We will get ConvergenceWarnings because the problem is not well-conditioned.
# But that's fine, this is just an example.
rf_search = {
    'model': Categorical([RandomForestRegressor()]),
    'model__n_estimators': Integer(10, 100, 'log-uniform'),
    
}

# explicit dimension classes can be specified like this
svr_search = {
    'model': Categorical([SVR()]),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
}

lin_search = {
    'model' : Categorical([LinearRegression()]),
}

opt = BayesSearchCV(
    pipe,
    [(svr_search, 10), (rf_search, 10), (lin_search,1)], # (parameter space, # of evaluations)
    cv=5,
     n_iter=32,
    random_state=0,
    n_jobs=4, #parallelize
    scoring='neg_mean_squared_error'
)

In [57]:
opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))

val. score: -20.956843783427853
test score: -25.538474243667295


In [55]:
opt.best_params_

OrderedDict([('model',
              RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                                    max_samples=None, min_impurity_decrease=0.0,
                                    min_impurity_split=None, min_samples_leaf=1,
                                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                                    n_estimators=33, n_jobs=None, oob_score=False,
                                    random_state=None, verbose=0, warm_start=False)),
             ('model__n_estimators', 33)])