In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.stats.diagnostic import linear_harvey_collier
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.tsa.stattools import pacf
from sklearn.preprocessing import MinMaxScaler
import statsmodels
from sklearn import datasets


from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

plt.style.use('ggplot')

pd.options.display.max_columns = None  # display all columns when I print a dataframe
pd.options.display.max_seq_items = 5000 # display larger numbers of rows in pandas when I print out a Series or Index
pd.options.display.max_rows = 5000

In [2]:
boston_data = datasets.load_boston()
df = pd.DataFrame(boston_data.data,columns=boston_data.feature_names)
df['target'] = pd.Series(boston_data.target)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
X = df.drop(columns=['target'])
Y = df['target'] 

### Baseline Random Forest

In [4]:
rf_base = RandomForestRegressor(random_state=0, oob_score=True)

#### K-fold cross-validation

- the samples in different folds are continuous rather than taken randomly as in Startifed Sampling

- from sklearn.metrics import make_scorer: you can make your own score

- neg_root_mean_squared_error (https://github.com/scikit-learn/scikit-learn/issues/2439)

- greater_is_better : boolean, default=True
- Whether score_func is a score function (default), meaning high is good, or a loss function, meaning low is good. In the latter case, the scorer object will sign-flip the outcome of the score_func.


In [5]:
scores = cross_validate(rf_base, X, Y, cv=3, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True)
print(scores)
print('\n')
print(f"Mean RMSE test: {np.mean(scores['test_neg_root_mean_squared_error'])}")
print(f"Std RMSE test: {np.std(scores['test_neg_root_mean_squared_error'])}")
print('\n')
print(f"Mean R^2 test: {np.mean(scores['test_r2'])}")
print(f"Std R^2 test: {np.std(scores['test_r2'])}")

{'fit_time': array([0.29525399, 0.29782987, 0.29695702]), 'score_time': array([0.01135206, 0.01129079, 0.01090837]), 'test_r2': array([0.81149539, 0.56619126, 0.3091955 ]), 'train_r2': array([0.98266133, 0.97999717, 0.98681152]), 'test_neg_root_mean_squared_error': array([-3.05731614, -5.65657214, -6.74150736]), 'train_neg_root_mean_squared_error': array([-1.3264661 , -1.1209407 , -0.98093352])}


Mean RMSE test: -5.1517985488474665
Std RMSE test: 1.5458360232814086


Mean R^2 test: 0.5622940492988148
Std R^2 test: 0.20508158715824418


#### Startified cross-validation 

- Generate test sets such that all contain the same distribution of classes, or as close as possible.

- Shuffle split: Shuffle the data before diving into test/train sets

- **Looks like cross_validate takes Statified by default when we are classifying and K-Fold otherwise**

- **Pros and Cons of each cv splitting strategy**

### Hyperparameter Optimization 

In [6]:
param_grid_instance = {"max_depth": [1, 3, 5, None],
              "max_features": ['auto', 0.5, 0.8],
              "min_samples_leaf": [1, 2, 5, 11],
              "n_estimators": [100, 300, 500],
              "max_samples": [None, 0.8, 0.5],
              "max_leaf_nodes": [None, 10, 20]}
#"bootstrap": [True, False],
#ValueError: Out of bag estimation only available if bootstrap=True

#### Grid Search CV
- estimator
- splitting startegy: Kfold, Statified, Shuffle, Startified-Shuffle
- scorer
- parameter grid 


- make_scorer can be used here

In [None]:
hyper_opt_rf = GridSearchCV(rf_base, param_grid=param_grid_instance, cv=5)
hyper_opt_rf.fit(X, Y)

In [None]:
# result of all param combinations
hyper_opt_rf.cv_results_

In [None]:
hyper_opt_rf.best_params_

In [None]:
#By default, parameter search uses the score function of the estimator to evaluate a parameter setting. 
#These are the sklearn.metrics.accuracy_score for classification and sklearn.metrics.r2_score for regression. 
hyper_opt_rf.scorer_

In [None]:
hyper_opt_rf.best_score_

In [None]:
 ### Use the best params to refit data
print(hyper_opt_rf.best_estimator_)

In [None]:
scores = cross_validate(hyper_opt_rf.best_estimator_, X, Y, cv=5, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True)
print(f"Mean RMSE test: {np.mean(scores['test_neg_root_mean_squared_error'])}")
print(f"Std RMSE test: {np.std(scores['test_neg_root_mean_squared_error'])}")
print('\n')
print(f"Mean R^2 test: {np.mean(scores['test_r2'])}")
print(f"Std R^2 test: {np.std(scores['test_r2'])}")

### Multiple scorers/custom scorers in Grid Search CV

- You have to specify refit parameter in grid search on which refit will be done

- Error otherwie= ValueError: For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best parameter setting on the whole data and make the best_* attributes available for that metric. If this is not needed, refit should be set to False explicitly. True was passed.

In [None]:
# param_grid_instance = {"max_depth": [1, 3, 5, None],
#               "max_features": ['auto', 0.5, 0.8],
#               "min_samples_leaf": [1, 2, 5, 11],
#               "n_estimators": [100, 300, 500],
#               "max_samples": [None, 0.8, 0.5],
#               "max_leaf_nodes": [None, 10, 20]}
scoring = {'r-squared': 'r2', 'exp_variance': 'explained_variance'}
param_grid={'criterion': 'mse',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}
gs = GridSearchCV(rf_base, param_grid=param_grid_instance, cv=5, scoring=scoring, refit='r-squared', return_train_score=False)
gs.fit(X, Y)
results = gs.cv_results_

In [None]:
results

In [None]:
gs.best_score_

In [None]:
gs.scorer_

In [None]:
gs.best_estimator_

In [None]:
gs.best_index_

In [None]:
gs.cv_results_

### Visualizing Training on various metrics:

[1] https://scikit-learn.org/stable/modules/learning_curve.html

[2] https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py

### See also
-  ParameterGrid: generates all the combinations of a hyperparameter grid.

- sklearn.model_selection.train_test_split: utility function to split the data into a development set usable for fitting a GridSearchCV instance and an evaluation set for its final evaluation.

- sklearn.metrics.make_scorer: Make a scorer from a performance metric or loss function.

[1] https://scikit-learn.org/stable/modules/model_evaluation.html#multimetric-scoring

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

In [None]:
### Dummy estimator
# [1] https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
