# 7.06 Decision Trees Parameters, Cross Validation and Hyperparameter search

In [1]:
#conda install dtreeviz 
!pip install dtreeviz 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dtreeviz
  Downloading dtreeviz-2.2.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.5/91.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting colour (from dtreeviz)
  Downloading colour-0.1.5-py2.py3-none-any.whl (23 kB)
Installing collected packages: colour, dtreeviz
Successfully installed colour-0.1.5 dtreeviz-2.2.1


In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import dtreeviz
import graphviz
import graphviz.backend as be
from IPython.display import Image, display_svg, SVG
import warnings
warnings.filterwarnings( "ignore", module = "matplotlib\..*" )

In [3]:
# Plot configuration: inline figures rendered in 'retina' format.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import matplotlib.font_manager 
fm = matplotlib.font_manager
# NOTE(review): _get_fontconfig_fonts is a private matplotlib helper; clearing its
# cache presumably forces the installed fonts to be looked up again -- confirm it
# exists in the matplotlib version in use.
fm._get_fontconfig_fonts.cache_clear()
# Default font family for all figures drawn below.
plt.rcParams['font.family'] = 'Times New Roman'

In [4]:
# Load the California housing dataset through scikit-learn.
housing = fetch_california_housing()

# Wrap features and target in DataFrames with explicit column names.
X = pd.DataFrame(housing['data'], columns=housing['feature_names'])
y = pd.DataFrame(housing['target'], columns=['labels'])

# Preview the first rows of the features, then of the target.
display(X.head(), y.head())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


Unnamed: 0,labels
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [5]:
# The geographic coordinates are left out of the feature set.
# errors="ignore" keeps this cell idempotent: running it a second time, when
# the columns are already gone, no longer raises a KeyError.
X = X.drop(columns=["Latitude", "Longitude"], errors="ignore")

# Reduce the single-column target frame to a 1-D Series.
# The guard makes a re-run of this cell a no-op instead of a KeyError.
if isinstance(y, pd.DataFrame):
    y = y["labels"]

In [6]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# DataFrame versions of every split, used to visualize the decision tree.
# The targets are Series named "labels", so they are converted with to_frame():
# pd.DataFrame(series, columns=["Price"]) would look for a column called "Price"
# inside the Series, find none, and return a frame without the price values.
X_train_df = pd.DataFrame(X_train, columns=X.columns)
y_train_df = y_train.to_frame(name="Price")

X_test_df = pd.DataFrame(X_test, columns=X.columns)
y_test_df = y_test.to_frame(name="Price")

In [7]:
# Inspect the dtype of every feature column in the training frame.
X_train_df.dtypes

MedInc        float64
HouseAge      float64
AveRooms      float64
AveBedrms     float64
Population    float64
AveOccup      float64
dtype: object

Let's try to train an oversimplified decision tree.

In [8]:
# Deliberately shallow tree: at most 3 levels of splits.
regr = DecisionTreeRegressor(max_depth=3)
regr.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` is the same fitted object.
model = regr

In [9]:
# R2 of the fitted tree on the data it was trained on and on the held-out data.
train_r2 = regr.score(X_train, y_train)
test_r2 = regr.score(X_test, y_test)
print("Train data R2 was: {:.2f} ".format(train_r2))
print("Test data R2 was: {:.2f}".format(test_r2))

Train data R2 was: 0.54 
Test data R2 was: 0.52


Let's visualize the decision tree.

In [10]:
# Build the dtreeviz adaptor for the fitted tree and render it.
# feature_names must describe the columns the tree was actually trained on:
# housing.feature_names still lists Latitude and Longitude (8 names), while the
# training frame only holds the 6 remaining features.
viz = dtreeviz.model(regr,
                     X_train_df,
                     y_train,
                     feature_names=list(X_train_df.columns),
                     target_name='Price')
viz.view()


Output hidden; open in https://colab.research.google.com to view.

In [11]:
# Deeper tree with the main hyperparameters spelled out explicitly.
tree_params = {
    'max_depth': 5,
    'criterion': 'squared_error',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 6,
}
regr = DecisionTreeRegressor(**tree_params)
regr.fit(X_train, y_train)

# Compare the fit on the training data with the fit on unseen data.
train_r2 = regr.score(X_train, y_train)
print("Train data R2 was: {:.2f}".format(train_r2))
test_r2 = regr.score(X_test, y_test)
print("Test data R2 was: {:.2f}".format(test_r2))


Train data R2 was: 0.63
Test data R2 was: 0.60


In [13]:
# More constrained tree: larger minimum node sizes and only 3 of the 6 features
# considered at each split. Because that feature subset is drawn at random,
# random_state is fixed so the scores below are reproducible across runs.
regr = DecisionTreeRegressor(max_depth=5,
                             criterion='squared_error',
                             min_samples_split=10,
                             min_samples_leaf=10,
                             max_features=3,
                             random_state=1)
regr.fit(X_train, y_train)
print("Train data R2 was: {:.2f}".format(regr.score(X_train, y_train)))
print("test data R2 was: {:.2f}".format(regr.score(X_test, y_test)))

Train data R2 was: 0.59
test data R2 was: 0.55


## Cross validation

Here we're going to split the train set into several subsets called "folds". For each fold, we train one model using all the folds except that one, and then evaluate the model on the fold that was left out. This gives one score per fold.

In [14]:
# Unfitted depth-5 tree that will be evaluated with cross-validation.
regr = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=5,
    max_features=6,
    min_samples_leaf=1,
    min_samples_split=2,
)

In [15]:
from sklearn.model_selection import cross_validate

# K-fold cross-validation with K = cv = 5.
# We always do the CV on the TRAIN set; the test set stays untouched.
results = cross_validate(estimator=regr, X=X_train, y=y_train, cv=5)

In [16]:
# Raw cross-validation output: per-fold fit times, score times and test scores.
results

{'fit_time': array([0.03959012, 0.03486395, 0.03541851, 0.03382087, 0.03373885]),
 'score_time': array([0.00203896, 0.00202608, 0.00229979, 0.00206327, 0.00199413]),
 'test_score': array([0.61601063, 0.60485243, 0.59017325, 0.61577324, 0.60588712])}

In [17]:
# Per-fold R2 followed by its mean and spread across the folds.
fold_scores = results['test_score']
print(fold_scores)
print("The average R2 over the folds is: {:.2f}".format(fold_scores.mean()))
print("The standard deviation of R2 over the folds is: {:.2f}".format(fold_scores.std()))

[0.61601063 0.60485243 0.59017325 0.61577324 0.60588712]
The average R2 over the folds is: 0.61
The standard deviation of R2 over the folds is: 0.01


In [18]:
# Constrained tree for the second cross-validation run. Only 3 of the 6
# features are considered at each split and that subset is drawn at random,
# so random_state is fixed to make the fold scores reproducible.
regr = DecisionTreeRegressor(max_depth=5,
                             criterion='squared_error',
                             min_samples_split=10,
                             min_samples_leaf=10,
                             max_features=3,
                             random_state=1)

In [19]:
# 5-fold cross-validation of the constrained tree, again on the train set only.
results = cross_validate(regr, X_train, y_train, cv=5)
fold_scores = results['test_score']
print(fold_scores)
print("The average R2 over the folds is: {:.2f}".format(fold_scores.mean()))
print("The standard deviation of R2 over the folds is: {:.2f}".format(fold_scores.std()))

[0.61650065 0.54675083 0.56329394 0.51410817 0.54897468]
The average R2 over the folds is: 0.56
The standard deviation of R2 over the folds is: 0.03


In [20]:
# Fit the tree on the full training set so its fitted attributes can be inspected below.
regr.fit(X_train,y_train)

In [21]:
# Names of the features the estimator saw during fit.
regr.feature_names_in_

array(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population',
       'AveOccup'], dtype=object)

In [23]:
import numpy as np

# Pair every importance with its feature name and rank from most to least important.
feature_importances = sorted(
    zip(np.abs(regr.feature_importances_), X_train.columns),
    reverse=True,
)
feature_importances

[(0.5598055337212278, 'MedInc'),
 (0.22598479643098554, 'AveRooms'),
 (0.1603258178127658, 'AveOccup'),
 (0.045869750687372604, 'HouseAge'),
 (0.006594877936530913, 'AveBedrms'),
 (0.00141922341111751, 'Population')]

## Hyperparameter search

As we can see, Decision Trees have many hyperparameters to adjust. How can we find the best ones?

There are two possible strategies:

* Grid Search (a collection of pre-defined hyperparameters is tested)
* Random Search (a range of pre-defined hyperparameters is tested)

The first approach is more systematic but can be slower. The second one could be more successful.

Let's use each.

### Grid Search

In [24]:
from sklearn.model_selection import GridSearchCV
# Candidate values for each hyperparameter of the grid.
# 2 * 2 * 2 * 2 = 16 combinations; times 5 (the number of CV folds used for the search below) = 80 model fits
max_depth_choices = [3,5] # Possible values of max_depth to try
criterion_choices = ['squared_error','absolute_error'] # Possible split-quality criteria to optimize
min_samples_split_choices = [2,10] # Possible values of min_samples_split to try
min_samples_leaf_choices = [2,10] # Possible values of min_samples_leaf to try

In [25]:
# Build the grid: a mapping from hyperparameter name to the candidate values.
# The keys must match the hyperparameter names in the model's documentation.
grid = dict(
    max_depth=max_depth_choices,
    criterion=criterion_choices,
    min_samples_split=min_samples_split_choices,
    min_samples_leaf=min_samples_leaf_choices,
)

In [27]:
# Instantiate the grid search object.
#   estimator  -> the model to optimize
#   param_grid -> the dictionary of hyperparameters to explore
#   cv = 5     -> number of cross-validation folds
#                 (CV is REALLY important in grid search. Why?)
# Total work: 2 * 2 * 2 * 2 combinations * 5 folds = 80 model fits.
model = DecisionTreeRegressor()
grid_search = GridSearchCV(model, grid, cv=5)

In [28]:
# Fit the grid search on the training data (this runs the whole search).
grid_search.fit(X_train, y_train)

In [29]:
# And the winner is... the hyperparameter combination selected by the search.
grid_search.best_params_

{'criterion': 'squared_error',
 'max_depth': 5,
 'min_samples_leaf': 10,
 'min_samples_split': 2}

In [30]:
# In grid search you are more likely to get really good results on your
# training set, even with CV.
best_cv_r2 = grid_search.best_score_
print("The best R2 for the best hyperparameters is {:.2f}".format(best_cv_r2))

The best R2 for the best hyperparameters is 0.61


### Random Search

In [31]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the random search can draw from.
max_depth_choices = list(range(3, 6))                      # max_depth: 3, 4, 5
criterion_choices = ['squared_error', 'absolute_error']    # split-quality criteria
min_samples_split_choices = list(range(2, 11))             # min_samples_split: 2..10
min_samples_leaf_choices = list(range(2, 11))              # min_samples_leaf: 2..10
max_features_choices = list(range(2, 7))                   # max_features used by the decision tree: 2..6

# Mapping from hyperparameter name to its candidate values.
random_grid = dict(
    max_depth=max_depth_choices,
    criterion=criterion_choices,
    min_samples_split=min_samples_split_choices,
    min_samples_leaf=min_samples_leaf_choices,
    max_features=max_features_choices,
)

In [32]:
# An exhaustive grid search over random_grid would mean building
# 3 * 2 * 9 * 9 * 5 = 2430 combinations * 5 folds = 12150 models.
# A randomized search only evaluates a sample of those combinations
# (n_iter is left at its default here).
#
# random_state is fixed on both the estimator and the search so that the
# sampled combinations, and therefore the results, are reproducible.
# NOTE: the variable keeps the name `grid_search` because the following cells
# read it, even though it now holds a randomized search.
model = DecisionTreeRegressor(random_state=1)
grid_search = RandomizedSearchCV(estimator=model,
                                 param_distributions=random_grid,
                                 cv=5,
                                 random_state=1)
grid_search.fit(X_train, y_train)

In [33]:
# Best combination found; `grid_search` now holds the randomized search fitted in the previous cell.
grid_search.best_params_

{'min_samples_split': 4,
 'min_samples_leaf': 2,
 'max_features': 6,
 'max_depth': 5,
 'criterion': 'absolute_error'}

In [34]:
# Best score of the randomized search stored in `grid_search`.
print("The best R2 according to the random search is {:.2f}".format(grid_search.best_score_))

The best R2 according to the random search is 0.58


In [35]:
# And now more realistic: 25 sampled combinations, evaluated in parallel.
import os

# n_jobs = number of processors - 2, computed from the machine instead of being
# hard-coded. os.cpu_count() may return None and small machines have two cores
# or fewer, so at least one worker is always used.
n_workers = max(1, (os.cpu_count() or 1) - 2)

# random_state is fixed on both the estimator and the search so that the
# sampled combinations, and therefore the results, are reproducible.
model = DecisionTreeRegressor(random_state=1)
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=random_grid,
                                   n_iter=25,
                                   cv=5,
                                   n_jobs=n_workers,
                                   random_state=1)

In [36]:
# Run the randomized search on the training data.
random_search.fit(X_train,y_train)

In [37]:
# Hyperparameter combination selected by the randomized search.
random_search.best_params_

{'min_samples_split': 7,
 'min_samples_leaf': 7,
 'max_features': 6,
 'max_depth': 5,
 'criterion': 'squared_error'}

In [38]:
# Best score reached by the randomized search.
print("The best R2 according to the random search is {:.2f}".format(random_search.best_score_))

The best R2 according to the random search is 0.61
