# Classification Hyperparameter Tuning

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
# Print the current timestamp so the saved output shows when the notebook was run.
import datetime
print(datetime.datetime.now())

2020-10-06 09:21:29.594957


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Report the installed scikit-learn version next to the results it produced.
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.23.1.


In [4]:
# Create the 'out' directory to store output images.
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
import os
os.makedirs('out', exist_ok=True)

# Read Data


In [5]:
# Load the data set and take a first look at its schema and leading rows.
df = pd.read_csv("../data/generated_marketing.csv")
df.info()
df.head()

# Feature matrix (selected columns) and target vector as plain NumPy arrays.
feature_names = ['Age', 'Income']
X = df.loc[:, feature_names].to_numpy()
y = df.loc[:, 'Bought'].to_numpy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     500 non-null    float64
 1   Income  500 non-null    float64
 2   Bought  500 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 11.8 KB


Unnamed: 0,Age,Income,Bought
0,0.748126,1.18589,0
1,-1.64576,-2.933332,1
2,0.279829,0.835401,0
3,0.195853,1.249209,0
4,-1.357053,-1.409123,1


# Splitting the Data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: show the container types returned by the split.
type(X_train)
type(y_train)

numpy.ndarray

numpy.ndarray

# Decision Trees

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with hand-picked hyperparameters (entropy criterion, depth and
# leaf-count limits), fitted on the training split before any tuning.
clf = DecisionTreeClassifier(random_state=42, criterion="entropy",
                             min_samples_split=10, min_samples_leaf=10, max_depth=3, max_leaf_nodes=5)
clf.fit(X_train, y_train)

# Predictions of the baseline tree on the held-out test split.
y_pred_dt = clf.predict(X_test)

DecisionTreeClassifier(criterion='entropy', max_depth=3, max_leaf_nodes=5,
                       min_samples_leaf=10, min_samples_split=10,
                       random_state=42)

In [8]:
# String form of each class label learned by the fitted classifier.
class_names = list(map(str, clf.classes_))

## Hyperparameter Tuning

### Grid Search

In [9]:
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeClassifier(splitter='best', class_weight=None, random_state=42)

params = {'criterion': ('gini', 'entropy'), 
              'max_depth': [2, 10, 20], 
              'min_samples_leaf': [1, 5, 10],
              'max_features':[None, 'auto'], 
              'max_leaf_nodes':[None, 10, 50]}

gridsearch = GridSearchCV(clf, params, scoring='roc_auc', cv=5, return_train_score=True)

%time gridsearch.fit(X, y)

Wall time: 1.81 s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': [2, 10, 20],
                         'max_features': [None, 'auto'],
                         'max_leaf_nodes': [None, 10, 50],
                         'min_samples_leaf': [1, 5, 10]},
             return_train_score=True, scoring='roc_auc')

In [10]:
# Best hyperparameter combination, its cross-validated score, and the corresponding estimator.
gridsearch.best_params_
gridsearch.best_score_
gridsearch.best_estimator_

{'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': 50,
 'min_samples_leaf': 5}

0.9391163345338136

DecisionTreeClassifier(max_depth=20, max_features='auto', max_leaf_nodes=50,
                       min_samples_leaf=5, random_state=42)

In [11]:
# Print out the results of hyperparameter tuning in a nice table.

def cv_results_to_df(cv_results):
    """Turn a search's ``cv_results_`` mapping into a tidy, ranked DataFrame.

    Parameters
    ----------
    cv_results : dict
        Mapping as exposed by ``GridSearchCV.cv_results_`` or
        ``RandomizedSearchCV.cv_results_``. It must contain ``'params'``
        (one dict per candidate) and the timing / test-score entries listed
        below. The train-score entries are optional because they are only
        produced when the search was built with ``return_train_score=True``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate: the hyperparameter values followed by the
        metric columns, sorted by ``mean_test_score`` in descending order.
        The index keeps each candidate's original position.

    Raises
    ------
    KeyError
        If ``'params'`` or any required metric entry is missing.
    """
    # (column name, required?) in the order the columns appear in the table.
    metric_columns = [
        ('mean_fit_time', True),
        ('mean_score_time', True),
        ('mean_train_score', False),
        ('std_train_score', False),
        ('mean_test_score', True),
        ('std_test_score', True),
        ('rank_test_score', True),
    ]

    results = pd.DataFrame(list(cv_results['params']))
    for column, required in metric_columns:
        # Skip only the optional train-score columns; a missing required
        # column still raises KeyError so malformed input is not hidden.
        if required or column in cv_results:
            results[column] = cv_results[column]

    return results.sort_values(['mean_test_score'], ascending=False)

# Show every grid-search candidate, best mean test score first.
cv_results_to_df(gridsearch.cv_results_)

Unnamed: 0,criterion,max_depth,max_features,max_leaf_nodes,min_samples_leaf,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
52,gini,20,auto,50.0,5,0.000800,0.001000,0.986342,0.001037,0.939116,0.017479,1
26,gini,10,,50.0,10,0.001214,0.000812,0.979327,0.002067,0.938637,0.014886,2
44,gini,20,,50.0,10,0.000807,0.000991,0.979327,0.002067,0.938637,0.014886,2
20,gini,10,,,10,0.000800,0.001399,0.979327,0.002067,0.938557,0.014907,4
38,gini,20,,,10,0.001402,0.001007,0.979327,0.002067,0.938557,0.014907,4
...,...,...,...,...,...,...,...,...,...,...,...,...
18,gini,10,,,1,0.000800,0.001000,0.998937,0.001281,0.868551,0.016339,104
36,gini,20,,,1,0.001199,0.001201,1.000000,0.000000,0.868271,0.015107,105
33,gini,10,auto,50.0,1,0.000792,0.000997,0.997142,0.001937,0.864549,0.016555,106
51,gini,20,auto,50.0,1,0.000999,0.001001,0.998952,0.000841,0.855110,0.021121,107


### Random Search

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

params_random = {"criterion": ["gini", "entropy"],
               "max_depth": sp_randint(2, 21),
               "min_samples_leaf": sp_randint(1, 11),
               "max_features": sp_randint(1, 3),
               "max_leaf_nodes": sp_randint(10, 100),
              }

randomsearch = RandomizedSearchCV(clf, param_distributions=params_random, 
                                  cv=5, return_train_score=True, n_iter=500);
%time randomsearch.fit(X_train, y_train);

Wall time: 4.53 s


RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=500,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000240045FC940>,
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000240045FCDA0>,
                                        'max_leaf_nodes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002407F5D0EB8>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000240045FCE10>},
                   return_train_score=True)

In [13]:
# Show every random-search candidate, best mean test score first.
cv_results_to_df(randomsearch.cv_results_)

Unnamed: 0,criterion,max_depth,max_features,max_leaf_nodes,min_samples_leaf,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
352,gini,4,2,99,7,0.001000,0.000000,0.923125,0.012119,0.9075,0.028062,1
148,gini,4,2,71,7,0.000400,0.000407,0.923125,0.012119,0.9075,0.028062,1
0,gini,10,2,73,10,0.000400,0.000400,0.916250,0.008705,0.9075,0.029155,3
162,gini,20,1,40,8,0.000994,0.000000,0.923125,0.012437,0.9075,0.035882,3
252,gini,8,2,54,9,0.001000,0.000199,0.916250,0.008705,0.9075,0.029155,3
...,...,...,...,...,...,...,...,...,...,...,...,...
23,gini,15,1,75,2,0.000800,0.000000,0.948125,0.007552,0.8500,0.023717,492
99,gini,11,1,45,2,0.000600,0.000200,0.946875,0.008615,0.8500,0.023717,492
346,gini,15,1,97,2,0.000600,0.000400,0.948125,0.007552,0.8500,0.023717,492
18,entropy,14,2,82,1,0.001199,0.000600,0.997500,0.001250,0.8500,0.054772,492
