# Better Data Science | Hyperparameter Tuning with GridSearch

- Library imports
- You'll use the Iris dataset for training and tuning

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

iris = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


- Separate the dataset on features (X) and target (y)
- Make the train/test split

In [6]:
X = iris.drop('species', axis=1)
y = iris['species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

<br>

## Baseline model
- Model with default hyperparameters

In [7]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)

print(f'Accuracy = {round(accuracy_score(y_test, preds), 2)}')
print()
print(confusion_matrix(y_test, preds))

Accuracy = 0.97

[[17  0  0]
 [ 0  7  1]
 [ 0  0 13]]


<br>

## Manual hyperparameter optimization - Method #1
- Declare parameter dictionaries beforehand
- Train and evaluate multiple models
- Can become really tedious really fast
- Not scalable

In [8]:
# 3 sets of hyperparameters
params_1 = {'criterion': 'gini', 'splitter': 'best', 'max_depth': 10}
params_2 = {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 1000}
params_3 = {'criterion': 'gini', 'splitter': 'random', 'max_depth': 100}

# 3 separate models
model_1 = DecisionTreeClassifier(**params_1)
model_2 = DecisionTreeClassifier(**params_2)
model_3 = DecisionTreeClassifier(**params_3)

model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)

# 3 separate prediction sets
preds_1 = model_1.predict(X_test)
preds_2 = model_3.predict(X_test)
preds_3 = model_2.predict(X_test)

print(f'Accuracy on Model 1 = {round(accuracy_score(y_test, preds_1), 5)}')
print(f'Accuracy on Model 2 = {round(accuracy_score(y_test, preds_2), 5)}')
print(f'Accuracy on Model 3 = {round(accuracy_score(y_test, preds_3), 5)}')

Accuracy on Model 1 = 0.97368
Accuracy on Model 2 = 1.0
Accuracy on Model 3 = 0.94737


<br>

## Manual hyperparameter optimization - Method #2
- Better than the first method
- Still way too manual 
- Nested `for` loops don't look nice

In [9]:
# Define parameter possibilities as lists
p_criterion = ['gini', 'entropy']
p_splitter = ['best', 'random']
p_max_depth = [1, 10, 100, 1000]
# The scores will go here
results = []

# Nested loops - we need to test for all combinations
for criterion in p_criterion:
    for splitter in p_splitter:
        for max_depth in p_max_depth:
            # Train the model
            model = DecisionTreeClassifier(
                criterion=criterion,
                splitter=splitter,
                max_depth=max_depth
            )
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            # Append current results
            results.append({
                'Accuracy': round(accuracy_score(y_test, preds), 5),
                'P_Criterion': criterion,
                'P_Splitter': splitter,
                'P_MaxDepth': max_depth
            })
            
# Convert to Pandas DataFrame and sort descendingly by accuracy
results = pd.DataFrame(results)
results = results.sort_values(by='Accuracy', ascending=False)
results

Unnamed: 0,Accuracy,P_Criterion,P_Splitter,P_MaxDepth
7,1.0,gini,random,1000
1,0.97368,gini,best,10
2,0.97368,gini,best,100
3,0.97368,gini,best,1000
6,0.97368,gini,random,100
9,0.97368,entropy,best,10
10,0.97368,entropy,best,100
11,0.97368,entropy,best,1000
14,0.97368,entropy,random,100
15,0.97368,entropy,random,1000


<br>

## Go-to approach: `GridSearchCV`
- Define model and hyperparameter space beforehand
- Use `GridSearchCV` for optimization
- Also does the cross validation for you

In [10]:
model = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 10, 100, 1000]
}

clf = GridSearchCV(
    estimator=model, 
    param_grid=params, 
    cv=10,  # 10-fold cross validation
    n_jobs=-1  # run in parallel
)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 10, 100, 1000],
                         'splitter': ['best', 'random']})

- Convert best parameters array to a Pandas DataFrame:

In [11]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007927,0.001962,0.004545,0.004631,gini,1,best,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.666667,0.666667,...,0.636364,0.636364,0.636364,0.636364,0.727273,0.727273,0.727273,0.669697,0.039394,13
1,0.007427,0.002715,0.005617,0.002882,gini,1,random,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.666667,0.666667,...,0.636364,0.636364,0.636364,0.636364,0.545455,0.545455,0.727273,0.633333,0.051515,15
2,0.005825,0.001475,0.003713,0.002078,gini,10,best,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",1.0,0.833333,...,1.0,0.909091,0.909091,1.0,0.909091,0.909091,0.818182,0.928788,0.065713,4
3,0.004112,0.000698,0.00252,0.00027,gini,10,random,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",1.0,0.833333,...,1.0,0.909091,0.909091,1.0,1.0,0.727273,0.818182,0.919697,0.093413,11
4,0.003296,0.000345,0.00215,0.000477,gini,100,best,"{'criterion': 'gini', 'max_depth': 100, 'split...",0.916667,0.833333,...,1.0,0.909091,0.909091,1.0,1.0,0.909091,0.818182,0.929545,0.065525,3


- Keep only what matters
- Sort descendingly by average test score

In [12]:
cv_results = cv_results[['mean_test_score', 'param_criterion', 'param_splitter', 'param_max_depth']]
cv_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,param_criterion,param_splitter,param_max_depth
13,0.94697,entropy,random,100
5,0.936364,gini,random,100
4,0.929545,gini,best,100
2,0.928788,gini,best,10
6,0.928788,gini,best,1000
12,0.928788,entropy,best,100
14,0.928788,entropy,best,1000
11,0.92803,entropy,random,10
15,0.92803,entropy,random,1000
10,0.920455,entropy,best,10


- Get the best parameters

In [13]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 100, 'splitter': 'random'}