# 1. Hyperparameter tuning using GridSearchCV

In [111]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [112]:
df = pd.read_csv('heart-disease.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [113]:
# Check for missing values
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [114]:
# Check for categorical features
# b) check for categorical values
# get all categorical columns in the dataframe

# you need not do this fr the exams
catCols = [col for col in df.columns if df[col].dtype == '0']
catCols

[]

In [115]:
# Features and Target
X = df.drop(['target'], axis=1)
y = df['target']

In [116]:
# Split it into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25)

In [117]:
# Build a model
from sklearn import tree
classifier = DecisionTreeClassifier()

In [118]:
# Training of a model
classifier.fit(X_train, y_train)

DecisionTreeClassifier()

In [119]:
# Testing to make predictions
pred = classifier.predict(X_test)

In [120]:
# Calculate accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.8289473684210527

## Tuning with grid search CV

In [121]:
param_dict = {
    'criterion':['gini', 'entropy'],
    'max_depth': range(1,10), 
    'min_samples_split':range(2,10),
    'min_samples_leaf': range(1, 20)
}

# to find the parameters -> press shift + tab on say DecisionTreeClassifier() or use intellisense

In [122]:
grid = GridSearchCV(classifier, 
                    param_grid = param_dict, 
                    cv = 10, 
                    n_jobs = 1)
# here again you can press shift + tab to see the parameters.
# cv = 10 fold cross validation
# n_jobs -> multithreading

grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 10),
                         'min_samples_leaf': range(1, 20),
                         'min_samples_split': range(2, 10)})

In [123]:
# finding the best hyperparameter
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 3,
 'min_samples_leaf': 13,
 'min_samples_split': 3}

In [124]:
# you don't need to print this one
grid.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=13,
                       min_samples_split=3)

In [125]:
grid.best_score_

0.784387351778656

# RandomSearchCV

In [126]:
from sklearn.model_selection import RandomizedSearchCV
param_dict = {
    'criterion':['gini', 'entropy'],
    'max_depth': range(1,10), 
    'min_samples_split':range(2,10),
    'min_samples_leaf': range(1, 20)
}

In [127]:
randomcv = RandomizedSearchCV(classifier, 
                            param_distributions=param_dict)
# in grid search, we had param_grid instead
randomcv.fit(X_train, y_train)

RandomizedSearchCV(estimator=DecisionTreeClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': range(1, 10),
                                        'min_samples_leaf': range(1, 20),
                                        'min_samples_split': range(2, 10)})

In [128]:
# finding the best hyperparameter
randomcv.best_params_

{'min_samples_split': 6,
 'min_samples_leaf': 2,
 'max_depth': 3,
 'criterion': 'entropy'}

In [129]:
randomcv.best_score_

0.7620289855072464