# 1. Generating Datasets

In [16]:
import pandas as pd
import numpy as np

In [17]:
path = "C:/Users/dell/Downloads/iris.csv"
df = pd.read_csv(path)

In [20]:
X.shape, Y.shape

((200, 10), (200,))

# 2. Data Split

A ratio of 80/20 is used for data splitting such that 80% goes to the training subset and 20% to the testing subset.

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X , Y, test_size=0.3)

In [41]:
X_train.shape , X_test.shape

((140, 10), (60, 10))

In [42]:
Y_train.shape, Y_test.shape

((140,), (60,))

# Building a simple machine learning model using Random Forest

In the following blocks of codes, we will first start with building a random forest model. Finally, we will explore how to tune the hyperparameters **(e.g. n_estimators and max_features)** of the random forest algorithm.

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_features=5, n_estimators=100)

The most important settings are the number of trees in the forest **(n_estimators)** and the number of features considered for splitting at each leaf node **(max_features)**.

In [50]:
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=5, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [51]:
rf.score(X_test, Y_test)

0.85

# Hyperparameter Tuning

Now we will be performing the tuning of hyperparameters of Random forest model. The hyperparameters that we will tune includes **max_features** and the **n_estimators**.

Firstly, we will import the necessary modules.

The **GridSearchCV()** function from scikit-learn will be used to perform the hyperparameter tuning. Particularly, **GridSearchCV()** function can perform the typical functions of a classifier such as **fit**, **score** and predict as well as **predict_proba, decision_function, transform** and **inverse_transform**.

Secondly, we define variables that are necessary input to the GridSearchCV() function.

In [53]:
from sklearn.model_selection import GridSearchCV

In [62]:
max_features_range = [1,2,3,4,5,'auto']
n_estimators_range = np.arange(10,210,10)
param_grid = dict(max_features=max_features_range, n_estimators=n_estimators_range)

rf = RandomForestClassifier()

grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)

its **arange** not **arrange**..Lol !

In [63]:
grid.fit(X_train, Y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [64]:
print('The best parameters are %s with accuracy score of %0.2f' %(grid.best_params_ , grid.best_score_))

The best parameters are {'max_features': 3, 'n_estimators': 70} with accuracy score of 0.91


In [65]:
grid

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

# DataFrame of Grid search parameters and their Accuracy scores

In [66]:
import pandas as pd

grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)
grid_results.head()

Unnamed: 0,max_features,n_estimators,Accuracy
0,1,10,0.792857
1,1,20,0.835714
2,1,30,0.807143
3,1,40,0.8
4,1,50,0.857143


In [67]:
grid_grouped = grid_results.groupby(['max_features','n_estimators']).mean()
grid_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
max_features,n_estimators,Unnamed: 2_level_1
1,10,0.792857
1,20,0.835714
1,30,0.807143
1,40,0.800000
1,50,0.857143
1,60,0.878571
1,70,0.857143
1,80,0.878571
1,90,0.857143
1,100,0.871429


In [68]:
grid_reset = grid_grouped.reset_index()
grid_reset.columns = ['max_features', 'n_estimators', 'Accuracy']
grid_pivot = grid_reset.pivot('max_features', 'n_estimators')
grid_pivot

Unnamed: 0_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy
n_estimators,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200
max_features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1,0.792857,0.835714,0.807143,0.8,0.857143,0.878571,0.857143,0.878571,0.857143,0.871429,0.878571,0.842857,0.864286,0.878571,0.842857,0.85,0.871429,0.85,0.864286,0.871429
2,0.842857,0.878571,0.864286,0.857143,0.878571,0.885714,0.892857,0.864286,0.878571,0.885714,0.878571,0.857143,0.885714,0.885714,0.878571,0.871429,0.885714,0.892857,0.871429,0.892857
3,0.878571,0.892857,0.878571,0.878571,0.878571,0.878571,0.907143,0.878571,0.878571,0.878571,0.9,0.871429,0.878571,0.9,0.892857,0.878571,0.892857,0.878571,0.871429,0.892857
4,0.85,0.857143,0.878571,0.885714,0.864286,0.878571,0.871429,0.892857,0.878571,0.885714,0.878571,0.892857,0.907143,0.885714,0.9,0.885714,0.892857,0.892857,0.885714,0.878571
5,0.878571,0.864286,0.878571,0.892857,0.885714,0.871429,0.878571,0.892857,0.9,0.885714,0.878571,0.878571,0.9,0.9,0.878571,0.892857,0.885714,0.9,0.885714,0.878571
auto,0.85,0.885714,0.892857,0.864286,0.878571,0.892857,0.878571,0.878571,0.878571,0.878571,0.892857,0.864286,0.885714,0.885714,0.885714,0.871429,0.885714,0.892857,0.892857,0.885714
