In [1]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from time import time

In [2]:
# Create an object called iris with the iris data
# iris is a sklearn bunch, which is like a dictionary. The keys correspond to different parts of the bunch
# Ex: data includes the data. target includes the ys. target_names include the names that correspond to the target #s
iris = load_iris()

# Create a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns=iris.feature_names)

df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# Add a new column with the species names, this is what we are going to try to predict
# Categorical.from_codes matches integers that point to code names
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Train test split

In [4]:
# Train test split
Xtrain, Xtest, ytrain, ytest = train_test_split(df.iloc[:, 0:4], df.species)

In [5]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(Xtrain))
print('Number of observations in the test data:',len(Xtest))

Number of observations in the training data: 112
Number of observations in the test data: 38


In [6]:
# ytrain contains the actual species names. Before we can use it,
# we need to convert each species name into a digit. So, in this case there
# are three species, which have been coded as 0, 1, or 2.
#ytrain = pd.factorize(ytrain)[0]
#ytest = pd.factorize(ytest)[0]
ytrain, ytest

([versicolor, setosa, setosa, setosa, setosa, ..., virginica, virginica, versicolor, versicolor, versicolor]
 Length: 112
 Categories (3, object): [setosa, versicolor, virginica],
 [virginica, setosa, versicolor, versicolor, setosa, ..., versicolor, virginica, versicolor, versicolor, setosa]
 Length: 38
 Categories (3, object): [setosa, versicolor, virginica])

In [7]:
# Create a random forest classifier
clf = RandomForestClassifier(n_jobs=2)

# Train the classifier to take the training features and learn how they relate to the y (species)
clf.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [8]:
# View the predicted probabilities of the first 10 observations
clf.predict_proba(Xtest)[0:10]

array([[ 0. ,  0. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0.9,  0.1,  0. ],
       [ 0.1,  0.9,  0. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ]])

In [9]:
# Compare predicted values vs actuals
print('1st 5 predicted: ', clf.predict(Xtest)[0:5])
print('1st 5 actual: ', ytest[0:5])

1st 5 predicted:  ['virginica' 'setosa' 'versicolor' 'versicolor' 'setosa']
1st 5 actual:  [virginica, setosa, versicolor, versicolor, setosa]
Categories (3, object): [setosa, versicolor, virginica]


In [10]:
# Accuracy score
print(accuracy_score(clf.predict(Xtest), ytest))

0.921052631579


In [11]:
# Create confusion matrix
pd.crosstab(ytest, clf.predict(Xtest), rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,12,1
virginica,0,2,10


In [12]:
# View a list of the features and their importance scores
list(zip(Xtrain, clf.feature_importances_))

[('sepal_length', 0.1267400796640181),
 ('sepal_width', 0.05930328589296012),
 ('petal_length', 0.46407790528306647),
 ('petal_width', 0.34987872915995533)]

### K fold cross validation

In [13]:
# Create new X and y datasets
X2 = df.iloc[:, 0:4]
y2 = df.species

# Create a 2nd random forest classifier
clf2 = RandomForestClassifier(n_jobs=2)

# Split data into training data for cross validation and test data
X2train, X2test, y2train, y2test = train_test_split(X2, y2)

accuracy = []
i = 1

# Cross validate on the training set
for train, test in KFold(5).split(X2train):
    clf2.fit(X2train.iloc[train], y2train[train]) # y2 is Categorical, not a Dataframe, so we don't use iloc
    score = accuracy_score(clf2.predict(X2train.iloc[test]), y2train[test])
    print('Accuracy score for pass %i: %.2f' % (i, score))
    accuracy.append(score)
    i += 1

print('Avg accuracy score: %.2f' % np.average(accuracy))

Accuracy score for pass 1: 1.00
Accuracy score for pass 2: 1.00
Accuracy score for pass 3: 1.00
Accuracy score for pass 4: 0.95
Accuracy score for pass 5: 0.95
Avg accuracy score: 0.98


In [14]:
print('Accuracy on the test set: %.2f' % accuracy_score(clf2.predict(X2test), y2test))

Accuracy on the test set: 0.89


### Gridsearch

In [15]:
# Create new X and y datasets
X3 = df.iloc[:, 0:4]
y3 = pd.factorize(df.species)[0]

# Split data into training data for cross validation and test data
X3train, X3test, y3train, y3test = train_test_split(X3, y3)

In [16]:
# Use grid search to optimize the number of random forest trees
parameters = {'n_estimators': [10, 20, 50, 100],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 3, 10],
              'bootstrap': [True, False],
              'criterion': ["gini", "entropy"]}
clf3 = GridSearchCV(RandomForestClassifier(n_jobs=2), parameters, scoring='accuracy', cv=5)
start = time()
clf3.fit(X3train, y3train)
print('It took %.2f seconds to find the optimal model.' % (time() - start))

It took 277.10 seconds to find the optimal model.


In [17]:
clf3.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [18]:
print('Accuracy on the test set: %.2f' % accuracy_score(clf3.predict(X3test), y3test))

Accuracy on the test set: 0.95


In [21]:
pd.crosstab(y3test, clf3.predict(X3test), rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,0,1,2
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,11,0,0
1,0,13,1
2,0,1,12
