In [2]:
# load library with iris dataset
from sklearn.datasets import load_iris
# load scikit library's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so that the np.random.uniform draw used
# for the train/test split later in this notebook is reproducible across runs
np.random.seed(0)

In [3]:
# Load the iris dataset bundle (feature matrix, targets and their names)
iris = load_iris()

# Build a DataFrame with one named column per feature measurement
df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])

# Preview the first rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Show the integer class code stored for every sample in the dataset
print(iris.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [44]:
# Show the species names; the name at position i is the label for class code i
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [5]:
# The dataset already provides integer codes (iris.target) and their labels
# (iris.target_names), so pd.Categorical.from_codes can build the categorical
# column directly and skip the (computation intensive) factorization step.

# Attach the species name of every sample (the target) as a new column of df
df['species'] = pd.Categorical.from_codes(
    codes=iris['target'],
    categories=iris['target_names'],
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
# Flag rows for the training set: draw one uniform random number between 0 and 1
# per row; rows whose draw is <= 0.75 (roughly 75% of them) become training
# data, and the remaining rows are left for testing
df['is_train'] = np.random.uniform(low=0, high=1, size=len(df)) <= 0.75
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [9]:
# Split df into training rows (is_train is True) and test rows (is_train is False)
train = df[df['is_train']]
test = df[~df['is_train']]
print('Number of rows in train set: ', train.shape[0])
print('Number of rows in test set: ', test.shape[0])

Number of rows in train set:  118
Number of rows in test set:  32


In [10]:
# Take the first four column labels of df as the feature names: these are the
# measurement columns the DataFrame was built from, ahead of the 'species' and
# 'is_train' columns that were appended afterwards
features = df.columns[:4]
# View features (in a Jupyter notebook or interactive shell a bare expression is displayed, so print(features) is not needed)
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [11]:
# Encode each training row's species as an integer class label (machine readable format).
# The categorical's own codes are used instead of pd.factorize: factorize numbers
# labels by order of first appearance in the training rows, whereas the category
# codes always satisfy "code i <-> iris.target_names[i]", which is the mapping
# used later to turn predicted codes back into species names.
# np.intp keeps the same integer dtype that pd.factorize returned.
y = np.asarray(train['species'].cat.codes, dtype=np.intp)
# View target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2])

In [15]:
# Apply the trained classifier to the feature columns of the test rows
# NOTE(review): `clf` is not created in any cell visible here (execution counts
# jump from 11 to 15) - presumably a RandomForestClassifier fitted on
# train[features] and y; confirm that cell exists so Restart & Run All works
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2])

In [16]:
# View the class probabilities predicted by the random forest for 10 test observations (positions 10 to 19)
# Columns 1, 2 and 3 are for 'setosa', 'versicolor', 'virginica'
# In case of a tie (e.g. .5 and .5), the class that comes first is selected as the decision
clf.predict_proba(test[features])[10:20]

array([[ 1. ,  0. ,  0. ],
       [ 0.9,  0.1,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  0.5,  0.5],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  0.9,  0.1],
       [ 0. ,  0.2,  0.8],
       [ 0. ,  0.3,  0.7],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  0.8,  0.2]])

In [53]:
# Map every predicted class code to its species name by indexing iris.target_names with the codes
preds = iris.target_names[clf.predict(test[features])]
# View the species predicted by the forest for the first 5 observations
preds[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'],
      dtype='<U10')

In [54]:
# View the actual species of the first 5 test observations
test['species'].head()

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [55]:
# Create a confusion matrix: actual species as rows, predicted species as columns
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])
# Counts on the diagonal of the resulting matrix are correct predictions; off-diagonal counts are errors

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [56]:
# Now predict the species of new measurements using the trained model.
# Each inner list is one sample with four values - presumably in the same order
# as the columns in `features`; the same sample is passed twice here
preds = iris.target_names[clf.predict( [[5.0,3.6,1.4,2.0], [5.0,3.6,1.4,2.0]] )]
preds

array(['versicolor', 'versicolor'],
      dtype='<U10')

In [57]:
# note same species predicted for same input parameters

Use the pickle library to serialize a tried-and-tested model for later use.

In [60]:
import pickle
# Serialize the trained classifier into an in-memory bytes object
s = pickle.dumps(clf)
# Rebuild a separate model object from those bytes
clf2 = pickle.loads(s)
# Predict with the restored model on the same input that was given to clf
preds = iris.target_names[clf2.predict( [[5.0,3.6,1.4,2.0], [5.0,3.6,1.4,2.0]] )]
preds

array(['versicolor', 'versicolor'],
      dtype='<U10')

In [76]:
# You can also save the serialized model as a binary file.
# The `with` block closes the file handle even if pickling raises, and the same
# str path is used for writing and reading.
model_path = "Classifier.obj"
with open(model_path, "wb") as model_file:
    pickle.dump(clf, model_file)

# Deserialize the saved model stored as a binary file in the step above.
# WARNING: pickle.load can execute arbitrary code while loading - only unpickle
# files that you created yourself or otherwise fully trust.
with open(model_path, "rb") as model_file:
    clf3 = pickle.load(model_file)

# Predict with the model restored from disk on the same input as before
preds = iris.target_names[clf3.predict( [[5.0,3.6,1.4,2.0], [5.0,3.6,1.4,2.0]] )]
preds

array(['versicolor', 'versicolor'],
      dtype='<U10')