In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [3]:
#import data
from sklearn.datasets import load_iris

In [7]:
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
# adding a new column for the species name
df['species'] = pd.Categorical.from_codes(iris.target,iris.target_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [11]:
# creating test and train data
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,False
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,False
3,4.6,3.1,1.5,0.2,setosa,False
4,5.0,3.6,1.4,0.2,setosa,True


In [13]:
# creating dataframes with test rows and training rows
train, test = df[df['is_train']==True], df[df['is_train']==False]
print('number of observations in training data:', len(train))
print('number of observations in test data:', len(test))

number of observations in training data: 110
number of observations in test data: 40


In [15]:
# create a list of the feature column's names
features = df.columns[:4]
print(features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


In [16]:
# converting each species name into digits
y = pd.factorize(train['species'])[0]
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [17]:
# creating a random forest Classifier 
clf = RandomForestClassifier(n_jobs=2, random_state=0)

# Training the classifier
clf.fit(train[features], y)

RandomForestClassifier(n_jobs=2, random_state=0)

In [18]:
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [20]:
clf.predict_proba(test[features])[10:20]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.97, 0.03],
       [0.  , 0.99, 0.01],
       [0.  , 1.  , 0.  ],
       [0.  , 0.63, 0.37],
       [0.  , 1.  , 0.  ],
       [0.  , 0.02, 0.98],
       [0.  , 1.  , 0.  ]])

In [22]:
# maping names for the plants for each predict plant class
preds= iris.target_names[clf.predict(test[features])]
# View the predicted species for the first five observations
preds[0:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'virginica', 'versicolor', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'versicolor'], dtype='<U10')

In [23]:
test['species'].head()

0     setosa
2     setosa
3     setosa
7     setosa
19    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [25]:
# creating  confusion matrix
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,12,2
virginica,0,0,13


In [32]:
preds= iris.target_names[clf.predict( [[5.0, 3.6, 1.4, 2.0]] )]
preds

array(['setosa'], dtype='<U10')