In [19]:
from sklearn.datasets import load_iris

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score

import pandas as pd
import numpy as np

# Seed NumPy's global random generator so the random train/test mask drawn
# later with np.random.uniform is repeatable when the notebook is run top to bottom.
np.random.seed(0)

In [8]:
# Fetch the bundled iris dataset; its data, feature_names, target and
# target_names fields are all used further down.
iris = load_iris()

# One row per flower, one column per measurement, labelled with the feature names
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
 # Show the raw object returned by load_iris().
 # NOTE(review): this line is indented by one space; dedent it if the cell is ever exported to a plain .py script.
 iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [10]:
# Attach the label we want to predict: map each integer target code to its
# species name and store the result as a categorical column.
df["species"] = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# Alternative to train_test_split

In [12]:
# Random hold-out split: draw one Uniform(0, 1) number per row and flag the row
# for training when the draw is at most 0.75, so roughly three quarters of the
# rows land in the training set (the exact share varies with the draws).
df["is_train"] = np.random.uniform(low=0, high=1, size=len(df)) <= 0.75

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [13]:
# The is_train column is already boolean, so it is used directly as a row
# mask; ~ flips it to pick the held-out rows.
train = df[df["is_train"]]
test = df[~df["is_train"]]

In [14]:
# Show the rows flagged for training
train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True
7,5.0,3.4,1.5,0.2,setosa,True
...,...,...,...,...,...,...
141,6.9,3.1,5.1,2.3,virginica,True
143,6.8,3.2,5.9,2.3,virginica,True
144,6.7,3.3,5.7,2.5,virginica,True
145,6.7,3.0,5.2,2.3,virginica,True


In [15]:
# Show the held-out rows
test

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
1,4.9,3.0,1.4,0.2,setosa,False
5,5.4,3.9,1.7,0.4,setosa,False
6,4.6,3.4,1.4,0.3,setosa,False
13,4.3,3.0,1.1,0.1,setosa,False
14,5.8,4.0,1.2,0.2,setosa,False
15,5.7,4.4,1.5,0.4,setosa,False
24,4.8,3.4,1.9,0.2,setosa,False
27,5.2,3.5,1.5,0.2,setosa,False
34,4.9,3.1,1.5,0.2,setosa,False
43,5.0,3.5,1.6,0.6,setosa,False


In [16]:
# Row counts of the two subsets
print("Train size :", train.shape[0])
print("Test size :", test.shape[0])

Train size : 112
Test size : 38


In [17]:
# Predictor columns: the four measurement columns, taken by name from the
# dataset rather than by position, so that adding or reordering columns in df
# cannot silently change what the model is trained on.
features = pd.Index(iris.feature_names)
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

# Different method for encoding

In [18]:
# Integer-encode the training labels using the categorical column's own codes.
# These codes index into the column's categories (iris.target_names), which is
# the array used later to turn predictions back into names. pd.factorize would
# instead number the species by order of first appearance in `train`, which
# only matches that array while the rows happen to be sorted by species.
y = train["species"].cat.codes.to_numpy(dtype=np.intp)

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [21]:
# n_jobs=2 runs the work on two parallel workers (-1 would use every available core).
# random_state=0 pins the forest's internal randomness so refitting gives the same model.
clf = RandomForestClassifier(random_state=0, n_jobs=2)

clf.fit(X=train[features], y=y)

In [22]:
# Predict integer class labels for the held-out rows, which were not used in fitting
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [24]:
# Translate the predicted integer labels into species names by using them as
# positions in the dataset's target_names array (this relies on the label
# encoding following that array's order).
preds = iris["target_names"][clf.predict(X=test[features])]

In [25]:
# First five predicted species names
preds[:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

# New way of creating the confusion matrix

In [26]:
# Confusion matrix via cross-tabulation: rows are the true species,
# columns are the species predicted by the forest.
pd.crosstab(
    index=test["species"],
    columns=preds,
    rownames=["Actual Species"],
    colnames=["Predicted Species"],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,0
virginica,0,0,19


In [28]:
# Per-class precision / recall / F1 on the held-out rows, to three decimals
print(classification_report(y_true=test["species"], y_pred=preds, digits=3))

              precision    recall  f1-score   support

      setosa      1.000     1.000     1.000        10
  versicolor      1.000     1.000     1.000         9
   virginica      1.000     1.000     1.000        19

    accuracy                          1.000        38
   macro avg      1.000     1.000     1.000        38
weighted avg      1.000     1.000     1.000        38

