##  <font color='blue'>Organizing and Exploring the Data</font>  

In [2]:
#Loading Library with Iris Dataset
from sklearn.datasets import load_iris

#Loading scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

#loading pandas
import pandas as pd

#Loading numpy
import numpy as np

#seeding numpy's global random generator so the random train/test flags are reproducible
np.random.seed(seed=0)

In [13]:
#loading the iris dataset into an object called iris
iris = load_iris()

#building a dataframe from the measurements, one column per feature name (4 in total)
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

#previewing the first 5 rows of the dataframe
df.head(n=5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


Species is the variable we are trying to predict; it is the target (Y).

In [26]:
#turning the integer targets into a categorical of species names
species_labels = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)

#attaching the species names to the dataframe as a new column
df['species'] = species_labels

#previewing the first 5 rows again, now including species
df.head(n=5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [25]:
#previewing the dataframe without the 'species' column
#drop() returns a new dataframe here, so df itself keeps 'species' for the
#train/test split, target encoding and confusion matrix cells further down,
#and re-running this cell does not raise a KeyError
df.drop(columns=['species']).head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [30]:
#printing the dataframe dimensions as (rows, columns), then the row count on its own line
for summary in (df.shape, len(df)):
    print(summary)

(150, 5)
150


## <font color='blue'>Creating Train And Test Data</font> 

In [31]:
#flagging each row as train (True) or test (False): every row gets its own
#uniform random draw between 0 and 1, and rows whose draw is <= .75 go to the training set,
#so roughly 75% of rows are True and 25% False (not an exact 75/25 split)
row_draws = np.random.uniform(low=0, high=1, size=len(df))
df['is_train'] = row_draws <= .75

df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [34]:
#counting how many rows landed on each side of the split (False = test, True = train)
df.groupby(by=["is_train"]).size()

is_train
False     32
True     118
dtype: int64

In [38]:
#splitting the rows into two dataframes using the is_train flag:
#flagged rows form the training set, the remaining rows form the test set
in_train = df['is_train']
train = df[in_train]
test = df[~in_train]

#reporting how many observations each set received
print('no of observations in training data is:', len(train))
print('no of observations in test data is :', len(test))

no of observations in training data is: 118
no of observations in test data is : 32


In [39]:
#the feature names are the first 4 column labels of the dataframe
feature_count = 4
features = df.columns[:feature_count]

#displaying the selected feature names
features


Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [41]:
#converting each species name into its integer category code
#the codes come from the categorical built with iris.target_names, so code i
#always means iris.target_names[i], whatever order the species appear in the
#training rows (pd.factorize would number labels by first appearance instead)
y = np.asarray(train['species'].cat.codes, dtype=np.intp)

#Viewing Target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

## <font color="blue"> Predicting The Data </font>

In [58]:
#building the (not yet trained) random forest classifier with a fixed random_state
clf = RandomForestClassifier(
    random_state=0,
    n_jobs=2,
)

In [62]:
#fitting the classifier on the training rows: feature columns as inputs, y as target
train_inputs = train[features]
clf.fit(train_inputs, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [61]:
#viewing the feature columns of the test rows
test.loc[:, features]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
10,5.4,3.7,1.5,0.2
13,4.3,3.0,1.1,0.1
17,5.1,3.5,1.4,0.3
18,5.7,3.8,1.7,0.3
19,5.1,3.8,1.5,0.3
20,5.4,3.4,1.7,0.2
21,5.1,3.7,1.5,0.4
23,5.1,3.3,1.7,0.5


In [63]:
#running the trained classifier on the test features to get predicted class codes
test_inputs = test[features]
clf.predict(test_inputs)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [64]:
#viewing the predicted class probabilities for the first 10 test observations
test_probabilities = clf.predict_proba(test[features])
test_probabilities[:10]

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0.9, 0.1, 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ]])

In [65]:
#translating each predicted class code into its species name
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]

In [67]:
#viewing the predicted species for the first 25 test observations
preds[:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica'], dtype='<U10')

In [68]:
#viewing the actual species of the first 5 test observations
test['species'].head(n=5)

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

## <font color='blue'> Understanding How Forest Works (Accuracy)</font>

In [69]:
#building the confusion matrix: rows are the actual species, columns the predicted ones
actual_species = test['species']
pd.crosstab(
    index=actual_species,
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [70]:
#viewing the actual species of every test observation
test.loc[:, 'species']

7          setosa
8          setosa
10         setosa
13         setosa
17         setosa
18         setosa
19         setosa
20         setosa
21         setosa
23         setosa
27         setosa
31         setosa
38         setosa
52     versicolor
66     versicolor
68     versicolor
70     versicolor
72     versicolor
89     versicolor
98     versicolor
103     virginica
109     virginica
111     virginica
114     virginica
116     virginica
118     virginica
122     virginica
140     virginica
143     virginica
144     virginica
147     virginica
149     virginica
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [71]:
#predicting the species of two hand-entered flowers (both rows hold the same measurements)
#NOTE: this overwrites preds, which previously held the test-set predictions
new_flowers = [
    [5.0, 3.6, 1.4, 2.0],
    [5.0, 3.6, 1.4, 2.0],
]
preds = iris.target_names[clf.predict(new_flowers)]

In [72]:
#viewing the species names currently stored in preds
preds

array(['versicolor', 'versicolor'], dtype='<U10')