### IRIS DATASET
**Problem statement**:  
Predict the species of flowers in the iris dataset

In [196]:
#load library with iris dataset
from sklearn.datasets import load_iris
#load sklearn random forest classifier library
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

In [197]:
#seed NumPy's global random generator so the random train/test flags are reproducible
np.random.seed(0)

#### Create the dataframe with all the iris data

In [198]:
#load the iris dataset bundled with scikit-learn
iris = load_iris()
#build a dataframe from the four measurement columns, named after iris.feature_names
df = pd.DataFrame(iris.data, columns=iris.feature_names)

## DATA EXPLORATION

In [199]:
#show the top 5 rows of the dataframe
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [200]:
#print the dataframe as plain text; the truncated view ends with its dimensions
print(df)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]


#### Add a new column "species" to the dataset

In [201]:
#add a "species" column: map each integer target code to its name in iris.target_names
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
#show the top 5 rows, now including the species
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


#### Create test and train data
Flag each row of the dataset as training or test data
* generate a random boolean for each row of the dataframe
* about 75% of the rows will be tagged True
* the remaining rows (about 25%) will be tagged False

This way roughly 75% of the dataset will be used for training, and the rest for evaluating the final model

In [202]:
#draw one random number per row from a uniform distribution between 0 and 1
#a row is flagged True (training) when its number is <= 0.75, otherwise False (test)
#so roughly 75% of the rows end up True and 25% False; the exact counts depend on the seed
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
#view top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [203]:
#Split the dataframe on the boolean "is_train" flag:
#flagged rows become the training set, all remaining rows become the test set
train = df[df['is_train']]
test = df[~df['is_train']]
print(f"Number of observations in the training data: {len(train)}")
print(f"Number of observations in the test data: {len(test)}")

Number of observations in the training data: 118
Number of observations in the test data: 32


#### Explore the features in the dataset

In [204]:
#the first four columns hold the measurements; use their names as the feature list
features = df.columns[:4]
#view features
print(features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


#### Finish preparation of the data
Convert each species name into an integer code.  
The training species must be encoded as integers so that they can be used by the algorithm.  
`pd.factorize` returns a pair `(codes, uniques)`; index 0 selects the array of integer codes.  
The array will be filled with 0, 1 and 2, which represent the three species of irises in the dataset.

In [205]:
#pd.factorize returns a (codes, uniques) pair; [0] keeps only the integer codes
#NOTE(review): the codes follow order of first appearance, so they line up with
#iris.target_names only while the rows stay grouped by species - confirm before shuffling
y = pd.factorize(train["species"])[0]
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2]


## TRAINING

##### Random forest classifier
Inits a random forest classifier with some parameters:

* n_jobs : number of jobs to run in parallel for fit and predict
* random_state : it's the seed used by the random number generator

##### Fit
Performs fitting on the model on X and y (features and species)

* features are the input samples
* species are the target values

In [223]:
# Create a random forest Classifier: 100 trees, 2 parallel jobs, fixed seed
clf = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0)

#Train the classifier on the training features and the encoded species
fitting = clf.fit(train[features], y)
#Accuracy is measured on the same rows used for fitting, so it is an optimistic figure
score = f"{clf.score(train[features], y) * 100}%"
print(f"This model has a training accuracy of {score}")

This model has a training accuracy of 100.0%


## EVALUATION AND RESULT EXPLORATION

* Apply the trained Classifier to the test portion of the dataset
* Explore the results

In [248]:
#predict the species code of every test observation
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [249]:
#show the predicted probabilities for test observations 10 to 29 (20 rows)
clf.predict_proba(test[features])[10:30]

array([[1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.67, 0.33],
       [0.  , 1.  , 0.  ],
       [0.  , 0.82, 0.18],
       [0.  , 0.03, 0.97],
       [0.  , 0.42, 0.58],
       [0.  , 0.99, 0.01],
       [0.  , 0.96, 0.04],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.01, 0.99],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.01, 0.99],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ]])

In [250]:
#map each predicted code to its species name
preds = iris.target_names[clf.predict(test[features])]

#view the predicted species for the first 25 observations
print(preds[0:25])

['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'versicolor' 'versicolor'
 'versicolor' 'virginica' 'virginica' 'versicolor' 'versicolor'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica']


In [251]:
#view the actual species of the first 25 test observations
print(test['species'][:25])

7          setosa
8          setosa
10         setosa
13         setosa
17         setosa
18         setosa
19         setosa
20         setosa
21         setosa
23         setosa
27         setosa
31         setosa
38         setosa
52     versicolor
66     versicolor
68     versicolor
70     versicolor
72     versicolor
89     versicolor
98     versicolor
103     virginica
109     virginica
111     virginica
114     virginica
116     virginica
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]


## CONFUSION MATRIX
Used to describe the performance of a classification model on a set of data for which the true values are known.  

It contains the number of correct and incorrect predictions

In [252]:
#cross-tabulate the actual species (rows) against the predicted species (columns)
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [253]:
def computeAccuracy(actual_species, predictions):
    """Return the percentage (0-100) of predictions that equal the actual label.

    Args:
        actual_species: sized iterable of true labels.
        predictions: sized iterable of predicted labels, in the same order.

    Returns:
        float: share of positions where both labels are equal, times 100.

    Raises:
        ValueError: if the two inputs differ in length or are empty.
    """
    total = len(actual_species)
    # zip() would silently drop the unmatched tail while the denominator still
    # counted it, so reject mismatched inputs instead of returning a wrong figure
    if total != len(predictions):
        raise ValueError(
            f"length mismatch: {total} actual labels vs {len(predictions)} predictions"
        )
    # accuracy over zero observations is undefined
    if total == 0:
        raise ValueError("cannot compute accuracy of an empty set of observations")
    ok_count = sum(
        1 for specie, pred in zip(actual_species, predictions) if specie == pred
    )
    return ok_count / total * 100

#score the test-set predictions against the actual species
accuracy = computeAccuracy(test['species'], preds)

print(f"This model has a validation accuracy of {accuracy}%")

This model has a validation accuracy of 93.75%


## DEPLOY

In [254]:
#a single hand-written observation: sepal length, sepal width, petal length, petal width (cm)
what_species = [[5.0, 3.6, 1.4, 2.0]]
#NOTE(review): this rebinds `preds`, replacing the test-set predictions computed earlier
preds = iris.target_names[clf.predict(what_species)]
#class probabilities for that observation, one column per species
probability = clf.predict_proba(what_species)

def show_probs(probabilities=None, class_names=None):
    """Build a one-row table of class probabilities formatted as whole percentages.

    Args:
        probabilities: 2-D sequence of class probabilities; only the first row
            is used. Defaults to the notebook-level ``probability``.
        class_names: sequence of class labels, one per probability column.
            Defaults to ``iris.target_names``.

    Returns:
        pandas.DataFrame: one column per class, single row holding strings
        such as ``"51%"``.
    """
    if probabilities is None:
        probabilities = probability
    if class_names is None:
        class_names = iris.target_names
    # round instead of truncating: 0.29 * 100 is 28.999999999999996 in floating
    # point, which int() alone would turn into 28
    prob_dict = {
        name: [f"{int(round(float(prob) * 100))}%"]
        for name, prob in zip(class_names, probabilities[0])
    }
    return pd.DataFrame.from_dict(prob_dict)
#display the probability table for the observation above
show_probs()

Unnamed: 0,setosa,versicolor,virginica
0,51%,19%,30%
