In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, KFold

## Random Forest Classifier

#### Training

In [3]:
# Load the dataset from 'Iris.csv'; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('Iris.csv')

In [4]:
# Show the loaded frame as the cell output to inspect its columns and rows.
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
# Seed the forest: bootstrap sampling and feature sub-sampling are random, so
# without a fixed random_state every run yields a different model and metrics.
model = RandomForestClassifier(random_state=42)

# Features: every column except the row identifier ('Id') and the target.
X = df.drop(columns=['Id', 'Species'])
# Target: the species label.
y = df['Species']

In [7]:
# Hold out 20% of the rows as the final test set; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Carve a validation set (20%) out of the remaining training rows.
# NOTE: this rebinds X_train / y_train, so running this cell a second time
# without re-running the previous split shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit on the training portion only; validation and test rows are not passed in.
model.fit(X=X_train, y=y_train)

RandomForestClassifier()

#### Model hyper-parameters

In [19]:
# Depth of every fitted tree in the forest. A comprehension keeps the loop
# variable local instead of leaving a stray global `x` next to the feature
# frame `X`.
trees_depth = [tree.get_depth() for tree in model.estimators_]
# Deepest tree in the ensemble.
np.max(trees_depth)

7

#### Validation

In [13]:
# Predicted species for the validation rows.
y_val_pred = model.predict(X=X_val)

In [15]:
# Confusion matrix on the validation split: rows are the true classes,
# columns are the predicted classes (labels in sorted order).
confusion_matrix(y_true=y_val, y_pred=y_val_pred)

array([[8, 0, 0],
       [0, 9, 2],
       [0, 0, 5]], dtype=int64)

In [23]:
# Fraction of validation rows whose predicted species matches the true one.
accuracy_score(y_true=y_val, y_pred=y_val_pred)

0.9166666666666666

In [24]:
# Weighted F1: per-class F1 averaged by each class's number of true samples,
# so uneven class counts in the validation split are accounted for.
f1_score(y_true=y_val, y_pred=y_val_pred, average='weighted')

0.9194444444444443

#### Cross-validation

In [30]:
# 5-fold splitter with shuffling. random_state is fixed because, with
# shuffle=True and no seed, every call to kf.split() produces a different
# partition -- the fold previewed below would then not be one of the folds
# used by the cross-validation loop, and results would change on each run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Preview the first fold as (train positions, test positions).
next(kf.split(X))

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,  28,
         30,  31,  33,  35,  36,  38,  39,  40,  41,  42,  43,  44,  45,
         46,  48,  49,  50,  51,  52,  53,  56,  57,  58,  60,  61,  62,
         63,  64,  65,  66,  67,  68,  69,  71,  72,  74,  76,  78,  79,
         81,  82,  84,  86,  87,  89,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 101, 102, 103, 104, 106, 108, 109, 111, 112, 113, 114,
        115, 116, 117, 118, 119, 120, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 134, 135, 136, 137, 138, 139, 140, 142, 144, 146,
        147, 148, 149]),
 array([  0,  10,  24,  29,  32,  34,  37,  47,  54,  55,  59,  70,  73,
         75,  77,  80,  83,  85,  88,  90, 100, 105, 107, 110, 121, 122,
        133, 141, 143, 145]))

In [32]:
# Out-of-fold evaluation: each row is predicted once, by a forest that was
# trained on the other folds only.
batch_true = []  # true labels of each fold's held-out rows
batch_pred = []  # predictions for the same rows, one Series per fold
for train_idx, test_idx in kf.split(X):
    # kf.split yields positional indices, hence .iloc.
    X_train_batch, y_train_batch = X.iloc[train_idx], y.iloc[train_idx]
    X_test_batch, y_test_batch = X.iloc[test_idx], y.iloc[test_idx]

    # Fresh model per fold, seeded so the run is repeatable.
    model_batch = RandomForestClassifier(random_state=42)
    model_batch.fit(X_train_batch, y_train_batch)

    batch_true.append(y_test_batch)
    # Label the predictions with the held-out rows' own index rather than the
    # positional test_idx: the two coincide only for a default 0..n-1 index,
    # and the later sort_index() alignment with the true labels relies on
    # both series sharing the same labels.
    batch_pred.append(
        pd.Series(model_batch.predict(X_test_batch), index=y_test_batch.index)
    )

In [40]:
# Stitch the per-fold predictions together and order them by index label.
y_pred = (
    pd.concat(batch_pred)
    .sort_index()
)
y_pred

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Length: 150, dtype: object

In [41]:
# Stitch the per-fold true labels together and order them by index label,
# matching the ordering applied to the predictions.
y_true = (
    pd.concat(batch_true)
    .sort_index()
)
y_true

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object

In [42]:
# Confusion matrix over all out-of-fold predictions (rows: true, columns: predicted).
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[50,  0,  0],
       [ 0, 46,  4],
       [ 0,  5, 45]], dtype=int64)

#### Test

In [44]:
# Predicted species for the held-out test rows, using the model fitted earlier.
y_test_pred = model.predict(X=X_test)

In [45]:
# Confusion matrix on the test split (rows: true, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  0, 11]], dtype=int64)

In [46]:
# Fraction of test rows classified correctly.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

1.0

In [47]:
# Support-weighted F1 on the test split.
f1_score(y_true=y_test, y_pred=y_test_pred, average='weighted')

1.0