# Using Scikit-Learn Classifier on Iris Data

- **KNN**

- **Random Forest**: http://scikit-learn.org/stable/modules/ensemble.html

## Preparation

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import random

In [2]:
# Load data

dat = pd.read_csv("data/iris.csv")
dat = np.array(dat)
print dat.shape

(150, 5)


In [3]:
data = dat[:, 0:4]
label = dat[:, 4]

def label_encoder(x):
    if x == "setosa":
        return 0
    if x == "versicolor":
        return 1
    else:
        return 2
    
label = np.array(map(label_encoder, label))

train_index = random.sample(range(50), 35) + random.sample(range(51, 100+1), 35) + random.sample(range(101, 150), 35)

train_data = data[train_index]
train_label = label[train_index]

test_data = data[list(set(range(150)) - set(train_index))]
test_label = label[list(set(range(150)) - set(train_index))]

## KNN Classifier

In [4]:
knn = KNeighborsClassifier(n_neighbors = 5, metric="euclidean")

knn.fit(train_data, train_label)

preds = knn.predict(test_data)

In [5]:
print preds
print test_label

print "Accuracy: %g" % (sum(preds == test_label)/float(len(preds)))

[0 0 0 2 0 2 2 0 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2
 2 2 2 2 2 2 2 2]
[0 0 0 2 0 2 2 0 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2
 2 2 2 2 2 2 2 2]
Accuracy: 0.977778


In [6]:
from sklearn.cross_validation import cross_val_score
print cross_val_score(knn, train_data, train_label, cv = 5)

[ 0.95454545  0.95238095  1.          0.85714286  0.95      ]


In [7]:
# Futher tune the model

from sklearn.grid_search import GridSearchCV

params = {"n_neighbors": np.arange(1,10),
         "metric": ["euclidean", "minkowski"]}

grid = GridSearchCV(estimator=knn,
                    param_grid=params)

grid.fit(train_data, train_label)

print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)
print(grid.best_estimator_)
grid.grid_scores_

0.961904761905
9
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')


[mean: 0.94286, std: 0.02469, params: {'n_neighbors': 1, 'metric': 'euclidean'},
 mean: 0.92381, std: 0.01365, params: {'n_neighbors': 2, 'metric': 'euclidean'},
 mean: 0.94286, std: 0.02439, params: {'n_neighbors': 3, 'metric': 'euclidean'},
 mean: 0.94286, std: 0.04049, params: {'n_neighbors': 4, 'metric': 'euclidean'},
 mean: 0.94286, std: 0.02439, params: {'n_neighbors': 5, 'metric': 'euclidean'},
 mean: 0.94286, std: 0.04049, params: {'n_neighbors': 6, 'metric': 'euclidean'},
 mean: 0.95238, std: 0.03497, params: {'n_neighbors': 7, 'metric': 'euclidean'},
 mean: 0.95238, std: 0.03497, params: {'n_neighbors': 8, 'metric': 'euclidean'},
 mean: 0.96190, std: 0.02699, params: {'n_neighbors': 9, 'metric': 'euclidean'},
 mean: 0.94286, std: 0.02469, params: {'n_neighbors': 1, 'metric': 'minkowski'},
 mean: 0.92381, std: 0.01365, params: {'n_neighbors': 2, 'metric': 'minkowski'},
 mean: 0.94286, std: 0.02439, params: {'n_neighbors': 3, 'metric': 'minkowski'},
 mean: 0.94286, std: 0.04049

## Random Forest


### Key Parameter
- **n_jobs**: (parallel computation) If n_jobs=k then computations are partitioned into k jobs, and run on k cores of the machine. If n_jobs=-1 then all cores available on the machine are used. Note that because of inter-process communication overhead, the speedup might not be linear (i.e., using k jobs will unfortunately not be k times as fast). Significant speedup can still be achieved though when building a large number of trees, or when building a single tree requires a fair amount of time (e.g., on large datasets).

- **n_estimators**:  the number of trees in the forest. The larger the better, but also the longer it will take to compute. In addition, note that results will stop getting significantly better beyond a critical number of trees.

- **max_features**: the size of the random subsets of features to consider when splitting a node. The lower the greater the reduction of variance, but also the greater the increase in bias. 

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, n_jobs=4)
rf.fit(train_data, train_label)

preds_rf = rf.predict(test_data)

print "Random Forest Accuracy: %f" % (sum(preds_rf == test_label)/float(len(test_label)))

Random Forest Accuracy: 0.977778


### Importance of Features

In [9]:
rf.feature_importances_

array([ 0.09761503,  0.02508047,  0.49547878,  0.38182572])

### Futher Tune the Model

In [10]:
from sklearn.grid_search import GridSearchCV

params = {"n_estimators": np.arange(1,25)}

grid = GridSearchCV(estimator=rf,
                    param_grid=params)

grid.fit(train_data, train_label)

print(grid.best_score_)
print(grid.best_estimator_)
grid.grid_scores_

0.971428571429
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


[mean: 0.97143, std: 0.02403, params: {'n_estimators': 1},
 mean: 0.91429, std: 0.04842, params: {'n_estimators': 2},
 mean: 0.92381, std: 0.05071, params: {'n_estimators': 3},
 mean: 0.94286, std: 0.02439, params: {'n_estimators': 4},
 mean: 0.94286, std: 0.02240, params: {'n_estimators': 5},
 mean: 0.94286, std: 0.02240, params: {'n_estimators': 6},
 mean: 0.94286, std: 0.02439, params: {'n_estimators': 7},
 mean: 0.93333, std: 0.02704, params: {'n_estimators': 8},
 mean: 0.94286, std: 0.00133, params: {'n_estimators': 9},
 mean: 0.95238, std: 0.02831, params: {'n_estimators': 10},
 mean: 0.93333, std: 0.02704, params: {'n_estimators': 11},
 mean: 0.95238, std: 0.01356, params: {'n_estimators': 12},
 mean: 0.96190, std: 0.01445, params: {'n_estimators': 13},
 mean: 0.94286, std: 0.02240, params: {'n_estimators': 14},
 mean: 0.96190, std: 0.01445, params: {'n_estimators': 15},
 mean: 0.95238, std: 0.01356, params: {'n_estimators': 16},
 mean: 0.95238, std: 0.01356, params: {'n_estimat