# Classification: Iris dataset

Also included in scikit-learn is the well-known Iris dataset.

## Load and inspect the data

In [None]:
# the usual imports
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()
iris.keys()

The data are botanical features ...

In [None]:
iris.feature_names

... for 3 species of Iris flower:

In [None]:
iris.target_names

In [None]:
X = iris.data
y = iris.target
X.shape, y.shape

Again, for better inspection, we build a pandas DataFrame:

In [None]:
# build a DataFrame with column names = iris.feature_names and a "target" column containing the class labels
# Exercise cell: both right-hand sides below are intentionally left blank.
# Later cells index the frame with df[iris.feature_names], so the feature
# columns must carry exactly those names.
### fill in missing code
df = 
df['target'] = 
# bare name as the last expression so the notebook displays the frame
df

Let's look at the intercorrelations of the features.

In [None]:
df[iris.feature_names].corr()

In [None]:
import seaborn as sns

# Draw the feature correlation matrix as an annotated heatmap.
feature_frame = df[iris.feature_names]
tick_labels = feature_frame.columns

plt.figure()
# np.corrcoef treats each row as one variable, hence the transpose of the
# (samples x features) value matrix.
coefs = np.corrcoef(feature_frame.values.T)
sns.set(style='whitegrid')
hm = sns.heatmap(
    coefs,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()
# Undo the seaborn styling so that later matplotlib plots keep their
# original look.
sns.reset_orig()

Overall, the predictors are strongly correlated. For classification, we will use just two features, namely the nearly uncorrelated sepal length and sepal width (features 0 and 1).

In [None]:
### fill in missing code
# just use features 0 and 1
X = X[:,:2]

## Cross validation

Normally we would split the data into training and test sets. In this case, as we don't have enough data to meaningfully do that, we will use cross validation to average over iteratively performed train-test splits.

## Logistic Regression

Let's first use logistic regression on the data set.

In [None]:
from sklearn import linear_model
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# Newer releases provide StratifiedKFold in sklearn.model_selection, with a
# different call signature. Later cells reuse this import.
from sklearn.cross_validation import StratifiedKFold
# 10 stratified folds over the labels y; iterating skf yields one
# (train_index, test_index) pair per fold.
skf = StratifiedKFold(y = y, n_folds = 10)
# Per-fold accuracies, appended in fold order.
accuracies_train = []
accuracies_test = []

for train_index, test_index in skf:
    # Select this fold's training rows and held-out rows by index.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Exercise placeholder: the right-hand side is intentionally left blank.
    # The object bound here must provide .score(X, y), which is called below.
    ### fill in missing code
    logistic_model = 
    
    accuracies_train.append(logistic_model.score(X_train, y_train))
    # same for test
    # Exercise placeholder: as written, the next line appends nothing.
    ### fill in missing code
    accuracies_test.append

# After the loop, logistic_model refers to the model from the last fold;
# later cells inspect that model.
# Report each fold's accuracy rounded to 2 decimals, then the mean over folds.
# NOTE: Python 2 print statements; map() returns a list there.
print 'training accuracies: {}\n'.format(map(lambda x: round(x,2), accuracies_train))
print 'average training accuracy: {}\n'.format(round(np.mean(accuracies_train),2))
print 'test accuracies: {}\n'.format(map(lambda x: round(x,2), accuracies_test))
print 'average test accuracy: {}\n'.format(round(np.mean(accuracies_test),2))

Let's inspect the predictions from the last of these models:

In [None]:
# look at model predictions
### fill in missing code
y_predicted = 
y_predicted

In [None]:
y_predicted == y

In [None]:
# from https://github.com/rasbt/python-machine-learning-book
from matplotlib.colors import ListedColormap
def plot_decision_regions(title, X, y, classifier, resolution=0.02):
    """Plot a fitted classifier's decision regions over two features.

    The classifier is evaluated on a regular grid that spans the range of
    the first two columns of ``X`` (padded by 1 on every side). The
    predicted class of every grid point is drawn as a filled contour, and
    the samples are scattered on top with one marker and colour per class.

    Parameters
    ----------
    title : str
        Title of the figure.
    X : 2-D array
        Feature matrix; columns 0 and 1 become the plot axes. The
        classifier is called with exactly these two features.
    y : 1-D array
        Class label for each row of ``X``. At most 5 distinct labels are
        supported, because only 5 markers and colours are defined.
    classifier : object
        Fitted model whose ``predict`` method accepts a two-column array
        and returns one label per row.
    resolution : float, optional
        Step size of the evaluation grid along both axes.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    plt.figure()
    plt.title(title)

    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('cyan', 'red', 'orange', 'gray', 'blue')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    # One row per grid point, in the same column order as X.
    grid_points = np.array([xx1.ravel(), xx2.ravel()]).T
    Z = classifier.predict(grid_points).reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot the samples, one scatter call per class
    for idx, cl in enumerate(classes):
        members = y == cl
        # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be
        # read as per-point values to colormap when a class happens to
        # have exactly 4 samples.
        plt.scatter(x=X[members, 0], y=X[members, 1],
                    alpha=0.8, color=cmap(idx),
                    marker=markers[idx], label=cl)

    # The figure is shown inside this function, so the legend has to be
    # added here for the per-class labels to be visible at all.
    plt.legend()
    plt.show()


Let's inspect the decision boundaries of the logistic regression model (again, looking at the last one from above, as they behaved very similarly):

In [None]:
plot_decision_regions('Decision regions', X, y, logistic_model)

We see that one class is correctly identified in general, while the other two are not linearly separable. 
We can confirm this by looking at the confusion matrix: 

In [None]:
from sklearn import metrics

# Confusion matrix of the logistic model over the complete data set:
# y is passed as the true labels, the model's predictions as the second
# argument.
logistic_predictions = logistic_model.predict(X)
metrics.confusion_matrix(y, logistic_predictions)

## Support Vector Machine

Next, let's try a nonlinear classifier: a support vector machine (with the scikit-learn default RBF kernel).

In [None]:
from sklearn import svm
# 10 stratified folds over the labels y; iterating skf yields one
# (train_index, test_index) pair per fold.
skf = StratifiedKFold(y = y, n_folds = 10)
# Per-fold accuracies, appended in fold order (the lists are reset here).
accuracies_train = []
accuracies_test = []

for train_index, test_index in skf:
    # Select this fold's training rows and held-out rows by index.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Exercise placeholder: the right-hand side is intentionally left blank.
    # The object bound here must provide .score(X, y), which is called below.
    ### fill in missing code
    svc_model = 
    
    accuracies_train.append(svc_model.score(X_train, y_train))
    # Exercise placeholder: as written, the last line of the loop appends
    # nothing.
    ### fill in missing code
    # same for test
    accuracies_test.append

# After the loop, svc_model refers to the model from the last fold.
# Report each fold's accuracy rounded to 2 decimals, then the mean over folds.
# NOTE: Python 2 print statements; map() returns a list there.
print 'training accuracies: {}\n'.format(map(lambda x: round(x,2), accuracies_train))
print 'average training accuracy: {}\n'.format(round(np.mean(accuracies_train),2))
print 'test accuracies: {}\n'.format(map(lambda x: round(x,2), accuracies_test))
print 'average test accuracy: {}\n'.format(round(np.mean(accuracies_test),2))


We see SVM performs better than logistic regression on both test and training sets.
How do its decision boundaries look?

In [None]:
plot_decision_regions('Decision regions', X, y, svc_model)

Again, look at the confusion matrix:

In [None]:
### fill in missing code


## Decision Tree

Next, try another nonlinear classifier, decision tree.

In [None]:
from sklearn import tree
# 10 stratified folds over the labels y; iterating skf yields one
# (train_index, test_index) pair per fold.
skf = StratifiedKFold(y = y, n_folds = 10)
# Per-fold accuracies, appended in fold order (the lists are reset here).
accuracies_train = []
accuracies_test = []

for train_index, test_index in skf:
    # Select this fold's training rows and held-out rows by index.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Exercise placeholder: the right-hand side is intentionally left blank.
    # The object bound here must provide .score(X, y), which is called below.
    ### fill in missing code
    tree_model = 
    
    accuracies_train.append(tree_model.score(X_train, y_train))
    ### fill in missing code
    # same for test
    accuracies_test.append(tree_model.score(X_test, y_test))

# After the loop, tree_model refers to the model from the last fold;
# later cells inspect that model.
# Report each fold's accuracy rounded to 2 decimals, then the mean over folds.
# NOTE: Python 2 print statements; map() returns a list there.
print 'training accuracies: {}\n'.format(map(lambda x: round(x,2), accuracies_train))
print 'average training accuracy: {}\n'.format(round(np.mean(accuracies_train),2))
print 'test accuracies: {}\n'.format(map(lambda x: round(x,2), accuracies_test))
print 'average test accuracy: {}\n'.format(round(np.mean(accuracies_test),2))


Accuracy on the test set is much lower than on the training set: the tree is clearly overfitting.
We can confirm this by looking at the decision boundaries:

In [None]:
plot_decision_regions('Decision regions', X, y, tree_model)

Let's see how many nodes the tree has (for a training fold of just 135 of the 150 data points!)

In [None]:
tree_model.tree_.node_count

## Decision Tree - constrained

We now build another tree, constrained to have at least 5 samples in every leaf.
How does it perform, and how do the decision boundaries look?

In [None]:
# 10 stratified folds over the labels y; iterating skf yields one
# (train_index, test_index) pair per fold.
skf = StratifiedKFold(y = y, n_folds = 10)
# Per-fold accuracies, appended in fold order (the lists are reset here).
accuracies_train = []
accuracies_test = []

for train_index, test_index in skf:
    # Select this fold's training rows and held-out rows by index.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Exercise placeholder: the right-hand side is intentionally left blank.
    # The object bound here must provide .score(X, y), which is called below.
    # Note that this rebinds tree_model.
    ### fill in missing code
    tree_model = 
    
    accuracies_train.append(tree_model.score(X_train, y_train))
    ### fill in missing code
    # same for test
    accuracies_test.append(tree_model.score(X_test, y_test))

# Report each fold's accuracy rounded to 2 decimals, then the mean over folds.
# NOTE: Python 2 print statements; map() returns a list there.
print 'training accuracies: {}\n'.format(map(lambda x: round(x,2), accuracies_train))
print 'average training accuracy: {}\n'.format(round(np.mean(accuracies_train),2))
print 'test accuracies: {}\n'.format(map(lambda x: round(x,2), accuracies_test))
print 'average test accuracy: {}\n'.format(round(np.mean(accuracies_test),2))

# Decision regions of the model from the last fold, drawn over the full data set.
plot_decision_regions('Decision regions', X, y, tree_model)

## Gradient Boosting Tree

Finally, let's try boosting, using scikit-learn's GradientBoostingClassifier:

In [None]:
from sklearn import ensemble

# Evaluate a gradient-boosting classifier with 10-fold stratified cross
# validation. Iterating skf yields one (train_index, test_index) pair per fold.
skf = StratifiedKFold(y=y, n_folds=10)
# Per-fold accuracies, appended in fold order.
accuracies_train = []
accuracies_test = []

for train_index, test_index in skf:
    # Select this fold's training rows and held-out rows by index.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # A fresh model per fold, so that no state carries over between folds.
    gb_model = ensemble.GradientBoostingClassifier(n_estimators=200)
    gb_model.fit(X_train, y_train)
    accuracies_train.append(gb_model.score(X_train, y_train))
    accuracies_test.append(gb_model.score(X_test, y_test))

# Report each fold's accuracy rounded to 2 decimals, then the mean over folds.
# print(...) with a single argument and list comprehensions give the same
# output as the former Python 2 print statements and also run on Python 3,
# where map() would no longer be formatted as a list. float() keeps the list
# elements plain Python floats so they are formatted as bare numbers.
print('training accuracies: {}\n'.format([round(float(a), 2) for a in accuracies_train]))
print('average training accuracy: {}\n'.format(round(np.mean(accuracies_train), 2)))
print('test accuracies: {}\n'.format([round(float(a), 2) for a in accuracies_test]))
print('average test accuracy: {}\n'.format(round(np.mean(accuracies_test), 2)))

# Decision regions of the model from the last fold, drawn over the full data set.
plot_decision_regions('Decision regions', X, y, gb_model)

Interestingly, the ensemble looks overfitted, but test error is not as bad as in the case of the unconstrained decision tree.