In [55]:
import matplotlib.pyplot as plt
import numpy as np

from matplotlib.colors import ListedColormap

from sklearn import datasets, svm
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_recall_fscore_support

%matplotlib inline

In [3]:
#Define color map
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

#Define function to plot iris data
def plot_estimator(estimator, X, y):
    estimator.fit(X, y)
    x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
    y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
    
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), 
                         np.linspace(y_min, y_max, 100))
    
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
    
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap = cmap_light)
    
    plt.scatter(X[:, 0], X[:, 1], c = y, cmap = cmap_bold)
    plt.axis('tight')
    plt.tight_layout()

In [6]:
#Load data
iris = datasets.load_iris()

X = iris.data
y = iris.target

#Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40)

print('Training set has %d points' % len(X_train))
print('Test set has %d points' % len(X_test))

Training set has 90 points
Test set has 60 points


In [8]:
svc = svm.SVC(kernel = 'linear', C = 1)
svc.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
print('SVC score for the test dataset is %0.3f' % svc.score(X_test, y_test))

SVC score for the test dataset is 0.983


In [74]:
#The score is higher using the split dataset to train and test sets than 
#the un-split dataset.

In [73]:
#Build cross validation iterator
cval_itr = KFold(len(iris.data), n_folds = 5)

svc_score = np.array([])
precision = np.array([])
recall = np.array([])
f_score = np.array([])


for train, test in cval_itr:
    #Load training data into a data frame
    X_train = iris.data[train][:, [0,1]]
    y_train = iris.target[train]
    
    #Load test data into a data frame
    X_test = iris.data[test][:, [0,1]]
    y_test = iris.target[test]
    
    #Fit to training data
    svc = svm.SVC(kernel = 'linear', C = 1)
    svc.fit(X_train, y_train)
    
    #Predict categories
    y_predict = svc.predict(X_test)
    
    #Calculate precision, recall and f1 score
    results = precision_recall_fscore_support(y_test, y_predict, average = 'weighted')
    precision = np.append(precision, results[0])
    recall = np.append(recall, results[1])
    f_score = np.append(f_score, results[2])
    
    #Store score for test data
    svc_score = np.append(svc_score, [float(svc.score(X_test, y_test))])

print('Mean score for 5-fold cross validation: %0.3f' % svc_score.mean())
print('Score standard deviation for 5-fold cross validation: %0.3f\n' % svc_score.std())

print('Mean F1 score for 5-fold cross validation: %0.3f' % f_score.mean())

Mean score for 5-fold cross validation: 0.613
Score standard deviation for 5-fold cross validation: 0.309

Mean F1 score for 5-fold cross validation: 0.412


