# Model evaluation and hyperparameter tuning

## Plotting function for decision boundaries, test and training samples

In [None]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features, with the samples on top.

    Parameters
    ----------
    X : 2-D array
        Feature matrix. Columns 0 and 1 are used as the plot axes and are what
        gets passed to ``classifier.predict``.
    y : 1-D array
        Class label for each row of ``X``.
    classifier : object
        Anything exposing ``predict``; it is called once on the whole grid.
    test_idx : sequence of int, optional
        Row indices of ``X`` to circle as test samples. ``None`` or an empty
        sequence disables the highlight.
    resolution : float
        Step of the grid on which the classifier is evaluated.

    Notes
    -----
    Draws on the current pyplot figure and returns nothing. Only five
    markers/colours are defined, so more than five distinct labels in ``y``
    raises ``IndexError``.
    """
    # Imported here because the file only imports numpy in a later cell.
    import numpy as np

    # Setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Grid covering the data range with a margin of 1 on every side
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # Predict every grid point in one call, then fold back into the grid shape
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)

    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)

    # Plot all samples, one scatter call per class so each gets a legend entry
    for idx, cl in enumerate(classes):
        mask = (y == cl)
        # `color=` rather than `c=`: a bare RGBA tuple passed as `c` is
        # ambiguous with a sequence of scalar values.
        plt.scatter(x=X[mask, 0], y=X[mask, 1],
                    alpha=0.8, color=cmap(idx), marker=markers[idx], label=cl)

    # Highlight test samples.
    # Explicit None/length check: `if test_idx:` raises on a NumPy index array.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[test_idx, :]
        # Hollow circles: facecolors='none' replaces the old c='' trick,
        # which current matplotlib rejects as an invalid colour.
        plt.scatter(X_test[:, 0], X_test[:, 1], alpha=1.0,
                    facecolors='none', edgecolors='black',
                    marker='o', label='test sample')

## Getting the Breast Cancer data

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed later. Fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# One-time download: fetch the data from the web and store a local copy.
# Kept commented out so the cell runs from the file on disk.
# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', 
#                   header=None)
# data.to_csv("breast_cancer.csv", header=False, index=False)

# Read the local copy (the file has no header row).
data = pd.read_csv("breast_cancer.csv", header=None, index_col=False)

# Column 1 is used as the label, columns 2 onward as the features.
X = data.iloc[:, 2:].values
y = data.iloc[:, 1].values
# Replace the raw labels with integer class codes.
y = LabelEncoder().fit_transform(y)

# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Combining transformers and estimators in pipeline

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Three chained steps: standardise the features, project them onto two
# principal components, then classify with logistic regression.
pca_lr_pipe = Pipeline(steps=[
    ('std', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('lr', LogisticRegression(random_state=1)),
])

# Fit every step on the training split ...
pca_lr_pipe.fit(X_train, y_train)
# ... and leave the held-out score as the cell's last expression so the
# notebook displays it.
pca_lr_pipe.score(X_test, y_test)

# Using stratified k-fold cross validation 

In [None]:
import numpy as np

# Build the 10 stratified train/validation index splits of the training set.
# The splitter moved to sklearn.model_selection in scikit-learn 0.18 and its
# API changed: the labels are now passed to split() instead of the constructor.
try:
    from sklearn.model_selection import StratifiedKFold
    # No random_state here: the splitter does not shuffle, and recent
    # releases reject a seed unless shuffle=True.
    skf = StratifiedKFold(n_splits=10)
    folds = skf.split(X_train, y_train)
except ImportError:
    from sklearn.cross_validation import StratifiedKFold
    skf = StratifiedKFold(y_train, n_folds=10, random_state=1)
    folds = skf

scores = []
for k, (trn_index, val_index) in enumerate(folds, start=1):
    X_trn, X_val = X_train[trn_index], X_train[val_index]
    y_trn, y_val = y_train[trn_index], y_train[val_index]
    # Refits the shared pipeline in place; after the loop it holds the
    # model trained on the last fold.
    pca_lr_pipe.fit(X_trn, y_trn)
    score = pca_lr_pipe.score(X_val, y_val)
    scores.append(score)
    # np.bincount shows how many samples of each class ended up in the fold.
    print("k = %s, Class dist: %s, score = %0.3f" % (k, np.bincount(y_trn), score))
print("CV accuracy = %s +/- %s " % (np.mean(scores), np.std(scores)))

In [None]:
# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# fall back to the old module on earlier installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Same 10-fold evaluation as the manual loop, done by the library.
# n_jobs was previously set from an undefined name (NameError); use a single
# worker, matching the other cross-validation cells in this notebook.
scores = cross_val_score(estimator=pca_lr_pipe,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print("CV accuracy = %s +/- %s" % (np.mean(scores), np.std(scores)))

## Plotting learning curve

# learning_curve lives in sklearn.model_selection since scikit-learn 0.18;
# the sklearn.learning_curve module was removed later.
try:
    from sklearn.model_selection import learning_curve
except ImportError:
    from sklearn.learning_curve import learning_curve

# Score the pipeline with 10-fold CV at 15 training-set sizes, from 10% to
# 100% of the available training data.
train_sizes, train_scores, test_scores = learning_curve(estimator=pca_lr_pipe,
                                                        X=X_train,
                                                        y=y_train,
                                                        train_sizes=np.linspace(0.1, 1.0, 15),
                                                        cv=10,
                                                        n_jobs=1)

# Per-size mean and spread across the folds (axis 1), computed once.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# Mean curve plus a +/- one standard deviation band for each score set.
plt.plot(train_sizes, train_mean, c='blue', marker='o', label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 color='blue', alpha=0.2)

plt.plot(train_sizes, test_mean, c='green', marker='x', label='testing accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 color='green', alpha=0.2)

plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.ylim(0.8, 1)
plt.grid()
plt.legend(loc='lower right')
plt.show()

## Plotting validation curve 

In [None]:
# validation_curve lives in sklearn.model_selection since scikit-learn 0.18;
# the sklearn.learning_curve module was removed later.
try:
    from sklearn.model_selection import validation_curve
except ImportError:
    from sklearn.learning_curve import validation_curve

# Sweep the C parameter of the pipeline's 'lr' step, scoring each value with
# 10-fold CV.
param_range = [0.001, 0.01, 0.1, 1.0, 10, 100]
train_scores, test_scores = validation_curve(estimator=pca_lr_pipe,
                                             X=X_train,
                                             y=y_train,
                                             param_name='lr__C',
                                             param_range=param_range,
                                             cv=10,
                                             n_jobs=1)
# print() with a single argument behaves the same on Python 2 and 3.
print(train_scores.shape)
print(test_scores.shape)

# Per-value mean and spread across the folds (axis 1), computed once.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# Mean curve plus a +/- one standard deviation band for each score set.
plt.plot(param_range, train_mean, c='blue', marker='o', label='training accuracy')
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std,
                 color='blue', alpha=0.2)

plt.plot(param_range, test_mean, c='green', marker='x', label='testing accuracy')
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                 color='green', alpha=0.2)

# The parameter values span several orders of magnitude.
plt.xscale('log')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
# The series carry labels, so show them (as the learning-curve plot does).
plt.legend(loc='lower right')
plt.show()


## Hyperparameter tuning using grid search

In [106]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# the sklearn.grid_search module was removed later.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

# Standardise the features, then classify with a support vector machine.
pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=1))])

# Two sub-grids: the linear kernel only has C to tune, the RBF kernel has
# both C and gamma. Keys address parameters of the pipeline's 'clf' step.
param_range = [0.01, 0.1]
param_grid = [{'clf__C': param_range, 'clf__kernel': ['linear']},
              {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svm,
                  param_grid=param_grid)
gs.fit(X_train, y_train)

# print() with a single argument behaves the same on Python 2 and 3.
print(gs.best_params_)
print(gs.best_score_)

# Per-candidate results: cv_results_ replaced grid_scores_, which was later
# removed. Prefer the new attribute and fall back on old installs.
results = getattr(gs, 'cv_results_', None)
if results is not None:
    for mean, std, params in zip(results['mean_test_score'],
                                 results['std_test_score'],
                                 results['params']):
        print("mean: %.5f, std: %.5f, params: %s" % (mean, std, params))
else:
    print(gs.grid_scores_)

Shan
{'clf__C': 0.1, 'clf__kernel': 'linear'}
0.969230769231
huha
[mean: 0.95385, std: 0.02149, params: {'clf__C': 0.01, 'clf__kernel': 'linear'}, mean: 0.96923, std: 0.03053, params: {'clf__C': 0.1, 'clf__kernel': 'linear'}, mean: 0.62637, std: 0.00195, params: {'clf__gamma': 0.01, 'clf__C': 0.01, 'clf__kernel': 'rbf'}, mean: 0.62637, std: 0.00195, params: {'clf__gamma': 0.1, 'clf__C': 0.01, 'clf__kernel': 'rbf'}, mean: 0.94066, std: 0.00537, params: {'clf__gamma': 0.01, 'clf__C': 0.1, 'clf__kernel': 'rbf'}, mean: 0.90549, std: 0.02499, params: {'clf__gamma': 0.1, 'clf__C': 0.1, 'clf__kernel': 'rbf'}]
