In [7]:
"""
chapter 06
"""

# import libs

import pandas as pd 
import numpy as np 

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score 

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
# Load the Breast Cancer Wisconsin dataset directly from the UCI repository.
# header=None: the file is read without a header row, so columns are
# addressed by their integer position.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

# Column 1 holds the diagnosis label ('M' / 'B'); every column from 2 onward
# is used as a feature.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].values)
X = df.loc[:, 2:].values

# Show the integer code assigned to each label:
# malignant ('M') -> class 1, benign ('B') -> class 0.
print('label encoding:', le.transform(['M', 'B']))

# Hold out 20% of the samples as the test set; the remaining 80% is the
# training set. The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)

label encoding: [1 0]


In [3]:
# Chain the preprocessing transformers and the final estimator into a single
# scikit-learn Pipeline.
#
# The pipeline is built from a list of (name, object) tuples:
#   - name:   an arbitrary identifier string used to access the individual step
#   - object: a scikit-learn transformer or estimator
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),                    # standardize the feature columns
    ('pca', PCA(n_components=2)),                 # compress the data onto a 2-D subspace
    ('clf', LogisticRegression(random_state=1)),  # logistic regression classifier
])

# Fit every step on the training data, then report accuracy on the held-out test set.
pipe_lr.fit(X_train, y_train)
test_accuracy = pipe_lr.score(X_test, y_test)
print('Test Accuracy: %.3f' % test_accuracy)

Test Accuracy: 0.947


In [5]:
# Manual stratified 10-fold cross-validation on the training set.
#
# No random_state is passed to StratifiedKFold: shuffle keeps its default
# (False), so the splits are already deterministic and a seed would have no
# effect. Recent scikit-learn releases raise a ValueError when random_state
# is set without shuffle=True.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

scores = []
for train_idx, val_idx in kfold:
    # Refit the whole pipeline on the training folds only, then score it on
    # the fold that was left out.
    pipe_lr.fit(X_train[train_idx], y_train[train_idx])
    score = pipe_lr.score(X_train[val_idx], y_train[val_idx])
    scores.append(score)
    # np.bincount shows the number of samples per class in the training folds.
    print('Class dist.: %s, Acc: %.3f' % (np.bincount(y_train[train_idx]), score))

# Summarize the per-fold accuracies as mean +/- standard deviation.
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Class dist.: [256 153], Acc: 0.891
Class dist.: [256 153], Acc: 0.978
Class dist.: [256 153], Acc: 0.978
Class dist.: [256 153], Acc: 0.913
Class dist.: [256 153], Acc: 0.935
Class dist.: [257 153], Acc: 0.978
Class dist.: [257 153], Acc: 0.933
Class dist.: [257 153], Acc: 0.956
Class dist.: [257 153], Acc: 0.978
Class dist.: [257 153], Acc: 0.956

CV accuracy: 0.950 +/- 0.029


In [8]:
# Cross-validation with scikit-learn's cross_val_score helper: the pipeline is
# evaluated on 10 folds of the training set in a single call.
# Available scoring parameters:
# http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
scores = cross_val_score(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,     # number of folds
    n_jobs=1,  # run the folds sequentially in a single process
)
print('CV accuracy scores: %s' % scores)

# Summarize the per-fold accuracies as mean +/- standard deviation.
print('\nCV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

CV accuracy scores: [ 0.89130435  0.97826087  0.97826087  0.91304348  0.93478261  0.97777778
  0.93333333  0.95555556  0.97777778  0.95555556]

CV accuracy: 0.950 +/- 0.029
