In [1]:
# Model evaluation and hyper-parameter tuning
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# Load the Wisconsin breast-cancer (wdbc) data from the UCI repository.
# header=None: the first row is read as data, so columns get integer labels
# (0, 1, 2, ...) which the following cell relies on for slicing.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', 
                 header=None)

In [2]:
# Feature matrix: every column from label 2 onwards, as a NumPy array.
X = df.loc[:, 2:].values

# Target vector: column 1, with the raw class labels mapped to integers by
# LabelEncoder. The fitted encoder is kept in `le` so the mapping can be
# inspected or inverted later.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].values)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Use a Pipeline so that preprocessing and the classifier are trained and
# applied together as a single estimator.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline_lr = Pipeline(steps=[
    ('scl', StandardScaler()),                      # standardise each feature
    ('pca', PCA(n_components=2)),                   # keep 2 principal components
    ('clf', LogisticRegression(random_state=42)),   # final classifier
])

# Fit every step on the training split only, then report accuracy on the
# held-out test split.
pipeline_lr.fit(X_train, y_train)
print('Test accuracy: ', pipeline_lr.score(X_test, y_test))

Test accuracy:  0.991228070175


In [5]:
import numpy as np
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `sklearn.model_selection` (already used for train_test_split) replaces it.
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified split of the training data. No shuffling is requested, so
# no random_state is passed (it would have no effect, and recent scikit-learn
# releases reject random_state when shuffle=False).
# The splits are materialised so that `kfold` remains an iterable of
# (train_indices, test_indices) pairs, as it was with the old API.
kfold = list(StratifiedKFold(n_splits=10).split(X_train, y_train))

# Manual cross-validation loop. Note that `pipeline_lr` is refit in place on
# every iteration, so after this cell it is fitted on the last fold's
# training portion rather than on the full training set.
scores = []
for k, (train, test) in enumerate(kfold):
    pipeline_lr.fit(X_train[train], y_train[train])
    score = pipeline_lr.score(X_train[test], y_train[test])
    scores.append(score)
    # np.bincount shows the per-class sample counts of the fold's training part.
    print('Fold: %s, Class dist: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train]), score))

Fold: 1, Class dist: [257 152], Acc: 0.935
Fold: 2, Class dist: [257 152], Acc: 0.913
Fold: 3, Class dist: [257 152], Acc: 0.978
Fold: 4, Class dist: [257 152], Acc: 0.891
Fold: 5, Class dist: [257 152], Acc: 0.978
Fold: 6, Class dist: [257 152], Acc: 0.957
Fold: 7, Class dist: [258 152], Acc: 0.933
Fold: 8, Class dist: [258 152], Acc: 0.956
Fold: 9, Class dist: [258 152], Acc: 0.956
Fold: 10, Class dist: [258 153], Acc: 0.932




In [6]:
# Summarise the per-fold accuracies from the manual loop as mean +/- std.
print('CV accuracy: %.3f +/- %.3f' % (
    np.mean(scores),
    np.std(scores),
))

CV accuracy: 0.943 +/- 0.026


In [7]:
# Equivalent to the manual fold loop, and the preferred way to do it:
# cross_val_score performs the splitting, fitting and scoring in one call.
# Imported from `sklearn.model_selection`; the old `sklearn.cross_validation`
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score

# cv=10 requests 10 folds; n_jobs=1 runs the folds sequentially.
scores2 = cross_val_score(estimator=pipeline_lr, X=X_train, y=y_train, cv=10, n_jobs=1)
scores2

array([ 0.93478261,  0.91304348,  0.97826087,  0.89130435,  0.97826087,
        0.95652174,  0.93333333,  0.95555556,  0.95555556,  0.93181818])

In [8]:
# Summarise the cross_val_score results as mean +/- std.
print('CV accuracy: %.3f +/- %.3f' % (
    np.mean(scores2),
    np.std(scores2),
))

CV accuracy: 0.943 +/- 0.026
