In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the raw WDBC table. header=None: the file is read without a header row,
# so the columns are addressed by integer label below.
df = pd.read_csv("./data/breast_cancer_wisconsin_diagnostic/wdbc.data", header=None)

# Column 1 is used as the class label; it is encoded to integers.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].values)

# Columns 2 onward form the feature matrix.
# NOTE(review): column 0 is skipped -- presumably a sample ID; confirm against the dataset description.
X = df.loc[:, 2:].values

# Show the learned class order and the integer codes assigned to 'M' and 'B'.
print(le.classes_)
print(le.transform(['M', 'B']))

['B' 'M']
[1 0]


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test split. stratify=y asks for the class
# proportions of y to be kept in both splits; random_state=1 fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [6]:
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Chain the steps into one estimator: standardize the features, project them
# onto 2 principal components, then classify with logistic regression.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(),
)

# Fit on the training split, then evaluate on the held-out test split.
pipe_lr.fit(X_train, y_train)
test_acc = pipe_lr.score(X_test, y_test)
y_pred = pipe_lr.predict(X_test)
print(f'测试集准确率: {test_acc:.3f}')

测试集准确率: 0.956


In [11]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Manual stratified 10-fold cross-validation over the training split.
# NOTE: .split() returns a one-shot generator, so `kfold` is exhausted once the
# loop below has finished and must be re-created before it can be iterated again.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []

for k, (train, test) in enumerate(kfold):
    # Fit an unfitted copy with the same parameters. Calling pipe_lr.fit() here
    # would replace the pipeline previously fitted on the full training split
    # with one trained on a single fold's subset, leaving pipe_lr in a
    # surprising state for any cell that runs afterwards.
    fold_model = clone(pipe_lr)
    fold_model.fit(X_train[train], y_train[train])
    score = fold_model.score(X_train[test], y_train[test])
    scores.append(score)
    # np.bincount shows how many samples of each class landed in this fold's training part.
    print(f'折(Fold): {k+1:02d}, 类别分布: {np.bincount(y_train[train])}, 准确率.: {score:.3f}')

# Summarize the per-fold accuracies; np.std uses its default ddof=0 (population std).
mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f'\nCV 准确率: {mean_acc:.3f} +/- {std_acc:.3f}')

折(Fold): 01, 类别分布: [256 153], 准确率.: 0.935
折(Fold): 02, 类别分布: [256 153], 准确率.: 0.935
折(Fold): 03, 类别分布: [256 153], 准确率.: 0.957
折(Fold): 04, 类别分布: [256 153], 准确率.: 0.957
折(Fold): 05, 类别分布: [256 153], 准确率.: 0.935
折(Fold): 06, 类别分布: [257 153], 准确率.: 0.956
折(Fold): 07, 类别分布: [257 153], 准确率.: 0.978
折(Fold): 08, 类别分布: [257 153], 准确率.: 0.933
折(Fold): 09, 类别分布: [257 153], 准确率.: 0.956
折(Fold): 10, 类别分布: [257 153], 准确率.: 0.956

CV 准确率: 0.950 +/- 0.014


In [12]:
from sklearn.model_selection import cross_val_score

# Same 10-fold evaluation through the scikit-learn helper; n_jobs=-1 lets it
# use all available CPU cores for the folds.
scores = cross_val_score(
    pipe_lr,
    X_train,
    y_train,
    cv=10,
    n_jobs=-1,
)

# Per-fold accuracies first, then their mean and standard deviation.
print(f'每折的准确率: {scores}')
print(f'\nCV 准确率: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

每折的准确率: [0.93478261 0.93478261 0.95652174 0.95652174 0.93478261 0.95555556
 0.97777778 0.93333333 0.95555556 0.95555556]

CV 准确率: 0.950 +/- 0.014
