In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Location of the breast-cancer-wisconsin "wdbc.data" file on the UCI repository.
WDBC_URL = (
    'https://archive.ics.uci.edu/ml/'
    'machine-learning-databases'
    '/breast-cancer-wisconsin/wdbc.data'
)

# header=None: the first line of the file is read as data rather than as column
# names, so the columns are labelled by integer position (0, 1, 2, ...).
df = pd.read_csv(WDBC_URL, header=None)

In [2]:
# Confirm the dimensions of the loaded table as (rows, columns).
df.shape

(569, 32)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Features come from columns 2 onward and the class label from column 1;
# column 0 is left out (presumably a sample identifier - confirm against the
# dataset description).
x = df.loc[:, 2:].values

# Encode the string class labels as integers. The fitted encoder is kept so the
# label <-> integer mapping can be inspected and reused later.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].values)
le.classes_

array(['B', 'M'], dtype=object)

In [4]:
# Show the integer code the fitted encoder assigns to each label string
# ('M' and 'B', in that order).
le.transform(['M','B'])

array([1, 0], dtype=int64)

In [5]:
# Hold out 20% of the samples for testing. stratify=y keeps the class
# proportions the same in both splits; random_state fixes the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [6]:
# Chain standardisation, a 2-component PCA projection and logistic regression
# into one estimator, then fit every step on the training split only.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1, solver='lbfgs'),
)
pipe_lr.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=2)),
                ('logisticregression', LogisticRegression(random_state=1))])

In [7]:
# Predict the held-out test split once and reuse those predictions for the
# accuracy figure. Calling pipe_lr.score(x_test, y_test) here would run the
# whole pipeline over x_test a second time to produce the same number.
y_pred = pipe_lr.predict(x_test)
print('test accuracy:%.3f' % accuracy_score(y_test, y_pred))


test accuracy:0.956


In [14]:
# Confusion matrix of the test-split predictions: rows are the true classes and
# columns the predicted classes, both in sorted encoded-label order.
a = confusion_matrix(y_true=y_test, y_pred=y_pred)
a

array([[71,  1],
       [ 4, 38]], dtype=int64)

In [21]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold cross-validation carried out on the training split only;
# the held-out test split is not used here.
kfold = StratifiedKFold(n_splits=10).split(x_train,y_train)

scores = []

for k , (train , test) in enumerate(kfold):
    # Fit a fresh, unfitted copy of the pipeline for each fold. Refitting
    # pipe_lr itself would leave it trained on only the last fold's subset,
    # silently replacing the model fitted on the full training split.
    fold_pipe = clone(pipe_lr)
    fold_pipe.fit(x_train[train],y_train[train])
    score = fold_pipe.score(x_train[test],y_train[test])
    scores.append(score)
    # np.bincount reports how many samples of each class the fold trained on.
    print('fold: %2d , class dist.: %s , acc: %.3f' % (k+1,np.bincount(y_train[train]),score))

# Mean and (population) standard deviation of the per-fold accuracies.
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

fold:  1 , class dist.: [256 153] , acc: 0.935
fold:  2 , class dist.: [256 153] , acc: 0.935
fold:  3 , class dist.: [256 153] , acc: 0.957
fold:  4 , class dist.: [256 153] , acc: 0.957
fold:  5 , class dist.: [256 153] , acc: 0.935
fold:  6 , class dist.: [257 153] , acc: 0.956
fold:  7 , class dist.: [257 153] , acc: 0.978
fold:  8 , class dist.: [257 153] , acc: 0.933
fold:  9 , class dist.: [257 153] , acc: 0.956
fold: 10 , class dist.: [257 153] , acc: 0.956

CV accuracy: 0.950 +/- 0.014
