In [178]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC as svm

In [5]:
# Load scikit-learn's bundled breast-cancer dataset. Later cells read its
# .data, .feature_names, .target and .target_names attributes.
cancer = datasets.load_breast_cancer()

In [6]:
# Display the names of the feature columns shipped with the dataset.
cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [10]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the feature names.
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)

In [12]:
# Confirm the frame's dimensions as (rows, columns).
cancer_df.shape

(569, 30)

In [14]:
# Display the class names of the target.
# NOTE(review): presumably the integer labels in cancer.target index into this
# array (0 -> first name, 1 -> second) — confirm against the dataset docs.
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [16]:
# Attach the integer target as a `class_label` column; a new column is appended
# after the existing feature columns.
cancer_df = cancer_df.assign(class_label=cancer.target)

In [17]:
# Preview the first rows to eyeball the feature values and the class_label column.
cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class_label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [19]:
# Count missing values per column; all zeros means no imputation is needed.
cancer_df.isna().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
class_label                0
dtype: int64

In [31]:
# Split the frame into a feature matrix and a label vector.
# Columns are selected by name rather than by position, so the result does not
# depend on `class_label` happening to be the last column of the frame.
x = cancer_df.loc[:, cancer_df.columns != 'class_label'].values
y = cancer_df['class_label'].values

In [177]:
# Hold out 20% of the samples for testing. Stratify on y so both partitions
# keep the class proportions, and fix the seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [33]:
# Dimensions of the training partition (the 80% side of the split).
x_train.shape

(455, 30)

In [34]:
# Dimensions of the held-out test partition (the 20% side of the split).
x_test.shape

(114, 30)

In [105]:
# Logistic-regression pipeline: standardise the features, project them onto
# four principal components, then fit an L2-penalised classifier (C=0.1).
lr_pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=4),
    LogisticRegression(C=0.1, penalty='l2', random_state=1),
)

In [106]:
# Fit every pipeline step on the training partition only; the test partition
# is not touched here. The fitted pipeline is displayed as the cell output.
lr_pipe.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logisticregression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [107]:
# Score the fitted logistic-regression pipeline on the held-out test partition.
lr_test_accuracy = lr_pipe.score(x_test, y_test)
print(" Accuracy = %f" % lr_test_accuracy)

 Accuracy = 0.973684


In [174]:
# Linear-kernel SVM pipeline on standardised features (`svm` is this notebook's
# alias for sklearn.svm.SVC).
# `gamma` is intentionally not passed: per the scikit-learn SVC documentation it
# is the kernel coefficient for the 'rbf', 'poly' and 'sigmoid' kernels only, so
# with kernel='linear' it has no effect and would only suggest a tuning step
# that never happened.
svm_pipe = make_pipeline(
    StandardScaler(),
    svm(C=0.1, kernel='linear', random_state=1),
)

In [175]:
# Fit the scaler and the SVM on the training partition only; the fitted
# pipeline is displayed as the cell output.
svm_pipe.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False))])

In [176]:
# Score the fitted SVM pipeline on the held-out test partition.
svm_test_accuracy = svm_pipe.score(x_test, y_test)
print(" Accuracy = %f" % svm_test_accuracy)

 Accuracy = 0.982456


In [155]:
# Stratified 10-fold (train_idx, test_idx) index pairs over the training partition.
# - random_state is omitted: without shuffling it has no effect, and recent
#   scikit-learn releases raise ValueError when it is combined with shuffle=False.
# - .split() returns a one-shot generator, so the pairs are materialised into a
#   list; any cell that iterates over `kfold` can then be re-run without
#   re-executing this one.
kfold = list(StratifiedKFold(n_splits=10, shuffle=False).split(x_train, y_train))

In [156]:
# Manual stratified cross-validation of the logistic-regression pipeline.
scores = []

for k, (train, test) in enumerate(kfold):
    # Fit a fresh, unfitted copy for every fold. Refitting lr_pipe itself would
    # leave it trained on the last fold's subset, silently replacing the model
    # fitted on the full training partition.
    fold_pipe = clone(lr_pipe)
    fold_pipe.fit(x_train[train], y_train[train])
    score = fold_pipe.score(x_train[test], y_train[test])
    scores.append(score)
    # np.bincount shows the per-class sample counts of this fold's training part.
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train]), score))
    
    

Fold:  1, Class dist.: [153 256], Acc: 0.978
Fold:  2, Class dist.: [153 256], Acc: 0.935
Fold:  3, Class dist.: [153 256], Acc: 0.957
Fold:  4, Class dist.: [153 256], Acc: 1.000
Fold:  5, Class dist.: [153 256], Acc: 0.891
Fold:  6, Class dist.: [153 257], Acc: 1.000
Fold:  7, Class dist.: [153 257], Acc: 0.933
Fold:  8, Class dist.: [153 257], Acc: 0.956
Fold:  9, Class dist.: [153 257], Acc: 0.911
Fold: 10, Class dist.: [153 257], Acc: 0.978


In [158]:
# Summarise the per-fold accuracies as mean and standard deviation.
cv_mean = np.mean(scores)
cv_std = np.std(scores)
print('\n CV Accuracy: %.3f +/- %.3f' % (cv_mean, cv_std))


 CV Accuracy: 0.954 +/- 0.035


In [180]:
# 10-fold cross-validation of the SVM pipeline on the training partition,
# using scikit-learn's helper instead of a hand-written loop.
score_cv = cross_val_score(
    estimator=svm_pipe,
    X=x_train,
    y=y_train,
    cv=10,
    n_jobs=1,
)
print("CV Accuracy Score: %s" % score_cv)

CV Accuracy Score: [0.95652174 0.97826087 0.95652174 0.95652174 0.95652174 1.
 0.97777778 0.95555556 0.93333333 1.        ]


In [182]:
# Mean of the per-fold cross-validation scores (array method form).
score_cv.mean()


0.9671014492753625

In [183]:
# Mean of the per-fold cross-validation scores via the NumPy function form;
# yields the same value as calling .mean() on the array.
np.mean(score_cv)

0.9671014492753625