In [17]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC




In [18]:
from sklearn.datasets import load_breast_cancer

In [19]:
data = load_breast_cancer()
print (data.DESCR)
pd.DataFrame(data.data).sample(10)

Breast Cancer Wisconsin (Diagnostic) Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
497,12.47,17.31,80.45,480.1,0.08928,0.0763,0.03609,0.02369,0.1526,0.06046,...,14.06,24.34,92.82,607.3,0.1276,0.2506,0.2028,0.1053,0.3035,0.07661
151,8.219,20.7,53.27,203.9,0.09405,0.1305,0.1321,0.02168,0.2222,0.08261,...,9.092,29.72,58.08,249.8,0.163,0.431,0.5381,0.07879,0.3322,0.1486
541,14.47,24.99,95.81,656.4,0.08837,0.123,0.1009,0.0389,0.1872,0.06341,...,16.22,31.73,113.5,808.9,0.134,0.4202,0.404,0.1205,0.3187,0.1023
282,19.4,18.18,127.2,1145.0,0.1037,0.1442,0.1626,0.09464,0.1893,0.05892,...,23.79,28.65,152.4,1628.0,0.1518,0.3749,0.4316,0.2252,0.359,0.07787
188,11.81,17.39,75.27,428.9,0.1007,0.05562,0.02353,0.01553,0.1718,0.0578,...,12.57,26.48,79.57,489.5,0.1356,0.1,0.08803,0.04306,0.32,0.06576
473,12.27,29.97,77.42,465.4,0.07699,0.03398,0.0,0.0,0.1701,0.0596,...,13.45,38.05,85.08,558.9,0.09422,0.05213,0.0,0.0,0.2409,0.06743
317,18.22,18.87,118.7,1027.0,0.09746,0.1117,0.113,0.0795,0.1807,0.05664,...,21.84,25.0,140.9,1485.0,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
428,11.13,16.62,70.47,381.1,0.08151,0.03834,0.01369,0.0137,0.1511,0.06148,...,11.68,20.29,74.35,421.1,0.103,0.06219,0.0458,0.04044,0.2383,0.07083
319,12.43,17.0,78.6,477.3,0.07557,0.03454,0.01342,0.01699,0.1472,0.05561,...,12.9,20.21,81.76,515.9,0.08409,0.04712,0.02237,0.02832,0.1901,0.05932
213,17.42,25.56,114.5,948.0,0.1006,0.1146,0.1682,0.06597,0.1308,0.05866,...,18.07,28.07,120.4,1021.0,0.1243,0.1793,0.2803,0.1099,0.1603,0.06818


In [20]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, 
                                                    stratify=data.target, 
                                                    random_state=66, test_size=0.2)

In [21]:
pipeline = Pipeline([
    ('standardize', StandardScaler()), 
    ('grid_search_lr', GridSearchCV(
        KNeighborsClassifier(),
        param_grid={'n_neighbors': [3,5,7,10],
                    'p':[1,2],
                    },
        cv=5,
        n_jobs=-1,
        scoring='roc_auc',
        verbose=2,
        refit=True
    ))
])

In [23]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_scores = pipeline.predict_proba(X_test)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] n_neighbors=3, p=1 ..............................................
[CV] n_neighbors=3, p=1 ..............................................
[CV] n_neighbors=3, p=1 ..............................................
[CV] n_neighbors=3, p=1 ..............................................
[CV] ..................................... n_neighbors=3, p=1 -   0.0s
[CV] n_neighbors=3, p=1 ..............................................
[CV] ..................................... n_neighbors=3, p=1 -   0.0s
[CV] n_neighbors=3, p=2 ..............................................
[CV] ..................................... n_neighbors=3, p=1 -   0.0s
[CV] n_neighbors=3, p=2 ..............................................
[CV] ..................................... n_neighbors=3, p=1 -   0.0s
[CV] ..................................... n_neighbors=3, p=2 -   0.0s
[CV] ..................................... n_neighbors=3, p=2 -   0.0s
[CV] ............

[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.4s finished


In [24]:
print (classification_report(y_pred=y_pred, y_true=y_test))
print ("AUC-ROC: "+str(roc_auc_score(y_score=y_scores[:,1], y_true=y_test)))

             precision    recall  f1-score   support

          0       1.00      0.88      0.94        42
          1       0.94      1.00      0.97        72

avg / total       0.96      0.96      0.96       114

AUC-ROC: 0.980158730159
