In [28]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

In [14]:
data = load_breast_cancer()
print (data.DESCR)

Breast Cancer Wisconsin (Diagnostic) Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        

In [15]:
print(len(data.feature_names))

30


In [16]:
pd.DataFrame(data.data).sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
317,18.22,18.87,118.7,1027.0,0.09746,0.1117,0.113,0.0795,0.1807,0.05664,...,21.84,25.0,140.9,1485.0,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
232,11.22,33.81,70.79,386.8,0.0778,0.03574,0.004967,0.006434,0.1845,0.05828,...,12.36,41.78,78.44,470.9,0.09994,0.06885,0.02318,0.03002,0.2911,0.07307
108,22.27,19.67,152.8,1509.0,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,...,28.4,28.01,206.8,2360.0,0.1701,0.6997,0.9608,0.291,0.4055,0.09789
174,10.66,15.15,67.49,349.6,0.08792,0.04302,0.0,0.0,0.1928,0.05975,...,11.54,19.2,73.2,408.3,0.1076,0.06791,0.0,0.0,0.271,0.06164
473,12.27,29.97,77.42,465.4,0.07699,0.03398,0.0,0.0,0.1701,0.0596,...,13.45,38.05,85.08,558.9,0.09422,0.05213,0.0,0.0,0.2409,0.06743
514,15.05,19.07,97.26,701.9,0.09215,0.08597,0.07486,0.04335,0.1561,0.05915,...,17.58,28.06,113.8,967.0,0.1246,0.2101,0.2866,0.112,0.2282,0.06954
78,20.18,23.97,143.7,1245.0,0.1286,0.3454,0.3754,0.1604,0.2906,0.08142,...,23.37,31.72,170.3,1623.0,0.1639,0.6164,0.7681,0.2508,0.544,0.09964
178,13.01,22.22,82.01,526.4,0.06251,0.01938,0.001595,0.001852,0.1395,0.05234,...,14.0,29.02,88.18,608.8,0.08125,0.03432,0.007977,0.009259,0.2295,0.05843
218,19.8,21.56,129.7,1230.0,0.09383,0.1306,0.1272,0.08691,0.2094,0.05581,...,25.73,28.64,170.3,2009.0,0.1353,0.3235,0.3617,0.182,0.307,0.08255
488,11.68,16.17,75.49,420.5,0.1128,0.09263,0.04279,0.03132,0.1853,0.06401,...,13.32,21.59,86.57,549.8,0.1526,0.1477,0.149,0.09815,0.2804,0.08024


In [17]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, 
                                                    stratify=data.target, 
                                                    random_state=66, test_size=0.2)

In [26]:
pipeline_rf = Pipeline([
    ('standardize', MinMaxScaler()), 
    ('grid_search_lr', GridSearchCV(
        RandomForestClassifier(),
        param_grid={'bootstrap': [True],
                     'max_depth': [50, 100, 150],
                     'max_features': ['auto', 'sqrt'],
                     'min_samples_leaf': [2, 4],
                     'min_samples_split': [2, 5],
                     'n_estimators': [ 200, 500, 1000]},
        cv=5,
        n_jobs=-1,
        scoring='roc_auc',
        verbose=2,
        refit=True
    ))
])

In [27]:
pipeline_rf.fit(X_train, y_train)
y_pred = pipeline_rf.predict(X_test)
y_scores = pipeline_rf.predict_proba(X_test)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 -   0.8s
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 -   0.9s
[CV]  bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 -   0.8s
[CV] bo

[CV]  bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200 -   0.8s
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   20.3s


[CV]  bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1000 -   4.0s
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 -   1.8s
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=50, max_features=auto, mi

[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 -   3.8s
[CV] bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 -   3.8s
[CV] bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200 
[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200 -   0.7s
[CV] bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200 
[CV]  bootstrap=True, max_depth=50, max_features=sqrt, m

[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 -   0.8s
[CV] bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 
[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 -   3.8s
[CV] bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 
[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 -   0.7s
[CV] bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 -   0.7s
[CV] bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=50, max_features=sqrt, min_

[CV]  bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 -   3.8s
[CV] bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1000 
[CV]  bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1000 
[CV]  bootstrap=True, max_depth=100, max_features

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min


[CV]  bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1000 -   4.1s
[CV] bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 -   2.0s
[CV] bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=100, max_features

[CV] bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 -   3.9s
[CV] bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 -   3.8s
[CV] bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200 
[CV]  bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200 -   0.8s
[CV] bootstrap=True, max_depth=100, max_feature

[CV]  bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 -   3.7s
[CV] bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 
[CV]  bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 -   0.7s
[CV] bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 
[CV]  bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 -   3.7s
[CV] bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 
[CV]  bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200 -   0.8s
[CV] bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=100, max_features=

[CV] bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=200 -   0.8s
[CV] bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000 -   3.9s
[CV] bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 
[CV]  bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=500 -   1.9s
[CV]  bootstrap=True, max_depth=150, max_features=a

[CV]  bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000 
[CV]  bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=500 -   1.8s
[CV] bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000 
[CV]  bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=500 -   1.9s
[CV] bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000 
[CV]  bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000 -   3.7s
[CV] bootstrap=True, max_depth=150, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000 
[CV]  bootstrap=True, max_depth=150, max_featur

[CV] bootstrap=True, max_depth=150, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200 
[CV]  bootstrap=True, max_depth=150, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200 -   0.8s
[CV] bootstrap=True, max_depth=150, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200 
[CV]  bootstrap=True, max_depth=150, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000 -   3.7s
[CV] bootstrap=True, max_depth=150, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200 
[CV]  bootstrap=True, max_depth=150, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200 -   0.8s
[CV] bootstrap=True, max_depth=150, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200 
[CV]  bootstrap=True, max_depth=150, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000 -   3.7s
[CV] bootstrap=True, max_depth=150, max_features=s

[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  3.3min finished


NameError: name 'classification_report' is not defined

In [32]:
print (classification_report(y_pred=y_pred, y_true=y_test))
print ("AUC-ROC: "+str(roc_auc_score(y_score=y_scores[:,1], y_true=y_test)))

             precision    recall  f1-score   support

          0       1.00      0.93      0.96        42
          1       0.96      1.00      0.98        72

avg / total       0.97      0.97      0.97       114

AUC-ROC: 0.976686507937
