## 1. Import necessasary libraries

In [49]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel,RFE
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix

import pandas as pd

## 2. Import data

In [3]:
cancer = load_breast_cancer()
cancer

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [7]:
cancer_data = pd.DataFrame(data=cancer['data'],columns=cancer['feature_names'])
cancer_data['Target'] = cancer['target']
cancer_data.head(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,0
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,0
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,0
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,0
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,0


## 3. Model Building

In [8]:
X = cancer_data.drop(labels=['Target'],axis = 1)
y = cancer_data['Target']

In [10]:
X.shape,y.shape

((569, 30), (569,))

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.20,random_state = 0)
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((455, 30), (455,), (114, 30), (114,))

# 4. Model Training

## Technique 1: Use 'SelectFromModel' from sklearn library for feature selection

* SelectFromModel is not standalone. It expects an estimator to train the data.

In [16]:
feature_selector = SelectFromModel(estimator = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=0),max_features=None) 
#n-jobs = -1, makes sure, it utilizes all the cores of your machine during training
feature_selector.fit(X_train,y_train)

SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1, random_state=0))

In [17]:
feature_selector.get_support()

array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
        True, False, False])

In [18]:
X_train.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [20]:
features_names = X_train.columns[feature_selector.get_support()]
features_names

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

In [22]:
X_train.shape

(455, 30)

In [21]:
len(features_names)

10

In [23]:
X_train_rfc = feature_selector.transform(X_train)
X_test_rfc  = feature_selector.transform(X_test)

In [33]:
def runRandomForestClassifier(X_train,X_test,y_train,y_test):
    rf_model = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=0)
    rf_model.fit(X_train,y_train)
    y_pred = rf_model.predict(X_test)
    print("Accuracy         : ",round(accuracy_score(y_test,y_pred),5))
    print("Precision        : ",round(precision_score(y_test,y_pred),5))
    print("Recall           : ",round(recall_score(y_test,y_pred),5))
    print("Confusion Matrix  \n",confusion_matrix(y_test,y_pred))

In [35]:
%%time
runRandomForestClassifier(X_train,X_test,y_train,y_test)

Accuracy         :  0.96491
Precision        :  0.98462
Recall           :  0.95522
Confusion Matrix  
 [[46  1]
 [ 3 64]]
Wall time: 341 ms


In [37]:
%%time
runRandomForestClassifier(X_train_rfc,X_test_rfc,y_train,y_test)

Accuracy         :  0.94737
Precision        :  0.96923
Recall           :  0.9403
Confusion Matrix  
 [[45  2]
 [ 4 63]]
Wall time: 307 ms


## Technique No 2: RFE - Recursive Feature Elimination

In [39]:
rfe_feature_selector = RFE(estimator=RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=0),n_features_to_select=None)
rfe_feature_selector.fit(X_train,y_train)

RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=0))

In [40]:
rfe_feature_selector.get_support()

array([ True,  True,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True,  True, False])

In [41]:
X_train.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [43]:
feature_names = X_train.columns[rfe_feature_selector.get_support()]
feature_names

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'area error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst concavity', 'worst concave points', 'worst symmetry'],
      dtype='object')

In [44]:
len(feature_names)

15

In [45]:
X_train_rfc = rfe_feature_selector.transform(X_train)
X_test_rfc  = rfe_feature_selector.transform(X_test)

In [46]:
def runRandomForestClassifier(X_train,X_test,y_train,y_test):
    rf_model = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=0)
    rf_model.fit(X_train,y_train)
    y_pred = rf_model.predict(X_test)
    print("Accuracy         : ",round(accuracy_score(y_test,y_pred),5))
    print("Precision        : ",round(precision_score(y_test,y_pred),5))
    print("Recall           : ",round(recall_score(y_test,y_pred),5))
    print("Confusion Matrix  \n",confusion_matrix(y_test,y_pred))

In [47]:
%%time
runRandomForestClassifier(X_train,X_test,y_train,y_test)

Accuracy         :  0.96491
Precision        :  0.98462
Recall           :  0.95522
Confusion Matrix  
 [[46  1]
 [ 3 64]]
Wall time: 397 ms


In [48]:
%%time
runRandomForestClassifier(X_train_rfc,X_test_rfc,y_train,y_test)

Accuracy         :  0.97368
Precision        :  0.97059
Recall           :  0.98507
Confusion Matrix  
 [[45  2]
 [ 1 66]]
Wall time: 287 ms


## Pick important features using GradientBoosting Classifier

In [51]:
rfe_feature_selector = RFE(estimator=GradientBoostingClassifier(n_estimators=100,random_state=0),n_features_to_select=12)
rfe_feature_selector.fit(X_train,y_train)

RFE(estimator=GradientBoostingClassifier(random_state=0),
    n_features_to_select=12)

In [52]:
rfe_feature_selector.get_support()

array([False,  True, False, False,  True, False, False,  True,  True,
       False, False, False, False,  True, False, False,  True, False,
       False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False])

In [53]:
feature_names = X_train.columns[rfe_feature_selector.get_support()]
feature_names

Index(['mean texture', 'mean smoothness', 'mean concave points',
       'mean symmetry', 'area error', 'concavity error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst concavity',
       'worst concave points'],
      dtype='object')

In [54]:
len(feature_names)

12

In [55]:
X_train_rfc = rfe_feature_selector.transform(X_train)
X_test_rfc  = rfe_feature_selector.transform(X_test)

In [56]:
def runRandomForestClassifier(X_train,X_test,y_train,y_test):
    rf_model = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=0)
    rf_model.fit(X_train,y_train)
    y_pred = rf_model.predict(X_test)
    print("Accuracy         : ",round(accuracy_score(y_test,y_pred),5))
    print("Precision        : ",round(precision_score(y_test,y_pred),5))
    print("Recall           : ",round(recall_score(y_test,y_pred),5))
    print("Confusion Matrix  \n",confusion_matrix(y_test,y_pred))

In [57]:
%%time
runRandomForestClassifier(X_train,X_test,y_train,y_test)

Accuracy         :  0.96491
Precision        :  0.98462
Recall           :  0.95522
Confusion Matrix  
 [[46  1]
 [ 3 64]]
Wall time: 369 ms


In [58]:
%%time
runRandomForestClassifier(X_train_rfc,X_test_rfc,y_train,y_test)

Accuracy         :  0.97368
Precision        :  0.98485
Recall           :  0.97015
Confusion Matrix  
 [[46  1]
 [ 2 65]]
Wall time: 384 ms


## How to decide the Optimum number of features

In [60]:
for i in range(1,31):
    rfe_feature_selector = RFE(estimator=GradientBoostingClassifier(n_estimators=100,random_state=0),n_features_to_select=i)
    rfe_feature_selector.fit(X_train,y_train)
    X_train_rfc = rfe_feature_selector.transform(X_train)
    X_test_rfc  = rfe_feature_selector.transform(X_test)
    print('Selected Features : ',i)
    runRandomForestClassifier(X_train_rfc,X_test_rfc,y_train,y_test)
    print("\n")

Selected Features :  1
Accuracy         :  0.87719
Precision        :  0.86301
Recall           :  0.9403
Confusion Matrix  
 [[37 10]
 [ 4 63]]


Selected Features :  2
Accuracy         :  0.90351
Precision        :  0.88889
Recall           :  0.95522
Confusion Matrix  
 [[39  8]
 [ 3 64]]


Selected Features :  3
Accuracy         :  0.96491
Precision        :  0.97015
Recall           :  0.97015
Confusion Matrix  
 [[45  2]
 [ 2 65]]


Selected Features :  4
Accuracy         :  0.97368
Precision        :  0.97059
Recall           :  0.98507
Confusion Matrix  
 [[45  2]
 [ 1 66]]


Selected Features :  5
Accuracy         :  0.96491
Precision        :  0.97015
Recall           :  0.97015
Confusion Matrix  
 [[45  2]
 [ 2 65]]


Selected Features :  6
Accuracy         :  0.99123
Precision        :  1.0
Recall           :  0.98507
Confusion Matrix  
 [[47  0]
 [ 1 66]]


Selected Features :  7
Accuracy         :  0.97368
Precision        :  0.97059
Recall           :  0.98507
Confusion 

In [62]:
rfe_feature_selector = RFE(estimator=GradientBoostingClassifier(n_estimators=100,random_state=0),n_features_to_select=6)
rfe_feature_selector.fit(X_train,y_train)
X_train_rfc = rfe_feature_selector.transform(X_train)
X_test_rfc  = rfe_feature_selector.transform(X_test)
runRandomForestClassifier(X_train_rfc,X_test_rfc,y_train,y_test)

Accuracy         :  0.99123
Precision        :  1.0
Recall           :  0.98507
Confusion Matrix  
 [[47  0]
 [ 1 66]]


In [65]:
X_train.columns[rfe_feature_selector.get_support()]

Index(['mean concave points', 'area error', 'worst texture', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

## Model Deployement

In [63]:
from pickle import dump

In [None]:
dump()