### Feature Engineering: any operation we perform on the features is called feature engineering

1. Feature Elimination - Dropping the features
2.  Feature Addition - Adding some features.
3.  Feature Transformation - Transforming given features values into another scale - Log Transformation , Sqrt Transformation
4.  Feature Selection - Deciding which of the many available features are important and choosing those features for model building.
 

### Feature Selection Techniques - These use ML models to select a subset of the features

1. Sklearn - Select From Model 
2. Sklearn - RFE - Recursive Feature Elimination



In [7]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix
from sklearn.feature_selection import RFE,SelectFromModel


import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load scikit-learn's breast-cancer dataset; later cells read its
# `feature_names`, `data` and `target` fields.
data = load_breast_cancer()

In [6]:
# Assemble one frame: the feature matrix under its column names, plus the
# class label appended as a trailing `target` column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data['target'])
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [20]:
# Dimensions as (rows, columns); the column count includes the added `target` column.
df.shape

(569, 31)

### 1.  Model Building 

In [26]:
# Separate the predictors from the label, then hold out 20% of the rows for testing.
# The split is shuffled with a fixed seed so it is the same on every run.
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, shuffle=True
)

In [27]:

# Baseline: fit a random forest on every feature and score it on the hold-out set.
# random_state is pinned so a Restart & Run All rebuilds the same forest (and the
# same accuracy); the metric functions are already provided by the imports cell.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_predict = rf_classifier.predict(X_test)
accuracy_score(y_test, y_predict)

0.9649122807017544

In [28]:
# Precision for the positive class.
# NOTE(review): pos_label defaults to 1; confirm via data.target_names which diagnosis label 1 denotes.
precision_score(y_test,y_predict)

0.958904109589041

In [29]:
# Recall for the positive class (pos_label defaults to 1).
recall_score(y_test,y_predict)

0.9859154929577465

In [30]:
# NOTE(review): confusion_matrix is already imported in the imports cell; this re-import is redundant.
from sklearn.metrics import confusion_matrix
# Rows are true labels and columns are predicted labels, in sorted label order.
confusion_matrix(y_test,y_predict)

array([[40,  3],
       [ 1, 70]], dtype=int64)

## 2 .Feature Selection Techniques 

### 1.  SelectFromModel - selects features based on the weights (importances or coefficients) the model assigns to them

In [31]:
# Rank features with a random forest and keep those whose importance is at or
# above the selector's default threshold. The estimator is seeded so the chosen
# feature set (and its size) is the same on every run; max_features=None places
# no cap on how many features may be kept.
select_from_model = SelectFromModel(
    estimator=RandomForestClassifier(random_state=42),
    max_features=None,
)

select_from_model.fit(X_train, y_train)
                     

SelectFromModel(estimator=RandomForestClassifier())

In [32]:
# Boolean mask over the input columns: True marks a feature the selector kept.
select_from_model.get_support()

array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True,  True, False, False,  True,
        True, False, False])

In [33]:
# Number of features the selector kept. len() counts the selected labels directly;
# nunique() only gave the same figure because the column names are all distinct.
len(X_train.columns[select_from_model.get_support()])

10

In [34]:
# Names of the features the selector kept.
X_train.columns[select_from_model.get_support()]

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'worst radius', 'worst perimeter', 'worst area',
       'worst concavity', 'worst concave points'],
      dtype='object')

In [35]:

# Reduce both splits to the columns chosen by the fitted selector: transform()
# drops every feature whose support-mask entry is False. The reduced matrices
# are what the next model is trained and scored on.
X_train_transform, X_test_transform = (
    select_from_model.transform(split) for split in (X_train, X_test)
)

X_train_transform.shape

(455, 10)

In [36]:
# The test split was reduced with the same fitted selector, so its column count
# should match the training split's.
X_test_transform.shape

(114, 10)

In [37]:
# Retrain on the selected features only. Seeding the forest keeps this score
# reproducible, so any difference from the all-features baseline is due to the
# feature set rather than to run-to-run randomness.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_transform, y_train)
y_predict = rf_classifier.predict(X_test_transform)
accuracy_score(y_test, y_predict)

0.9649122807017544

In [55]:
def Check_accuracy(X_train, X_test, train_labels=None, test_labels=None):
    """Fit a seeded random forest and report its accuracy on the test features.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for scoring respectively.
    train_labels, test_labels : array-like, optional
        Labels matching ``X_train`` / ``X_test``. When omitted, the
        notebook-level ``y_train`` / ``y_test`` are used, which is what the
        function always did before these parameters existed.

    Returns
    -------
    float
        Accuracy on ``X_test``. The value is also printed, so existing calls
        keep producing the same output. In a cell whose last line is a bare
        call the returned value is echoed as well; end that line with ``;``
        to suppress the echo.
    """
    # Fall back to the notebook globals only when the caller gave no labels.
    if train_labels is None:
        train_labels = y_train
    if test_labels is None:
        test_labels = y_test

    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, train_labels)
    score = accuracy_score(test_labels, rf_classifier.predict(X_test))
    print(score)
    # Returning the score lets callers collect and compare results
    # instead of receiving None.
    return score

### 2.  RFE - Recursive Feature Elimination

In [42]:

# n_features_to_select=None makes RFE keep half of the features.
# The forest is seeded so the elimination order, and hence the surviving
# features, is reproducible across runs.
rfe_model = RFE(
    estimator=RandomForestClassifier(random_state=42),
    n_features_to_select=None,
)

rfe_model.fit(X_train, y_train)
rfe_model.get_support()

array([ True,  True,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True, False, False])

In [43]:
# Names of the features that survived recursive elimination.
X_train.columns[rfe_model.get_support()]

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'area error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points'],
      dtype='object')

In [44]:
# Number of features RFE kept. len() counts the selected labels directly;
# nunique() only matched because the column names are all distinct.
len(X_train.columns[rfe_model.get_support()])

15

In [45]:
# Project the train and test splits onto the columns retained by the fitted RFE model.
X_train_rfe_transform, X_test_rfe_transform = (
    rfe_model.transform(split) for split in (X_train, X_test)
)

In [59]:
%%time
# Train a new seeded random forest on the RFE-selected columns and print its test accuracy.
Check_accuracy(X_train_rfe_transform,X_test_rfe_transform)

0.9649122807017544
Wall time: 127 ms


### Check accuracy with Gradient Boosting

In [71]:
# Repeat recursive elimination with a seeded gradient-boosting estimator as the
# ranker, keeping 15 features. fit() returns the fitted selector itself, so the
# construction and fit are chained.
rfe_model = RFE(
    estimator=GradientBoostingClassifier(random_state=42),
    n_features_to_select=15,
).fit(X_train, y_train)
rfe_model.get_support()

array([False,  True, False, False,  True,  True, False,  True, False,
       False,  True,  True, False,  True, False, False,  True, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True, False, False])

In [72]:
# Names of the features kept when gradient boosting drives the elimination.
X_train.columns[rfe_model.get_support()]

Index(['mean texture', 'mean smoothness', 'mean compactness',
       'mean concave points', 'radius error', 'texture error', 'area error',
       'concavity error', 'worst radius', 'worst texture', 'worst perimeter',
       'worst area', 'worst smoothness', 'worst concavity',
       'worst concave points'],
      dtype='object')

In [73]:
# Count of kept features; should equal the n_features_to_select passed to RFE.
len(X_train.columns[rfe_model.get_support()])

15

In [74]:
# Keep only the RFE-selected columns in each split, using the currently fitted rfe_model.
X_train_rfe_transform, X_test_rfe_transform = (
    rfe_model.transform(part) for part in (X_train, X_test)
)

In [75]:
%%time
# Score a seeded random forest on the columns chosen by the gradient-boosting RFE.
Check_accuracy(X_train_rfe_transform,X_test_rfe_transform)

0.956140350877193
Wall time: 120 ms


In [76]:
# Confirm the reduced training matrix has one column per selected feature.
X_train_rfe_transform.shape

(455, 15)

### How to decide the optimal number of features

In [78]:
# Sweep every possible subset size and print the hold-out accuracy for each.
# NOTE(review): choosing the size by test-set accuracy lets the test split steer
# model selection; consider cross-validating on the training split instead
# (e.g. sklearn.feature_selection.RFECV).
n_total_features = X_train.shape[1]  # derived from the data rather than a hard-coded bound
for n_selected in range(1, n_total_features + 1):
    rfe_model = RFE(estimator=GradientBoostingClassifier(random_state=42),
                    n_features_to_select=n_selected)

    rfe_model.fit(X_train, y_train)
    X_train_rfe_transform = rfe_model.transform(X_train)
    X_test_rfe_transform = rfe_model.transform(X_test)
    print("Selected Featueres", n_selected)
    # Check_accuracy prints the score itself, so the call's result is not
    # bound to a name that nothing reads afterwards.
    Check_accuracy(X_train_rfe_transform, X_test_rfe_transform)

Selected Featueres 1
0.9035087719298246
Selected Featueres 2
0.9035087719298246
Selected Featueres 3
0.956140350877193
Selected Featueres 4
0.956140350877193
Selected Featueres 5
0.956140350877193
Selected Featueres 6
0.956140350877193
Selected Featueres 7
0.9649122807017544
Selected Featueres 8
0.9649122807017544
Selected Featueres 9
0.9736842105263158
Selected Featueres 10
0.956140350877193
Selected Featueres 11
0.956140350877193
Selected Featueres 12
0.956140350877193
Selected Featueres 13
0.9649122807017544
Selected Featueres 14
0.956140350877193
Selected Featueres 15
0.956140350877193
Selected Featueres 16
0.9649122807017544
Selected Featueres 17
0.9649122807017544
Selected Featueres 18
0.956140350877193
Selected Featueres 19
0.9649122807017544
Selected Featueres 20
0.9649122807017544
Selected Featueres 21
0.956140350877193
Selected Featueres 22
0.9649122807017544
Selected Featueres 23
0.9649122807017544
Selected Featueres 24
0.9649122807017544
Selected Featueres 25
0.956140350877

### Optimal No of Features = 9 

In [79]:
# Refit RFE at the subset size that scored best in the sweep above and re-check
# the accuracy. The size is named so the choice is visible and easy to change.
OPTIMAL_N_FEATURES = 9

rfe_model = RFE(estimator=GradientBoostingClassifier(random_state=42),
                n_features_to_select=OPTIMAL_N_FEATURES)
rfe_model.fit(X_train, y_train)

X_train_rfe_transform = rfe_model.transform(X_train)
X_test_rfe_transform = rfe_model.transform(X_test)
Check_accuracy(X_train_rfe_transform, X_test_rfe_transform)

0.9736842105263158


### THE END