# Feature Selection and Elimination
> Here we will walk through the use of Shapash and Scikit-Learn for feature selection 

In [None]:
import pandas as pd
from category_encoders import OrdinalEncoder
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

In [246]:
from shapash.data.data_loader import data_loading
house_df, house_dict = data_loading('house_prices')

y_df=house_df['SalePrice'].to_frame()
X_df=house_df[house_df.columns.difference(['SalePrice'])]

In [247]:
house_dict

{'MSSubClass': 'Building Class',
 'MSZoning': 'General zoning classification',
 'LotArea': 'Lot size square feet',
 'Street': 'Type of road access',
 'LotShape': 'General shape of property',
 'LandContour': 'Flatness of the property',
 'Utilities': 'Type of utilities available',
 'LotConfig': 'Lot configuration',
 'LandSlope': 'Slope of property',
 'Neighborhood': 'Physical locations within Ames city limits',
 'Condition1': 'Proximity to various conditions',
 'Condition2': 'Proximity to other various conditions',
 'BldgType': 'Type of dwelling',
 'HouseStyle': 'Style of dwelling',
 'OverallQual': 'Overall material and finish of the house',
 'OverallCond': 'Overall condition of the house',
 'YearBuilt': 'Original construction date',
 'YearRemodAdd': 'Remodel date',
 'RoofStyle': 'Type of roof',
 'RoofMatl': 'Roof material',
 'Exterior1st': 'Exterior covering on house',
 'Exterior2nd': 'Other exterior covering on house',
 'MasVnrType': 'Masonry veneer type',
 'MasVnrArea': 'Masonry venee

# Encoding categorical features

In [248]:
from category_encoders import OrdinalEncoder

X_df=house_df[house_df.columns.difference(['SalePrice'])]
categorical_features = [col for col in X_df.columns if X_df[col].dtype == 'object']

encoder = OrdinalEncoder(
    cols=categorical_features,
    handle_unknown='ignore',
    return_df=True).fit(X_df)

X_df=encoder.transform(X_df)


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



In [249]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X_df, y_df, train_size=0.75, random_state=1)

regressor = LGBMRegressor(n_estimators=200).fit(Xtrain,ytrain)
y_pred = pd.DataFrame(regressor.predict(Xtest),columns=['pred'],index=Xtest.index)

# Initiate smart explainer

In [250]:
from shapash.explainer.smart_explainer import SmartExplainer
xpl = SmartExplainer(features_dict=house_dict)

xpl.compile(
    x=Xtest,
    model=regressor,
    preprocessing=encoder, # Optional: compile step can use inverse_transform method
    y_pred=y_pred # Optional
)

Backend: Shap TreeExplainer


In [251]:
xpl.plot.features_importance()

In [252]:
xpl.plot.contribution_plot("OverallQual")

In [253]:
xpl.plot.contribution_plot("GrLivArea")

In [254]:
xpl.plot.contribution_plot("HouseStyle")

# Converting features and target to numpy array

In [159]:
arr_features = X_df.to_numpy()
arr_target = house['SalePrice'].to_numpy()

In [63]:
from sklearn.feature_selection import RFE

clf = LGBMRegressor(n_estimators=200)
clf.fit(arr_features, arr_target)

# rfe = RFE(estimator=LGBMRegressor(n_estimators=200))
# rfe.fit(arr_features, arr_target)

LGBMRegressor(n_estimators=200)

# Get True/False for column selection

In [255]:
rfe.support_

array([ True,  True, False,  True, False, False,  True,  True, False,
        True, False,  True, False,  True,  True, False, False, False,
       False, False, False, False,  True,  True,  True, False, False,
       False,  True, False,  True, False, False,  True,  True, False,
       False,  True,  True, False, False, False, False,  True,  True,
        True, False,  True, False,  True, False, False,  True,  True,
        True,  True,  True, False, False, False,  True,  True, False,
       False, False,  True,  True, False,  True,  True,  True,  True])

# These columns were selected

In [256]:
house_full.columns[rfe.support_]

Index(['1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'BsmtExposure', 'BsmtFinSF1',
       'BsmtFinType1', 'BsmtFullBath', 'BsmtQual', 'BsmtUnfSF', 'Exterior1st',
       'Exterior2nd', 'Fireplaces', 'GarageArea', 'GarageFinish',
       'GarageYrBlt', 'GrLivArea', 'HeatingQC', 'HouseStyle', 'LotArea',
       'LotConfig', 'LotShape', 'MSSubClass', 'MasVnrArea', 'MoSold',
       'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual',
       'RoofStyle', 'SaleCondition', 'TotRmsAbvGrd', 'TotalBsmtSF',
       'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'YrSold'],
      dtype='object')

In [257]:
rfe.ranking_ 

array([ 1,  1, 23,  1,  7, 10,  1,  1, 17,  1, 19,  1, 26,  1,  1, 27,  8,
       28, 14, 11,  5, 16,  1,  1,  1,  6, 22, 21,  1, 29,  1, 25,  4,  1,
        1,  9, 32,  1,  1, 31, 12, 20, 15,  1,  1,  1, 35,  1,  3,  1,  2,
       30,  1,  1,  1,  1,  1, 18, 36, 34,  1,  1, 13, 24, 33,  1,  1, 37,
        1,  1,  1,  1])

In [258]:
for i in range(house_full.shape[1]):
#     if rfe.ranking_[i] ==1:
        print('Column: {}, {} Selected: {}, Rank: {}'.format(i, house_full.columns[i], rfe.support_[i], rfe.ranking_[i]))

Column: 0, 1stFlrSF Selected: True, Rank: 1
Column: 1, 2ndFlrSF Selected: True, Rank: 1
Column: 2, 3SsnPorch Selected: False, Rank: 23
Column: 3, BedroomAbvGr Selected: True, Rank: 1
Column: 4, BldgType Selected: False, Rank: 7
Column: 5, BsmtCond Selected: False, Rank: 10
Column: 6, BsmtExposure Selected: True, Rank: 1
Column: 7, BsmtFinSF1 Selected: True, Rank: 1
Column: 8, BsmtFinSF2 Selected: False, Rank: 17
Column: 9, BsmtFinType1 Selected: True, Rank: 1
Column: 10, BsmtFinType2 Selected: False, Rank: 19
Column: 11, BsmtFullBath Selected: True, Rank: 1
Column: 12, BsmtHalfBath Selected: False, Rank: 26
Column: 13, BsmtQual Selected: True, Rank: 1
Column: 14, BsmtUnfSF Selected: True, Rank: 1
Column: 15, CentralAir Selected: False, Rank: 27
Column: 16, Condition1 Selected: False, Rank: 8
Column: 17, Condition2 Selected: False, Rank: 28
Column: 18, Electrical Selected: False, Rank: 14
Column: 19, EnclosedPorch Selected: False, Rank: 11
Column: 20, ExterCond Selected: False, Rank: 5


# Get coefficient when rfe is not used and only the estimator is used

# clf = LGBMRegressor(n_estimators=200)
# clf.fit(arr_features, arr_target) -->

# rfe = RFE(estimator=LGBMRegressor(n_estimators=200))
# rfe.fit(arr_features, arr_target)

In [64]:
import numpy as np
coeff = np.absolute(clf.feature_importances_)
coeff

array([391, 190,   7,  32,   3,  11,  50, 360,  32,  39,  11,  21,   7,
        33, 248,  19,  46,   0,   8,  55,   8,  41,  86,  62,  35,  12,
        31,  21, 343,   5,  40,   4,  28, 179, 455,  16,   2,  20,  24,
        10,  44,  39,   5, 466,  34,  26,   8,  45,  43, 227,  25,   0,
       141, 163, 290,  96, 166,  15,   0,  12,  14,  67,  18,  28,   0,
        86, 416,   0, 153, 139, 148, 101])

# Use of Scikit-Learn in iris data

In [160]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, SelectPercentile
X, y = load_iris(return_X_y=True)
X.shape

(150, 4)

In [161]:
X = pd.DataFrame(X)
y = pd.DataFrame(y)
X.columns = ['sepal length','sepal width','petal length','petal width']
y.columns = ['target']

In [162]:
X_new = SelectKBest(chi2, k=2).fit(X, y)
feature_ids = X_new.get_support(indices=True)
feature_names = X.columns[feature_ids]
feature_names 

Index(['petal length', 'petal width'], dtype='object')

In [163]:
X.describe()

Unnamed: 0,sepal length,sepal width,petal length,petal width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


# Housing data

In [136]:
X_new = SelectKBest(chi2, k=10).fit(X_df, y_df)
feature_ids = X_new.get_support(indices=True)
feature_names = X_df.columns[feature_ids]
feature_names 

Index(['2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'GrLivArea',
       'LotArea', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'PoolArea'],
      dtype='object')

# Use of Shapash

In [238]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.75, random_state=1)
regressor = LGBMRegressor(n_estimators=200).fit(Xtrain,ytrain)
y_pred = pd.DataFrame(regressor.predict(Xtest),columns=['pred'],index=Xtest.index)

In [239]:
from shapash.explainer.smart_explainer import SmartExplainer
xpl = SmartExplainer()

xpl.compile(
    x=Xtest,
    model=regressor,
#     preprocessing=encoder, # Optional: compile step can use inverse_transform method
    y_pred=y_pred # Optional
)
xpl.plot.features_importance()

Backend: Shap TreeExplainer


In [240]:
xpl.plot.contribution_plot("petal length")

In [237]:
y_pred

Unnamed: 0,pred
14,-0.016183
98,0.798744
75,1.096621
16,0.027792
131,1.973971
56,0.983864
141,1.894806
44,0.334409
29,-0.063523
120,1.973971


In [233]:
xpl.plot.local_plot(index=128)

In [243]:
xpl.plot.local_plot(index=146)