In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso,LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

### 1. Classification Feature Selection

In [3]:
# Read the first 50,000 rows of the Kaggle paribas dataset and preview them.
data = pd.read_csv(
    '../datasets/paribas.csv',
    nrows=50000,
)
data.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,C,,9.191265,,,2.30163,...,,,0.598896,AF,,,1.957825,0,,
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,C,,,,,,...,,,,Z,,,,0,,


In [4]:
# In practice, feature selection should be done after data preprocessing:
# ideally all categorical variables are first encoded into numbers, and only
# then is their relationship with the other features assessed.
#
# For simplicity, only the numerical variables are used here.
numericals = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = data.select_dtypes(include=numericals).columns.tolist()
data = data.loc[:, numerical_vars]
data.shape

(50000, 114)

In [6]:
# Split into training (70%) and test (30%) sets to guard against overfitting.
# 'ID' and 'target' are removed from the feature matrix; 'target' is the label.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=['ID', 'target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)
x_train.shape, x_test.shape

((35000, 112), (15000, 112))

In [7]:
# Linear models benefit from feature scaling. The scaler is fitted on the
# training set only, with missing values filled with 0 beforehand.
scaler = StandardScaler()
scaler.fit(X=x_train.fillna(value=0))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Feature selection and model fitting are done together here.
# A logistic regression with the Lasso (L1) penalty is wrapped in
# SelectFromModel, which then exposes the features whose coefficients were
# not shrunk to zero.
#
# The solver is set explicitly: 'liblinear' supports the L1 penalty, whereas
# the default solver in newer scikit-learn releases ('lbfgs') does not and
# raises an error when combined with penalty='l1'.
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel_.fit(scaler.transform(x_train.fillna(0)), y_train)

SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [9]:
# Inspect which features the selector kept.
sel_.get_support()  # boolean mask: True marks a feature that is kept

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True, False,  True,  True,  True, False,
       False,  True,  True,  True,  True, False, False,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False, False, False,  True,  True,  True,  True, False, False,
       False, False,  True,  True,  True,  True,  True, False, False,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False, False,  True,  True, False,  True,
        True,  True,  True, False])

In [10]:
# Names of the selected features, followed by a summary of the selection.
# The removed count is taken from the selector's own boolean mask rather than
# from `coef_ == 0`: SelectFromModel drops features whose importance falls
# below its threshold (a small positive value for L1-penalised estimators),
# so the mask is the authoritative record and the three numbers always add up.
selec_features = x_train.columns[sel_.get_support()]
print('Total features :{}'.format(x_train.shape[1]))
print('Total selected features: {}'.format(len(selec_features)))
print('Total removed features: {}'.format(np.sum(~sel_.get_support())))

Total features :112
Total selected features: 79
Total removed features: 33


In [19]:
# Names of the removed features. Inverting the selector's boolean mask lists
# exactly the columns SelectFromModel dropped, independent of the shape of the
# underlying estimator's coef_ array.
x_train.columns[~sel_.get_support()]

Index(['v11', 'v20', 'v29', 'v33', 'v37', 'v41', 'v42', 'v48', 'v49', 'v53',
       'v55', 'v62', 'v63', 'v64', 'v65', 'v67', 'v68', 'v81', 'v86', 'v87',
       'v88', 'v94', 'v95', 'v96', 'v97', 'v103', 'v104', 'v106', 'v115',
       'v121', 'v122', 'v126', 'v131'],
      dtype='object')

In [21]:
# Reduce both splits to the selected columns (missing values filled with 0).
x_train_selected, x_test_selected = (
    sel_.transform(split.fillna(0)) for split in (x_train, x_test)
)

In [23]:
type(x_test_selected) # the result is a NumPy array, so it must be converted back to a DataFrame if column names are needed

numpy.ndarray

### Performing L2 regularisation

In [25]:
# For comparison of zero coefficients, fit a logistic regression with the
# Ridge (L2) penalty. It is trained on the scaled training data, the same
# input used for the L1 model, so the comparison is like for like, and the
# 'liblinear' solver (which supports both L1 and L2) is set explicitly so the
# result does not depend on the scikit-learn version's default solver.
lr = LogisticRegression(C=1, penalty='l2', solver='liblinear')
lr.fit(scaler.transform(x_train.fillna(0)), y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [26]:
# Number of coefficients that are exactly zero in the L2 model.
(lr.coef_ == 0).sum()

0

### 2. Regression Feature Selection

In [27]:
# For the regression problem, use the Kaggle house price prediction dataset.
data = pd.read_csv(filepath_or_buffer='../datasets/houseprice.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [30]:
# In practice, feature selection should be done after data preprocessing:
# ideally all categorical variables are first encoded into numbers, and only
# then is their relationship with the other features assessed.
#
# For simplicity, only the numerical variables are used here.
# `numericals` is the list of numeric dtypes defined in the classification
# section above.
numeric_col = data.select_dtypes(include=numericals).columns.tolist()
data = data.loc[:, numeric_col]
data.shape

(1460, 38)

In [37]:
# Split into training (70%) and test (30%) sets.
# 'Id' and 'SalePrice' are removed from the feature matrix; 'SalePrice' is
# the regression target.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)
x_train.shape, x_test.shape

((1022, 36), (438, 36))

In [32]:
# The features in the house price dataset are on very different scales,
# so they are standardised. The scaler is fitted on the training set only,
# with missing values filled with 0 beforehand.
scaler = StandardScaler()
scaler.fit(X=x_train.fillna(value=0))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [38]:
# As in the classification case, train a Lasso linear model and select the
# features with non-zero coefficients.
#
# The linear regression object does not support L1 regularisation, so the
# separate `Lasso` estimator is used instead. A high value of alpha (the
# penalty term) is chosen here so that as few features as possible are kept.
sel_ = SelectFromModel(estimator=Lasso(alpha=100))
sel_.fit(X=scaler.transform(x_train.fillna(0)), y=y_train)

SelectFromModel(estimator=Lasso(alpha=100, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [39]:
# Again, inspect the boolean mask of selected features (True marks a kept feature).
sel_.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True])

In [44]:
# Names of the selected features, followed by a summary of the selection.
# The removed count is taken from the selector's own boolean mask rather than
# from `coef_ == 0`: SelectFromModel drops features whose importance falls
# below its threshold (a small positive value for L1-penalised estimators),
# so the mask is the authoritative record and the three numbers always add up.
select_feat = x_train.columns[sel_.get_support()]
print('Total features :{}'.format(x_train.shape[1]))
print('Total selected features: {}'.format(len(select_feat)))
print('Total removed features: {}'.format(np.sum(~sel_.get_support())))

Total features :36
Total selected features: 33
Total removed features: 3


Conclusion - <br>
Hence, we have implemented Lasso regularisation for feature selection in both classification and regression problems. Note that if the penalty is set too high, important features may also be removed; this shows up as a drop in the performance of the model.

For better results, hyperparameter tuning can be done along with k-fold cross-validation to decide the optimal value of the penalty.