In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import (
    f_regression,
    SelectKBest,
    SelectFromModel,
)

from sklearn.linear_model import Lasso

from feature_engine.wrappers import SklearnTransformerWrapper

In [3]:
# Read the separate files
train_df = pd.read_csv('../data/house-prices/train.csv')
test_df = pd.read_csv('../data/house-prices/test.csv')

# Separate features and target in training data
X_train = train_df.drop(['Id', 'SalePrice'], axis=1)
y_train = train_df['SalePrice']

# For test data, you might not have the target variable
X_test = test_df.drop(['Id'], axis=1)  # Note: test data might not have SalePrice column

print("X_train :", X_train.shape)
print("X_test :", X_test.shape)

X_train : (1460, 79)
X_test : (1459, 79)


## Select K Best

In [4]:
# variables to evaluate:

cols = [var for var in X_train.columns if X_train[var].dtypes !='O']

cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [5]:
# let's use select K best to select the best k variables

selector = SklearnTransformerWrapper(
    transformer = SelectKBest(f_regression, k=5),
    variables = cols)

selector.fit(X_train.fillna(0), y_train)

In [6]:
selector.transformer_.get_support(indices=True)

array([ 3, 11, 15, 25, 26])

In [7]:
# selecteed features

X_train.columns[selector.transformer_.get_support(indices=True)]

Index(['LotArea', 'Neighborhood', 'HouseStyle', 'MasVnrArea', 'ExterQual'], dtype='object')

In [8]:
# the transformer returns the selected variables from the list
# we passed to the transformer PLUS the remaining variables 
# in the dataframe that were not examined

X_train_t = selector.transform(X_train.fillna(0))
X_test_t = selector.transform(X_test.fillna(0))

In [9]:
X_test_t.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RH,Pave,0,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,...,1.0,730.0,TA,TA,Y,0,MnPrv,0,WD,Normal
1,RL,Pave,0,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,...,1.0,312.0,TA,TA,Y,0,0,Gar2,WD,Normal
2,RL,Pave,0,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,2.0,482.0,TA,TA,Y,0,MnPrv,0,WD,Normal
3,RL,Pave,0,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,2.0,470.0,TA,TA,Y,0,0,0,WD,Normal
4,RL,Pave,0,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,...,2.0,506.0,TA,TA,Y,0,0,0,WD,Normal


## SelectFromModel

In [10]:
# let's select the best variables according to Lasso

lasso = Lasso(alpha=10000, random_state=0)

sfm = SelectFromModel(lasso, prefit=False)

selector = SklearnTransformerWrapper(
    transformer = sfm,
    variables = cols)

selector.fit(X_train.fillna(0), y_train)

In [11]:
selector.transformer_.get_support(indices=True)

array([ 0,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 15, 24, 26, 27, 28, 29,
       30, 31, 32, 33])

In [12]:
len(selector.transformer_.get_support(indices=True))

21

In [13]:
len(cols)

36

In [14]:
# the transformer returns the selected variables from the list
# we passed to the transformer PLUS the remaining variables 
# in the dataframe that were not examined

X_train_t = selector.transform(X_train.fillna(0))
X_test_t = selector.transform(X_test.fillna(0))

In [15]:
X_test_t.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,SaleType,SaleCondition
0,20,RH,11622,Pave,0,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,120,0,0,MnPrv,0,0,WD,Normal
1,20,RL,14267,Pave,0,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,0,Gar2,12500,WD,Normal
2,60,RL,13830,Pave,0,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,MnPrv,0,0,WD,Normal
3,60,RL,9978,Pave,0,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,0,0,WD,Normal
4,120,RL,5005,Pave,0,IR1,HLS,AllPub,Inside,Gtl,...,0,0,144,0,0,0,0,0,WD,Normal
