# Iteration 7: Feature Selection

## Load data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#df = pd.read_csv('./data/housing_iteration_3_classification.csv')
df = pd.read_csv('./data/housing_iteration_6_regression.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
for col in df.select_dtypes(exclude="number").columns:
    #print(df[col].value_counts())
    pass

## Train-test split

In [5]:
# define X and y
id = df.pop('Id')
y = df.pop('SalePrice')
X = df.copy()
#X = X.drop(['Alley', 'PoolQC', 'MiscFeature'], axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                   random_state=42)

In [7]:
#X_train.info()

## Preprocessing

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = Pipeline(
    steps=[('num_imputer', SimpleImputer(strategy='median')), 
           ('num_scaler', MinMaxScaler())
          ])

 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = Pipeline(
    steps=[('cat_imputer', SimpleImputer(strategy='constant', 
                                         fill_value='na')), 
           ('cat_encoder', OneHotEncoder(drop='first', 
                                         sparse_output=False, 
                                         handle_unknown='ignore'))
           #('cat_encoder', OrdinalEncoder())
          ])

In [30]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, X_num_columns), 
        ('cat', categoric_pipe, X_cat_columns)
    ],
    #remainder='passthrough'
).set_output(transform='pandas')

In [31]:
preprocessor

In [37]:
# Fit and transform the training data using the column transformer
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data using the fitted column transformer
X_test_transformed = preprocessor.transform(X_test)



In [13]:
X_train_transformed

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__SaleType_ConLI,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
254,0.000000,0.167808,0.033186,0.444444,0.625,0.615942,0.116667,0.000000,0.163359,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1066,0.235294,0.130137,0.030555,0.555556,0.750,0.876812,0.733333,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
638,0.058824,0.157534,0.034948,0.444444,0.750,0.275362,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
799,0.176471,0.133562,0.027577,0.444444,0.750,0.471014,0.000000,0.182874,0.100815,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
380,0.176471,0.099315,0.017294,0.444444,0.625,0.376812,0.000000,0.000000,0.038625,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.000000,0.195205,0.037472,0.555556,0.500,0.971014,0.933333,0.000000,0.004252,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1130,0.176471,0.150685,0.030400,0.333333,0.250,0.405797,0.000000,0.000000,0.110206,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1294,0.000000,0.133562,0.032120,0.444444,0.750,0.601449,0.666667,0.000000,0.029589,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
860,0.176471,0.116438,0.029643,0.666667,0.875,0.333333,0.800000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Baseline Model

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

baseline_tree = DecisionTreeRegressor()
baseline_tree.fit(X_train_transformed, y_train)

baseline_knn = KNeighborsRegressor(n_neighbors=1)
baseline_knn.fit(X_train_transformed, y_train)

In [15]:
baseline_tree_pred = baseline_tree.predict(X_test_transformed)
baseline_knn_pred = baseline_knn.predict(X_test_transformed)

In [16]:
baseline_tree_r2 = r2_score(y_test, baseline_tree_pred)
baseline_knn_r2 = r2_score(y_test, baseline_knn_pred)

performances = pd.DataFrame({'decision_tree': baseline_tree_r2,
                             'knn': baseline_knn_r2},
                            index=['baseline'])

performances

Unnamed: 0,decision_tree,knn
baseline,0.792455,0.748055


## Manual Feature Selection

### K Best

In [17]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel

# Run the univariate (= one feature at a time) linear regression tests.
f_stat, _ = f_regression(X_train_transformed, y_train)

# Create a DataFrame with the f-statistics and p-values.
f_test = pd.DataFrame({"f_stat":f_stat},
                      index=X_train_transformed.columns)

# Sort the values in the f-statistic column.
f_test.sort_values("f_stat", ascending=False)

Unnamed: 0,f_stat
num__OverallQual,1879.151818
num__GrLivArea,1093.390361
num__GarageCars,813.186816
num__GarageArea,744.064301
num__TotalBsmtSF,648.289096
...,...
cat__Neighborhood_SawyerW,0.007409
cat__Foundation_Stone,0.006806
cat__Condition1_RRNe,0.005176
cat__GarageCond_Gd,0.003468


In [18]:
# Initialise KBest, using the f-regression and setting k=10.
KBest = SelectKBest(score_func=f_regression, k=10)

# Transform the train set.
X_train_KBest = KBest.fit_transform(X_train_transformed, y_train).copy()

# Transform the test set.
X_test_KBest = KBest.transform(X_test_transformed).copy()

In [19]:
X_train_KBest.shape, X_test_KBest.shape

((1168, 10), (292, 10))

In [20]:
# Decision tree.
k10_tree = DecisionTreeRegressor()
k10_tree.fit(X_train_KBest, y_train)
k10_tree_pred = k10_tree.predict(X_test_KBest)

# K-Nearest Neighbors.
k10_knn = KNeighborsRegressor(n_neighbors=1)
k10_knn.fit(X_train_KBest, y_train)
k10_knn_pred = k10_knn.predict(X_test_KBest)

performances.loc["KBest_10", "decision_tree"]= r2_score(y_test, k10_tree_pred)
performances.loc["KBest_10", "knn"] = r2_score(y_test, k10_knn_pred)

performances

Unnamed: 0,decision_tree,knn
baseline,0.792455,0.748055
KBest_10,0.819368,0.721164


### Recursive Feature Elimination

In [21]:
#rfe_tree = RFECV(DecisionTreeRegressor())
#rfe_tree.fit(X_train_transformed, y_train)
#rfe_tree_pred = rfe_tree.predict(X_test_transformed)

#performances.loc["RFE", "decision_tree"] = r2_score(y_test, rfe_tree_pred)

performances

Unnamed: 0,decision_tree,knn
baseline,0.792455,0.748055
KBest_10,0.819368,0.721164


In [22]:
#rfe_tree.get_feature_names_out()

### Select from model

In [23]:
select_model_tree = SelectFromModel(DecisionTreeRegressor(),
                                    threshold=None)

In [24]:
# Transform the train set.
X_train_selected_model_tree = select_model_tree.fit_transform(X_train_transformed, y_train)

# Transform the test set.
X_test_selected_model_tree = select_model_tree.transform(X_test_transformed)

# Show shape of the train and test dataset to check for the number of features kept.
print(X_train_selected_model_tree.shape, X_test_selected_model_tree.shape)

select_model_tree.get_feature_names_out()

(1168, 17) (292, 17)


array(['num__LotFrontage', 'num__LotArea', 'num__OverallQual',
       'num__OverallCond', 'num__YearBuilt', 'num__BsmtFinSF1',
       'num__TotalBsmtSF', 'num__1stFlrSF', 'num__2ndFlrSF',
       'num__GrLivArea', 'num__GarageYrBlt', 'num__GarageCars',
       'num__GarageArea', 'num__MoSold', 'cat__BsmtExposure_No',
       'cat__CentralAir_Y', 'cat__GarageType_Detchd'], dtype=object)

In [25]:
# Decision tree.
select_model_tree = DecisionTreeRegressor()
select_model_tree.fit(X_train_selected_model_tree, y_train)
select_model_tree_pred = select_model_tree.predict(X_test_selected_model_tree)

# K-Nearest Neighbors.
select_model_knn = KNeighborsRegressor(n_neighbors=1)
select_model_knn.fit(X_train_selected_model_tree, y_train)
select_model_knn_pred = select_model_knn.predict(X_test_selected_model_tree)

performances.loc["model_selected", "decision_tree"] = r2_score(y_test, select_model_tree_pred)
performances.loc["model_selected", "knn"] = r2_score(y_test, select_model_knn_pred)

performances

Unnamed: 0,decision_tree,knn
baseline,0.792455,0.748055
KBest_10,0.819368,0.721164
model_selected,0.793912,0.782365


## Build Pipeline

In [32]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression)),
    #('feature_selection', RFECV(estimator=DecisionTreeRegressor())),
    #('feature_selection', SelectFromModel(DecisionTreeRegressor())),
    ('regressor', DecisionTreeRegressor())
     ])


In [33]:
param_grid = {
    'feature_selection__k': range(2, 20),
    #'feature_selection__threshold': ['mean', 'median', '1.25*mean'],
    'regressor__max_depth': range(3, 20)
}


In [34]:
from sklearn.model_selection import GridSearchCV

# Create GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)




In [35]:
# Evaluate the best model on the test data
score = grid_search.score(X_test, y_test)
print("Best parameters:", grid_search.best_params_)
print("Best score on test data:", score)

Best parameters: {'feature_selection__k': 8, 'regressor__max_depth': 6}
Best score on test data: 0.8382668216874851


