# Feature Engineering

This notebook is dedicated to engineering the cleaned features created in the previous notebook.


### Imports


In [1]:
# Library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.stats import pearsonr

import pickle

In [2]:
# data + variable imports
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# train / test dataframes (per the intro, the cleaned output of the previous notebook)
df = _load_pickle('pickles/df.pkl')
df_test = _load_pickle('pickles/df_test.pkl')

# corr_list is used below as a list of column labels to keep
corr_list = _load_pickle('pickles/corr_list.pkl')

#### Standardizing Continuous Features (Using StandardScaler)

The remaining features will be numeric (continuous) features, which I will be analyzing to determine if specific features would benefit from scaling. There are 39 features within this dataframe and I will preliminarily look into the top 10 continuous features (by correlation with sale price).

In [3]:
# Finding continuous features
# Keep only the numeric columns whose names appear in corr_list.

#train
df_continuous = df.select_dtypes(include=[np.number])
df_continuous = df_continuous.loc[:, df_continuous.columns.isin(corr_list)]

#test
df_test_continuous = df_test.select_dtypes(include=[np.number])
df_test_continuous = df_test_continuous.loc[:, df_test_continuous.columns.isin(corr_list)]

# setting up Standard Scaler
# The scaler is fitted on the training data only; the test data is transformed
# with the statistics learned from train.
scaler = StandardScaler()
#train
df_scaled = scaler.fit_transform(df_continuous)
#test
df_test_scaled = scaler.transform(df_test_continuous)

# returning scaled df
# By default the scaler returns plain NumPy arrays, so the row labels are lost.
# Re-attach the original index: without it the frames get a fresh 0..n-1 index,
# and the index-aligned pd.concat with the encoded frames later in the notebook
# can pair up the wrong rows and introduce NaN rows whenever df / df_test do not
# carry a clean 0..n-1 index.
#train
df_scaled = pd.DataFrame(
    df_scaled,
    columns=df_continuous.columns,
    index=df_continuous.index,
)

#test
df_scaled_test = pd.DataFrame(
    df_test_scaled,
    columns=df_test_continuous.columns,
    index=df_test_continuous.index,
)

# Pulling only corr_list columns (this also puts the columns in corr_list order)
df_scaled_final = df_scaled[corr_list]
df_scaled_test_final = df_scaled_test[corr_list]

df_scaled = df_scaled_final
df_test_scaled = df_scaled_test_final

# The selection above can yield every column twice; keep only the last copy of each.
# NOTE(review): presumably corr_list contains repeated names (selecting with a list
# that repeats a label repeats the column) — confirm and de-duplicate at the source.
df_scaled = df_scaled.loc[:, ~df_scaled.columns.duplicated(keep='last')]
df_test_scaled = df_test_scaled.loc[:, ~df_test_scaled.columns.duplicated(keep='last')]

#### One-Hot Encoding Categorical Features (using get_dummies)

Since there are a large number of discrete features, I'll need to one-hot encode them using `pd.get_dummies()` to be able to include them in my model. Below I extract a dataframe that includes all of the non-numerical features (discrete data), which are good candidates for one-hot encoding.



In [4]:
# Split off the non-numeric (categorical) columns of the train and test data.
non_numeric = dict(exclude=[np.number])
categorical_train = df.select_dtypes(**non_numeric)
categorical_test = df_test.select_dtypes(**non_numeric)

# Names of the categorical columns to encode (taken from the training data).
cat_cols = list(categorical_train.columns)

# One-hot encode each dataset separately with get_dummies.
train_encoded, test_encoded = (
    pd.get_dummies(frame, columns=cat_cols)
    for frame in (categorical_train, categorical_test)
)

# Give the test frame exactly the training dummy columns: dummies that exist only
# in train are added filled with 0, dummies that exist only in test are dropped.
test_encoded = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)

# Full list of encoded column names (pickled at the end of the notebook).
cat_cols_encoded = list(train_encoded.columns)

#### Selecting Important Categorical Features (Pearson r)

To prevent the model from being drastically overfit, I want to evaluate a subset of the categorical features. This process will be similar to how we previously evaluated the continuous features by shortening to corr_list.

Since these features are now one-hot encoded (0/1) indicators, I use the Pearson correlation coefficient to measure each encoded feature's relationship with the continuous target, `saleprice`.

In [5]:
# Since we separated the features into two bins, we inadvertently removed the
# saleprice column. Let's add that back in so that we can calculate correlations.
price_col = df['saleprice']
train_encoded['saleprice'] = price_col

# Categorical Feature correlation with 'saleprice'
# Every column except the last one (the 'saleprice' column added above) is an
# encoded feature.
correlations = {}
for feature in train_encoded.columns[:-1]:
    correlation, _ = pearsonr(train_encoded[feature], train_encoded['saleprice'])
    correlations[feature] = correlation

# Sort the features by absolute correlation, strongest first.
# pearsonr returns NaN for a constant column, and NaN keys make sorted() produce an
# unreliable order. The (is-NaN, -|r|) key pushes NaN entries to the end; when no
# NaN is present the order is the same as sorting on abs() with reverse=True.
sorted_features = sorted(
    correlations.items(),
    key=lambda item: (np.isnan(item[1]), -abs(item[1])),
)

display(sorted_features)

# Print the sorted feature correlations
for feature, correlation in sorted_features:
    print(f'{feature}: {correlation:.3f}')


[('exter qual_TA', -0.6001947313173065),
 ('bsmt qual_Ex', 0.5865525781948304),
 ('kitchen qual_Ex', 0.551342268575298),
 ('kitchen qual_TA', -0.5405892695105362),
 ('foundation_PConc', 0.5301154578544454),
 ('exter qual_Ex', 0.4939395624118063),
 ('bsmtfin type 1_GLQ', 0.46337350027547536),
 ('bsmt qual_TA', -0.4578014762246721),
 ('heating qc_Ex', 0.4529038027606933),
 ('neighborhood_NridgHt', 0.44867813233560505),
 ('exter qual_Gd', 0.44647108299226795),
 ('garage finish_Unf', -0.43179473448119443),
 ('mas vnr type_None', -0.423809440998774),
 ('garage finish_Fin', 0.4227804380321084),
 ('fireplace qu_Gd', 0.38455206084428356),
 ('bsmt exposure_Gd', 0.3769865763228183),
 ('garage type_Detchd', -0.369689565312797),
 ('sale type_New', 0.3580719424085858),
 ('garage type_Attchd', 0.35739203268498),
 ('foundation_CBlock', -0.35628131560526183),
 ('heating qc_TA', -0.34290066130666963),
 ('exterior 1st_VinylSd', 0.3418645562982632),
 ('exterior 2nd_VinylSd', 0.33728025452594007),
 ('mas 

exter qual_TA: -0.600
bsmt qual_Ex: 0.587
kitchen qual_Ex: 0.551
kitchen qual_TA: -0.541
foundation_PConc: 0.530
exter qual_Ex: 0.494
bsmtfin type 1_GLQ: 0.463
bsmt qual_TA: -0.458
heating qc_Ex: 0.453
neighborhood_NridgHt: 0.449
exter qual_Gd: 0.446
garage finish_Unf: -0.432
mas vnr type_None: -0.424
garage finish_Fin: 0.423
fireplace qu_Gd: 0.385
bsmt exposure_Gd: 0.377
garage type_Detchd: -0.370
sale type_New: 0.358
garage type_Attchd: 0.357
foundation_CBlock: -0.356
heating qc_TA: -0.343
exterior 1st_VinylSd: 0.342
exterior 2nd_VinylSd: 0.337
mas vnr type_Stone: 0.310
lot shape_Reg: -0.307
kitchen qual_Gd: 0.306
bsmt exposure_No: -0.294
paved drive_Y: 0.290
ms zoning_RM: -0.281
central air_N: -0.278
central air_Y: 0.278
lot shape_IR1: 0.275
paved drive_N: -0.274
garage cond_TA: 0.272
roof style_Hip: 0.265
neighborhood_NoRidge: 0.263
mas vnr type_BrkFace: 0.258
neighborhood_StoneBr: 0.257
electrical_SBrkr: 0.253
garage qual_TA: 0.250
roof style_Gable: -0.249
fireplace qu_Ex: 0.235
m

As we can see, the encoded categorical features are individually less correlated with sale price than the continuous features. We'll select the top 6 features to evaluate.

In [6]:
# Feature names only, in the order of sorted_features.
# NOTE(review): this keeps EVERY feature, not a top-N subset, so the later
# `columns.isin(corr_cat_list)` filter cannot narrow the encoded columns — confirm
# whether a slice (e.g. the top 6 mentioned in the text) was intended.
corr_cat_list = [name for name, _ in sorted_features]

In [16]:
# Peek at the first five entries of the list.
corr_cat_list[:5]

['exter qual_TA',
 'bsmt qual_Ex',
 'kitchen qual_Ex',
 'kitchen qual_TA',
 'foundation_PConc']

In [7]:
# Restrict both encoded frames to the columns named in corr_cat_list.

#train
keep_train = train_encoded.columns.isin(corr_cat_list)
train_encoded = train_encoded.loc[:, keep_train]

#test
keep_test = test_encoded.columns.isin(corr_cat_list)
test_encoded = test_encoded.loc[:, keep_test]

##### Combined Result Dataframe

Now that we've standardized the continuous features and encoded the categorical features, we'll combine the scaled and encoded dataframes together.

In [8]:
# Combine the scaled continuous features with the encoded categorical features.

#train
df = pd.concat([df_scaled, train_encoded], axis=1)
df['saleprice'] = price_col

#test
# Use df_test_scaled, the test counterpart of df_scaled (corr_list-ordered and
# de-duplicated). df_scaled_test is only the intermediate frame from before that
# selection, so concatenating it could give the test data a different column
# order / duplicate columns compared to the training data.
df_test = pd.concat([df_test_scaled, test_encoded], axis=1)

In [17]:
# First five rows of the combined test frame (head() defaults to 5 rows).
df_test.head()

Unnamed: 0,overall qual,total bsmt sf,1st flr sf,gr liv area,garage cars,garage area,ms zoning_A (agr),ms zoning_C (all),ms zoning_FV,ms zoning_I (all),...,misc feature_TenC,sale type_COD,sale type_CWD,sale type_Con,sale type_ConLD,sale type_ConLI,sale type_ConLw,sale type_New,sale type_Oth,sale type_WD
0,-0.07939,-0.084549,-0.64737,0.856288,-1.01651,-0.155188,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,-0.780895,2.023169,2.024157,0.934246,0.2918,0.492681,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.622115,-0.899147,-1.262906,-0.007245,0.2918,-0.219975,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,-0.780895,-0.200284,-0.496008,-1.062674,0.2918,0.029917,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,-0.07939,0.747855,0.578657,-0.211135,0.2918,0.187257,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### Last removal of missing values

It's possible that some missing values were created during the processing phase. I'll use this space to ensure no missing values enter the model.


In [9]:
# Number of NaN values in each column of the combined training frame
num_missing_per_col = df.isnull().sum()

# Sub-frame holding only the columns that contain at least one NaN
cols_with_missing_values = df.loc[:, num_missing_per_col > 0]

#train
# Drop the last row (it was showing up as zeros), then replace any remaining NaN with 0.
# NOTE(review): the drop is unconditional (by the label of the final row) — confirm
# the final row really is a junk row and not a valid observation.
df = df.drop(index=df.index[-1]).fillna(0)

#test
df_test = df_test.fillna(0)

In [12]:
# First five rows of the test frame after filling NaN values (head() defaults to 5 rows).
df_test.head()

Unnamed: 0,overall qual,total bsmt sf,1st flr sf,gr liv area,garage cars,garage area,ms zoning_A (agr),ms zoning_C (all),ms zoning_FV,ms zoning_I (all),...,misc feature_TenC,sale type_COD,sale type_CWD,sale type_Con,sale type_ConLD,sale type_ConLI,sale type_ConLw,sale type_New,sale type_Oth,sale type_WD
0,-0.07939,-0.084549,-0.64737,0.856288,-1.01651,-0.155188,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,-0.780895,2.023169,2.024157,0.934246,0.2918,0.492681,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.622115,-0.899147,-1.262906,-0.007245,0.2918,-0.219975,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,-0.780895,-0.200284,-0.496008,-1.062674,0.2918,0.029917,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,-0.07939,0.747855,0.578657,-0.211135,0.2918,0.187257,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
# Re-check after the fill: number of NaN values in each column
num_missing_per_col = df.isna().sum()

# Sub-frame holding only the columns that still contain NaN values
has_missing = num_missing_per_col.gt(0)
cols_with_missing_values = df.loc[:, has_missing]

In [20]:
# Names of the columns that still contain NaN values (an empty Index means none remain).
cols_with_missing_values.columns

Index([], dtype='object')

In [21]:
# Exporting the engineered dataframes, the column-name lists and the fitted scaler.
# NOTE(review): 'pickles/df.pkl' and 'pickles/df_test.pkl' are the same paths this
# notebook loads from at the top, so running the notebook a second time would read
# the already-engineered frames — consider separate output file names.
exports = {
    'pickles/cat_cols_encoded.pkl': cat_cols_encoded,
    'pickles/cat_cols.pkl': cat_cols,
    'pickles/df.pkl': df,
    'pickles/df_test.pkl': df_test,
    'pickles/scaler.pkl': scaler,
}

# Write each object to its pickle file, in the order listed above.
for path, obj in exports.items():
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)