#  Preprocessing and Feature Engineering

In [1]:
# Import libraries

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

In [2]:
#  Load the cleaned training and test sets from ./output

df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./output/train_cleaned.csv', './output/test_cleaned.csv')
)

In [3]:
#  Inspect the training data: column names, non-null counts and dtypes.
#  The frame still contains the target column at this point.

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 78 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ms_zoning        2051 non-null   object 
 1   lot_frontage     2051 non-null   float64
 2   lot_area         2051 non-null   int64  
 3   street           2051 non-null   object 
 4   lot_shape        2051 non-null   int64  
 5   land_contour     2051 non-null   object 
 6   utilities        2051 non-null   object 
 7   lot_config       2051 non-null   object 
 8   land_slope       2051 non-null   int64  
 9   neighborhood     2051 non-null   object 
 10  condition_1      2051 non-null   object 
 11  condition_2      2051 non-null   object 
 12  bldg_type        2051 non-null   object 
 13  house_style      2051 non-null   object 
 14  overall_qual     2051 non-null   int64  
 15  overall_cond     2051 non-null   int64  
 16  year_built       2051 non-null   int64  
 17  year_remod/add

In [4]:
#  Same inspection for the test data (column names, non-null counts, dtypes)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 77 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ms_zoning        878 non-null    object 
 1   lot_frontage     878 non-null    float64
 2   lot_area         878 non-null    int64  
 3   street           878 non-null    object 
 4   lot_shape        878 non-null    int64  
 5   land_contour     878 non-null    object 
 6   utilities        878 non-null    object 
 7   lot_config       878 non-null    object 
 8   land_slope       878 non-null    int64  
 9   neighborhood     878 non-null    object 
 10  condition_1      878 non-null    object 
 11  condition_2      878 non-null    object 
 12  bldg_type        878 non-null    object 
 13  house_style      878 non-null    object 
 14  overall_qual     878 non-null    int64  
 15  overall_cond     878 non-null    int64  
 16  year_built       878 non-null    int64  
 17  year_remod/add  

In [5]:
#  NOTE(review): this cell repeats the df_test.info() call of the previous
#  cell and adds no new information — consider deleting it.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 77 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ms_zoning        878 non-null    object 
 1   lot_frontage     878 non-null    float64
 2   lot_area         878 non-null    int64  
 3   street           878 non-null    object 
 4   lot_shape        878 non-null    int64  
 5   land_contour     878 non-null    object 
 6   utilities        878 non-null    object 
 7   lot_config       878 non-null    object 
 8   land_slope       878 non-null    int64  
 9   neighborhood     878 non-null    object 
 10  condition_1      878 non-null    object 
 11  condition_2      878 non-null    object 
 12  bldg_type        878 non-null    object 
 13  house_style      878 non-null    object 
 14  overall_qual     878 non-null    int64  
 15  overall_cond     878 non-null    int64  
 16  year_built       878 non-null    int64  
 17  year_remod/add  

In [6]:
#  Still no nulls: count missing values per column, then tally those counts
#  (a single entry keyed 0 means every column is complete)

df_train.isna().sum().value_counts()

0    78
dtype: int64

In [7]:
#  Same per-column missing-value tally for the test set
df_test.isna().sum().value_counts()

0    77
dtype: int64

In [8]:
#  'saleprice' is the target; every other column is kept as a feature

y = df_train['saleprice']
X = df_train.drop(columns=['saleprice'])

In [9]:
#  Hold out a validation set.
#  test_size=0.25 is the train_test_split default, spelled out here;
#  the fixed random_state keeps the split reproducible.

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
#  Sanity-check the shapes of the four split pieces

tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((1538, 77), (513, 77), (1538,), (513,))

#  One-Hot Encoding and Scaling

In [11]:
#  One-hot encode the variables designated as categorical during EDA;
#  all remaining columns are passed through unchanged.

categorical_cols = [
    'ms_zoning', 'lot_config',
    'street', 'land_contour', 'utilities', 'neighborhood',
    'condition_1', 'condition_2', 'bldg_type', 'house_style',
    'roof_style', 'roof_matl',
    'exterior_1st', 'mas_vnr_type', 'heating', 'central_air',
    'garage_type', 'paved_drive', 'electrical', 'fence',
    'misc_feature', 'foundation', 'sale_type',
]

#  drop='first' removes one level per variable; dense output is requested so
#  the result can be scaled and wrapped in a DataFrame later.
#  NOTE(review): newer scikit-learn releases rename `sparse` to
#  `sparse_output` — confirm against the pinned version before upgrading.
one_hot = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')

ctx = ColumnTransformer(
    transformers=[('ohe', one_hot, categorical_cols)],
    remainder='passthrough',
)

In [12]:
#  Preprocessing-only pipeline: column transformation followed by scaling.
#  No estimator is attached here — models are fitted separately on the
#  scaled output.

preprocessing_steps = [
    ('ctx', ctx),
    ('ss', StandardScaler()),
]
pipe = Pipeline(preprocessing_steps)

In [13]:
# Fit the encoder and scaler on the training split only, so nothing is
# learned from the validation or test rows

pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('ctx',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore',
                                                                sparse=False),
                                                  ['ms_zoning', 'lot_config',
                                                   'street', 'land_contour',
                                                   'utilities', 'neighborhood',
                                                   'condition_1', 'condition_2',
                                                   'bldg_type', 'house_style',
                                                   'roof_style', 'roof_matl',
                                                   'exterior_1st',
                                                   'mas_vnr_type', 'heati

In [14]:
#  Apply the fitted pipeline to the train, validation and test features

X_train_sc, X_val_sc, X_test_sc = (
    pipe.transform(frame) for frame in (X_train, X_val, df_test)
)



In [15]:
#  Wrap the transformed arrays back into DataFrames, labelled with the
#  feature names reported by the fitted pipeline

feature_names = pipe.get_feature_names_out()

X_train_sc = pd.DataFrame(X_train_sc, columns=feature_names)
X_val_sc = pd.DataFrame(X_val_sc, columns=feature_names)
X_test_sc = pd.DataFrame(X_test_sc, columns=feature_names)

In [16]:
#  All three frames should have the same number of columns

tuple(frame.shape for frame in (X_train_sc, X_val_sc, X_test_sc))

((1538, 185), (513, 185), (878, 185))

# Reducing Multicollinearity

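Variance Inflation Factor (VIF) is a widely used diagnostic for multicollinearity. For feature $j$ it is $\mathrm{VIF}_j = 1 / (1 - R_j^2)$, where $R_j^2$ is obtained by regressing feature $j$ on all the other features.  

A VIF greater than 5 is generally considered problematic.  

The following loop computes the VIF of every remaining feature, drops the feature with the highest VIF, and 
repeats until no feature has a VIF greater than 5.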

In [17]:
#  Iteratively prune collinear features from the training matrix:
#  compute the VIF of every remaining column, and while the largest one
#  exceeds the threshold, drop that column and recompute.

VIF_THRESHOLD = 5  # a VIF above this value is treated as problematic

#  Placeholder so `max_x` is defined even if the loop body never runs
max_x = ('feat', 10)

#  A VIF regresses one column on the others, so at least two are required
while X_train_sc.shape[1] > 1:
    #  Hand statsmodels a plain ndarray so columns are addressed by position
    design = X_train_sc.to_numpy()
    vif_by_feature = [
        (name, variance_inflation_factor(design, idx))
        for idx, name in enumerate(X_train_sc.columns)
    ]
    max_x = max(vif_by_feature, key=lambda pair: pair[1])

    #  Check the threshold BEFORE dropping. The previous version dropped first
    #  and tested afterwards, so it always removed one extra feature whose
    #  VIF was already acceptable.
    if not max_x[1] > VIF_THRESHOLD:
        break

    print(max_x)
    #  Rebind rather than mutate in place
    X_train_sc = X_train_sc.drop(columns=max_x[0])

  vif = 1. / (1. - r_squared_i)


('remainder__bsmtfin_sf_1', inf)


  vif = 1. / (1. - r_squared_i)


('remainder__1st_flr_sf', inf)
('remainder__garage_yr_blt', 1734.5408831009668)
('ohe__ms_zoning_RL', 322.29125124555117)
('ohe__misc_feature_NONE', 261.12032469374054)
('ohe__roof_style_Gable', 83.01501770203208)
('remainder__pid_1', 72.90287183917538)
('ohe__roof_matl_CompShg', 52.95513707471433)
('ohe__mas_vnr_type_None', 39.84393197181128)
('ohe__garage_type_Attchd', 35.83100267879756)
('ohe__exterior_1st_VinylSd', 20.707938013388326)
('ohe__neighborhood_NAmes', 19.124325501854738)
('remainder__2nd_flr_sf', 18.49743524187252)
('ohe__garage_type_NONE', 18.46581434665116)
('remainder__pool_area', 15.315232500086145)
('remainder__garage_qual', 12.844195960309225)
('remainder__gr_liv_area', 12.06441821680474)
('remainder__year_built', 11.728677127981305)
('remainder__garage_cars', 8.264427166587545)
('ohe__foundation_PConc', 6.91492797061613)
('ohe__neighborhood_Somerst', 6.849660773913976)
('remainder__total_bsmt_sf', 6.260419618100167)
('remainder__pid_2', 6.106434410466769)
('ohe__c

In [18]:
#  Keep in validation and test only the columns that remain in the
#  training frame, in the same order

kept_cols = list(X_train_sc.columns)

X_val_sc = X_val_sc[kept_cols]
X_test_sc = X_test_sc[kept_cols]

In [19]:
#  Rebuild the three frames against the final training column set.
#  The inputs are already DataFrames restricted to these columns by the
#  preceding cells, so this mainly re-asserts a common column layout.

final_cols = X_train_sc.columns

X_train_sc = pd.DataFrame(X_train_sc, columns=final_cols)
X_val_sc = pd.DataFrame(X_val_sc, columns=final_cols)
X_test_sc = pd.DataFrame(X_test_sc, columns=final_cols)

In [22]:
#  Persist the processed features and the targets for the modelling step.
#  Row indices are not written, so features and targets line up by row
#  position in the saved files.
export_targets = {
    './output/X_train_sc.csv': X_train_sc,
    './output/X_val_sc.csv': X_val_sc,
    './output/X_test_sc.csv': X_test_sc,
    './output/y_train.csv': y_train,
    './output/y_val.csv': y_val,
}

for out_path, table in export_targets.items():
    table.to_csv(out_path, index=False)