# 2. Preprocessing and Feature Engineering

## Importing Libraries
---

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

## Load the Data
---

In [2]:
# Load the cleaned training set from a path relative to the working directory
# and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column holding 0, 1, 2, ...,
# which suggests the CSV was saved together with its index. Confirm whether it
# should be read with index_col=0 (or dropped) so it is not used as a feature.
df_train = pd.read_csv('../data/train_clean.csv')
df_train.head()

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,...,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,60,RL,68.878999,13517,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,...,44,0,0,0,0,0,3,2010,WD,130500
1,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,...,74,0,0,0,0,0,4,2009,WD,220000
2,20,RL,68.0,7922,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,52,0,0,0,0,0,1,2010,WD,109000
3,60,RL,73.0,9802,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,4,2010,WD,174000
4,50,RL,82.0,14235,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,59,0,0,0,0,0,3,2010,WD,138500


In [3]:
# Load the cleaned test set from a path relative to the working directory and
# preview the first rows.
# NOTE(review): as with the training set, the preview shows an 'Unnamed: 0'
# column; confirm whether the saved index should be read with index_col=0.
df_test = pd.read_csv('../data/test_clean.csv')
df_test.head()

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
0,190,RM,69.0,9142,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,60,112,0,0,0,0,4,2006,WD
1,90,RL,69.545961,9662,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,170,0,0,0,0,0,0,8,2006,WD
2,60,RL,58.0,17104,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,100,24,0,0,0,0,0,9,2006,New
3,30,RM,60.0,8520,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,184,0,0,0,0,7,2007,WD
4,20,RL,69.545961,9500,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,76,0,0,185,0,0,7,2009,WD


## Feature Engineering
---

### Binarize `Central Air` variable
>`Central Air` : Central air conditioning

Since the values are only Y/N, we can encode them as a binary indicator (Y → 1, N → 0).

In [4]:
# Count the raw category labels of `central_air` in the training set before encoding.
df_train['central_air'].value_counts()

Y    1908
N     141
Name: central_air, dtype: int64

In [5]:
# Encode 'Y' -> 1 and 'N' -> 0, then re-count to confirm the totals are unchanged.
# Series.map returns NaN for any value that is not a key of the dict, so running
# this cell a second time on the already-encoded column would turn it into NaN.
df_train['central_air'] = df_train['central_air'].map({'Y': 1, 'N': 0})
df_train['central_air'].value_counts()

1    1908
0     141
Name: central_air, dtype: int64

In [6]:
# Count the raw category labels of `central_air` in the test set before encoding.
df_test['central_air'].value_counts()

Y    823
N     55
Name: central_air, dtype: int64

In [7]:
# Apply the same 'Y' -> 1 / 'N' -> 0 encoding to the test set and re-count.
# Series.map returns NaN for any value that is not a key of the dict, so running
# this cell a second time on the already-encoded column would turn it into NaN.
df_test['central_air'] = df_test['central_air'].map({'Y': 1, 'N': 0})
df_test['central_air'].value_counts()

1    823
0     55
Name: central_air, dtype: int64

### Mapping Ordinal Variables

In [8]:
def convert_ordinal_variables(df):
    """Encode the ordinal string columns of ``df`` as integer ranks.

    Every column listed in the mapping table below is replaced by the result of
    ``Series.map`` over a fixed ``label -> rank`` dictionary. Any value that is
    not a key of its dictionary (including a real NaN) becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in the mapping table.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the ordinal columns encoded. The frame passed in
        is not modified, so the caller's original data stays available.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    # Rank dictionaries that are shared by several columns.
    quality = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
    # NOTE(review): the 'NA' key only matches the literal string 'NA'.
    # pandas.read_csv parses 'NA' as NaN by default, and the one-hot output in
    # this notebook shows a 'garage_type_None' column, so absent basement/garage
    # values may be stored under a different label. Confirm against the cleaned
    # data; any such rows would come out of this function as NaN.
    quality_or_absent = {**quality, 'NA': 0}
    basement_finish = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3,
                       'LwQ': 2, 'Unf': 1, 'NA': 0}

    ordinal_maps = {
        'lot_shape': {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1},
        'utilities': {'AllPub': 4, 'NoSewr': 3, 'NoSeWa': 2, 'ELO': 1},
        'land_slope': {'Gtl': 3, 'Mod': 2, 'Sev': 1},
        'exter_qual': quality,
        'exter_cond': quality,
        'bsmt_cond': quality_or_absent,
        'bsmt_exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
        'bsmtfin_type_1': basement_finish,
        'bsmtfin_type_2': basement_finish,
        'heating_qc': quality,
        'electrical': {'SBrkr': 5, 'FuseA': 4, 'FuseF': 3, 'FuseP': 2, 'Mix': 1},
        'kitchen_qual': quality,
        'functional': {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5,
                       'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1},
        'garage_finish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0},
        'garage_qual': quality_or_absent,
        'garage_cond': quality_or_absent,
        'paved_drive': {'Y': 3, 'P': 2, 'N': 1},
    }

    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded = df.copy()
    for column, ranks in ordinal_maps.items():
        encoded[column] = encoded[column].map(ranks)
    return encoded

In [9]:
# Replace the ordinal string columns of the training set with their integer
# ranks and preview the result.
df_train = convert_ordinal_variables(df_train)
df_train.head()

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,...,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,60,RL,68.878999,13517,Pave,3,Lvl,4,CulDSac,3,...,44,0,0,0,0,0,3,2010,WD,130500
1,60,RL,43.0,11492,Pave,3,Lvl,4,CulDSac,3,...,74,0,0,0,0,0,4,2009,WD,220000
2,20,RL,68.0,7922,Pave,4,Lvl,4,Inside,3,...,52,0,0,0,0,0,1,2010,WD,109000
3,60,RL,73.0,9802,Pave,4,Lvl,4,Inside,3,...,0,0,0,0,0,0,4,2010,WD,174000
4,50,RL,82.0,14235,Pave,3,Lvl,4,Inside,3,...,59,0,0,0,0,0,3,2010,WD,138500


In [10]:
# Apply the same ordinal encoding to the test set and preview the result.
df_test = convert_ordinal_variables(df_test)
df_test.head()

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
0,190,RM,69.0,9142,Pave,4,Lvl,4,Inside,3,...,0,60,112,0,0,0,0,4,2006,WD
1,90,RL,69.545961,9662,Pave,3,Lvl,4,Inside,3,...,170,0,0,0,0,0,0,8,2006,WD
2,60,RL,58.0,17104,Pave,3,Lvl,4,Inside,3,...,100,24,0,0,0,0,0,9,2006,New
3,30,RM,60.0,8520,Pave,4,Lvl,4,Inside,3,...,0,0,184,0,0,0,0,7,2007,WD
4,20,RL,69.545961,9500,Pave,3,Lvl,4,Inside,3,...,0,76,0,0,185,0,0,7,2009,WD


### One-Hot Encoding (OHE) of Nominal Variables

In [11]:
def convert_nominal_variables(df):
    """One-hot encode the nominal columns of ``df`` and drop the originals.

    For each nominal column, ``pd.get_dummies`` builds indicator columns named
    ``<column>_<level>``. The first level is dropped for every column except
    ``neighborhood``, which keeps all of its levels. The indicators are appended
    to the right of the existing columns and the source columns are removed.

    The ``ms_zoning`` indicators were previously computed but never added to
    the result, although ``ms_zoning`` itself was dropped; they are now
    included so the zoning information is not lost.

    Because the levels are derived from the data in ``df``, two frames with
    different category levels produce different column sets. After encoding a
    train/test pair, re-check that their columns agree.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``nominal_columns`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns added and the nominal source
        columns removed. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the expected nominal columns is missing from ``df``.
    """
    nominal_columns = [
        'ms_zoning', 'ms_subclass', 'street', 'land_contour', 'lot_config',
        'neighborhood', 'condition_1', 'condition_2', 'bldg_type',
        'house_style', 'garage_type', 'sale_type',
    ]
    # Columns for which every level gets its own indicator (no reference level
    # is dropped).
    keep_all_levels = {'neighborhood'}

    indicator_frames = [
        pd.get_dummies(df[column], prefix=column,
                       drop_first=column not in keep_all_levels)
        for column in nominal_columns
    ]

    encoded = pd.concat([df, *indicator_frames], axis=1, join='outer')
    return encoded.drop(columns=nominal_columns)

In [12]:
# One-hot encode the nominal columns of the training set and preview the result.
df_train = convert_nominal_variables(df_train)
df_train.head()

Unnamed: 0,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,roof_style,...,garage_type_Detchd,garage_type_None,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,68.878999,13517,3,4,3,6,8,1976,2005,Gable,...,0,0,0,0,0,0,0,0,0,1
1,43.0,11492,3,4,3,7,5,1996,1997,Gable,...,0,0,0,0,0,0,0,0,0,1
2,68.0,7922,4,4,3,5,7,1953,2007,Gable,...,1,0,0,0,0,0,0,0,0,1
3,73.0,9802,4,4,3,5,5,2006,2007,Gable,...,0,0,0,0,0,0,0,0,0,1
4,82.0,14235,3,4,3,6,8,1900,1993,Gable,...,1,0,0,0,0,0,0,0,0,1


In [13]:
# One-hot encode the nominal columns of the test set and preview the result.
# The indicator columns depend on the levels present in each frame, so the test
# set can end up with a different set of dummy columns than the training set.
df_test = convert_nominal_variables(df_test)
df_test.head()

Unnamed: 0,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,roof_style,...,garage_type_None,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,69.0,9142,4,4,3,6,8,1910,1950,Gable,...,0,0,0,0,0,0,0,0,0,1
1,69.545961,9662,3,4,3,5,4,1977,1977,Gable,...,0,0,0,0,0,0,0,0,0,1
2,58.0,17104,3,4,3,7,5,2006,2006,Gable,...,0,0,0,0,0,0,1,0,0,0
3,60.0,8520,4,4,3,5,6,1923,2006,Gable,...,0,0,0,0,0,0,0,0,0,1
4,69.545961,9500,3,4,3,6,5,1963,1963,Gable,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Columns that are present in the training set but missing from the test set.
# NOTE(review): this is a one-directional check (train minus test), so it does
# not reveal test-only columns. The test preview lists `sale_type_VWD`, which
# the training preview does not show; consider also checking test minus train.
# NOTE(review): the execution counts show this cell (In [16]) was run after the
# drop cell that follows it (In [15]). Run top to bottom on a fresh kernel, it
# would execute before that drop and may not reproduce the saved output.
set(df_train.columns) - set(df_test.columns)

{'saleprice'}

In [15]:
# Drop the indicator columns listed here from the training set so that the two
# datasets contain the same features.
# NOTE(review): DataFrame.drop raises KeyError when a label is missing, so this
# cell fails if it is run a second time on the same kernel. The list is
# hard-coded, so it must be revisited whenever the encoding step changes.
df_train.drop(columns=['condition_2_Feedr', 'condition_2_PosN', 
                       'condition_2_RRAe', 'condition_2_RRAn',
                       'condition_2_RRNn', 'ms_subclass_150',
                       'neighborhood_GrnHill', 'neighborhood_Landmrk'], axis=1, inplace=True)