# Feature Engineering and Feature Selection

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Default matplotlib figure size, given as (width, height).
plt.rcParams['figure.figsize'] = (9, 6)
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set(style = "whitegrid")
# Raise the pandas display limits to 500 columns and 500 rows so wide/long
# frames are shown in full instead of being truncated.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## Load Data

In [2]:
# Read the train and test sets from the parent directory; the "Id" column
# becomes the row index of each frame, so later cells address rows by Id.
train = pd.read_csv("../train.csv", index_col = "Id")
test = pd.read_csv("../test.csv", index_col = "Id")

## Missing Value Imputation based on EDA

### Train Set NaNs

In [3]:
def fill_nan(df, cols):
    """Replace missing values in the selected columns with the string "NA".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    cols : list of column labels
        Columns whose NaNs are replaced; all other columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    filled = df[cols].fillna("NA")
    df[cols] = filled
    return df

In [4]:
# Fill the NaNs of LotFrontage with the mean of each Neighborhood.
# transform("mean") returns one value per row on train's own index, so the
# result aligns with the column being filled. groupby(...).apply(lambda ...)
# prepends the group key to the index on recent pandas versions, which breaks
# the assignment back into the frame.
train["LotFrontage"] = train["LotFrontage"].fillna(
    train.groupby("Neighborhood")["LotFrontage"].transform("mean")
)

In [5]:
# Fill the NaNs with NA, since the samples simply do not have these features
# (pool, fireplace, garage). fill_nan writes the string "NA" into the listed
# columns of train in place and returns the same frame.
cols = ["PoolQC", "FireplaceQu", "GarageFinish", "GarageType", "GarageCond", "GarageQual"] 
train = fill_nan(train, cols)

In [6]:
# One train sample has a basement (TotalBsmtSF > 0) but a missing BsmtExposure.
# It is treated as MCAR and filled with "No" (the mode, per the original note).
bsmt_quality_cols = ["BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFinType1"]
bsmt = train[bsmt_quality_cols + ["TotalBsmtSF"]]
has_basement = bsmt["TotalBsmtSF"] > 0
lacks_quality_info = bsmt[bsmt_quality_cols].isna().any(axis=1)
# Show the affected row(s) before patching.
print(bsmt[has_basement & lacks_quality_info])
train.loc[949, "BsmtExposure"] = "No"

    BsmtCond BsmtExposure BsmtQual BsmtFinType1  TotalBsmtSF
Id                                                          
949       TA          NaN       Gd          Unf          936


In [7]:
# Fill the NaNs with NA, since they simply don't have basement.
# Runs after the single MCAR BsmtExposure value was patched, so the remaining
# NaNs in these columns are the ones being labelled "NA" here.
bsmt_train = ["BsmtQual","BsmtCond","BsmtFinType1","BsmtExposure"]
train = fill_nan(train, bsmt_train)

In [8]:
# Fill the NaNs with the mode (hard-coded values, presumably taken from the EDA).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on train[col]: that form is a chained in-place update,
# which warns on pandas 2.x and leaves train unchanged under Copy-on-Write.
train["MasVnrType"] = train["MasVnrType"].fillna("None")
train["MasVnrArea"] = train["MasVnrArea"].fillna(0.0)
train["Electrical"] = train["Electrical"].fillna("SBrkr")

### Test Set NaNs

In [9]:
# Fill the NaNs of LotFrontage in the test set with the per-Neighborhood mean
# computed on the train set, so no test-set statistics are used.
train_frontage_by_hood = train.groupby('Neighborhood')["LotFrontage"].mean()
neighbor_mean = dict(train_frontage_by_hood)
# For every test row, look up the train mean of its own Neighborhood.
test_frontage_fallback = test["Neighborhood"].map(neighbor_mean)
test["LotFrontage"] = test["LotFrontage"].fillna(test_frontage_fallback)

In [10]:
# PoolQC values that are MCAR: the house has a pool (PoolArea > 0) but its
# PoolQC is missing.
pool_mcar = (test["PoolArea"] > 0) & test["PoolQC"].isnull()
print(test.loc[pool_mcar, ["PoolArea", "PoolQC"]])
# Fill with the mode of the train set, taken only over houses that have a pool.
# The NaNs of train["PoolQC"] were already replaced by "NA" (no pool) in an
# earlier cell, so an unrestricted mode() could return "NA" and label these
# houses as having no pool at all.
pool_quality_mode = train.loc[train["PoolQC"] != "NA", "PoolQC"].mode()[0]
test.loc[pool_mcar, "PoolQC"] = pool_quality_mode

      PoolArea PoolQC
Id                   
2421       368    NaN
2504       444    NaN
2600       561    NaN


In [11]:
# GarageCond, GarageFinish and GarageQual values that are MCAR:
# show the test rows whose GarageType is known but that miss at least one of
# these three garage columns.
garage = test[["GarageCond","GarageFinish","GarageQual","GarageType"]]
garage[(garage[["GarageCond","GarageFinish","GarageQual"]].isna().any(axis=1)) & (garage["GarageType"].notnull())]

Unnamed: 0_level_0,GarageCond,GarageFinish,GarageQual,GarageType
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2127,,,,Detchd
2577,,,,Detchd


In [12]:
# Fill the missing garage values of the two affected test rows with the mode
# observed in the train set among houses with the same GarageType ("Detchd").
detchd_train = train[train["GarageType"] == "Detchd"]
for garage_col in ("GarageCond", "GarageFinish", "GarageQual"):
    detchd_mode = detchd_train[garage_col].mode()[0]
    for house_id in (2127, 2577):
        test.loc[house_id, garage_col] = detchd_mode

In [13]:
# Fill the NaNs with NA, since the samples simply do not have these features
# (pool, fireplace, garage). The MCAR values found above have already been
# patched, so only the remaining NaNs are labelled "NA" here.
cols = ["PoolQC", "FireplaceQu", "GarageCond", "GarageFinish", "GarageQual", "GarageType"]  
test = fill_nan(test, cols)

In [14]:
# BsmtCond, BsmtExposure, BsmtQual, BsmtFinType1 values that are MCAR:
# show the test rows that have a basement (TotalBsmtSF > 0) but miss at least
# one of these four columns. They are filled with the train-set mode in the
# next cell.
bsmt = test[["BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFinType1","TotalBsmtSF"]]
bsmt[(bsmt["TotalBsmtSF"]>0) & (bsmt[["BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFinType1"]].isna().any(axis=1))]

Unnamed: 0_level_0,BsmtCond,BsmtExposure,BsmtQual,BsmtFinType1,TotalBsmtSF
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1488,TA,,Gd,Unf,1595.0
2041,,Mn,Gd,GLQ,1426.0
2186,,No,TA,BLQ,1127.0
2218,Fa,No,,Unf,173.0
2219,TA,No,,Unf,356.0
2349,TA,,Gd,Unf,725.0
2525,,Av,TA,ALQ,995.0


In [15]:
# (Id, column) pairs of the MCAR basement values listed in the previous cell.
# Each one is filled with the mode of the same column in the train set.
bsmt_mcar_cells = [
    (1488, "BsmtExposure"),
    (2041, "BsmtCond"),
    (2186, "BsmtCond"),
    (2218, "BsmtQual"),
    (2219, "BsmtQual"),
    (2349, "BsmtExposure"),
    (2525, "BsmtCond"),
]
for house_id, bsmt_col in bsmt_mcar_cells:
    test.loc[house_id, bsmt_col] = train[bsmt_col].mode()[0]

In [16]:
# Fill the NaNs with NA, since the samples simply do not have these features.
# The MCAR basement values were filled in the previous cell, so the NaNs left
# in these columns are the ones being labelled "NA" here.
cols = ["BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFinType1"]
test = fill_nan(test, cols)

In [17]:
# MasVnrType that is MCAR: the row has a positive MasVnrArea but no MasVnrType.
# Print the affected row, then fill it with the mode of MasVnrType in the
# train set.
print(test[["MasVnrType","MasVnrArea"]][(test["MasVnrArea"] > 0) & test["MasVnrType"].isna()])
test.loc[2611, "MasVnrType"] = train["MasVnrType"].mode()[0]

     MasVnrType  MasVnrArea
Id                         
2611        NaN       198.0


In [18]:
# Fill the remaining NaNs.
# Results are assigned back to the column instead of calling
# fillna(inplace=True) on test[col]: that form is a chained in-place update,
# which warns on pandas 2.x and leaves test unchanged under Copy-on-Write.
test["MasVnrType"] = test["MasVnrType"].fillna("None")
test["MasVnrArea"] = test["MasVnrArea"].fillna(0.0)

# These columns are filled with the mode of the same column in the train set.
mode_filled_cols = [
    "MSZoning",
    "BsmtHalfBath",
    "BsmtFullBath",
    "Functional",
    "Exterior2nd",
    "SaleType",
    "Exterior1st",
    "KitchenQual",
    "GarageCars",
]
for mode_col in mode_filled_cols:
    test[mode_col] = test[mode_col].fillna(train[mode_col].mode()[0])

## Feature Transformation based on EDA

### Features from 10 critical factors

In [19]:
def imp_feat_eng(df, drop_outliers=None):
    """Engineer the features derived from the critical factors found in the EDA.

    The frame is modified in place (columns are added and dropped, and outlier
    rows may be removed) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Housing data with the raw columns used below (MoSold, basement / floor /
        porch areas, room counts, garage, year and bath columns, LotShape,
        LotConfig). Bath columns must already be imputed.
    drop_outliers : bool or None, default None
        Whether to drop the rows with GrLivArea > 4000. ``None`` means
        "only for labelled data": rows are dropped when ``df`` has a
        ``SalePrice`` column and kept otherwise.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the in-place changes.
    """
    if drop_outliers is None:
        # Only labelled (training) data carries SalePrice. Unlabelled data must
        # keep every row so that each sample still gets a prediction.
        drop_outliers = "SalePrice" in df.columns

    # Change Month Sold to categorical variable
    df['MoSold'] = df['MoSold'].astype("category")

    # TotalBsmtSF is a linear combination of BsmtFinSF1, BsmtFinSF2, and BsmtUnfSF
    # Also, TotalBsmtSF is highly correlated to 1stFlrSF
    df.drop(columns=["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF"], inplace=True)

    # Drop 1st, 2nd Floor, and LowQualFin SF since there is GrLivArea (the linear
    # combo of the three), and create a dummy for the presence of a 2nd floor
    df["SecondFlr"] = (df["2ndFlrSF"] > 0).astype(int)
    df.drop(columns=["1stFlrSF", "2ndFlrSF", "LowQualFinSF"], inplace=True)

    # Drop the GrLivArea outliers from the frame being processed. The rows are
    # taken from df itself, never from a notebook-level variable.
    if drop_outliers:
        df.drop(index=df.index[df["GrLivArea"] > 4000], inplace=True)

    # Combine the SF for Porch since, except OpenPorchSF, all three are zero inflated
    porch_cols = ["OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]
    df["PorchSF"] = df["OpenPorchSF"] + df["EnclosedPorch"] + df["3SsnPorch"] + df["ScreenPorch"]
    df.drop(columns=porch_cols, inplace=True)

    # BedroomAbvGr is a part of TotRmsAbvGrd, hence highly correlated to TotRmsAbvGrd
    # Therefore, separate extra rooms from bedrooms
    df["ExtraRoom"] = df["TotRmsAbvGrd"] - df["BedroomAbvGr"]
    df.drop(columns=["TotRmsAbvGrd"], inplace=True)

    # GarageYrBlt is highly correlated to YearBuilt.
    # GarageCars is highly correlated to GarageArea, yet GarageCars is more
    # highly correlated to SalePrice, so GarageArea is the one dropped.
    df.drop(columns=["GarageYrBlt", "GarageArea"], inplace=True)
    # Combine the categories into Attached and Detached Garages
    garagetype_ord = {"BuiltIn": "Attchd", "Basment": "Attchd", "CarPort": "Detchd", "2Types": "Attchd"}
    df["GarageType"] = df["GarageType"].replace(garagetype_ord)

    # Create SinceRemod: years between the last remodelling and the sale.
    # Drop YearRemodAdd, YrSold and YearBuilt, since they are correlated to
    # SinceRemod, which is more highly correlated to SalePrice.
    df["SinceRemod"] = df["YrSold"].astype(int) - df["YearRemodAdd"].astype(int)
    df.drop(columns=["YearRemodAdd", "YrSold", "YearBuilt"], inplace=True)

    # Combine basement and above-ground baths.
    # Missing values must be imputed beforehand, otherwise the sums are NaN.
    df["FullBaths"] = df["BsmtFullBath"] + df["FullBath"]
    df["HalfBaths"] = df["BsmtHalfBath"] + df["HalfBath"]
    df.drop(columns=["BsmtFullBath", "FullBath", "BsmtHalfBath", "HalfBath"], inplace=True)

    # Combine the irregular categories of LotShape together
    df["LotShape"] = df["LotShape"].replace({"IR1": "IR", "IR2": "IR", "IR3": "IR"})

    # Combine the LotConfig categories, FR2 and FR3, to FR (Frontage on >= 2 sides)
    lotconf_ord = {"FR2": "FR", "FR3": "FR"}
    df["LotConfig"] = df["LotConfig"].replace(lotconf_ord)

    return df

### Features not from 10 critical factors

In [20]:
def unimp_feat_eng(df):
    """Simplify the features that are not among the critical factors.

    Drops redundant or near-constant columns, merges rare categories, and bins
    the two overall scores into labels. The frame is modified in place and the
    same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Housing data containing the raw columns referenced below, with
        OverallQual / OverallCond still numeric.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the in-place changes.
    """
    dropped_cols = [
        # MSSubClass is a combination of HouseStyle, BldgType, YearBuilt, YearRemodAdd
        "MSSubClass",
        # Columns that are zero-inflated or have mostly one category
        "MiscVal",
        "Street",
        "Alley",
        "Utilities",
        "Condition2",
        "RoofMatl",
        "BsmtFinType2",
        "Heating",
        "Fence",
        "LandSlope",
        "MiscFeature",
    ]
    df.drop(dropped_cols, axis=1, inplace=True)

    # Land contours that are not flat/level become a single category
    df["LandContour"] = df["LandContour"].replace(
        {"Bnk": "NotLvl", "HLS": "NotLvl", "Low": "NotLvl"}
    )

    # Everything except the normal Condition1 becomes "Abnorm"
    condition = df["Condition1"]
    df["Condition1"] = condition.where(condition == "Norm", "Abnorm")

    # Roof styles other than the two frequent ones become "Other"
    roof = df["RoofStyle"]
    df["RoofStyle"] = roof.where(roof.isin(["Gable", "Hip"]), "Other")

    # Fair and Poor heating quality are merged into BA (Below Average),
    # since there are only a couple of samples
    heating = df["HeatingQC"]
    df["HeatingQC"] = heating.mask(heating.isin(["Fa", "Po"]), "BA")

    # Functional categories that are not typical are merged
    functional = df["Functional"]
    df["Functional"] = functional.where(functional == "Typ", "Nottyp")

    # Non-conventional SaleType / SaleCondition categories are merged
    for sale_col, conventional in (("SaleType", "WD"), ("SaleCondition", "Normal")):
        sale_values = df[sale_col]
        df[sale_col] = sale_values.where(sale_values == conventional, "Unconv")

    def bin_score(score):
        # BA: Below Average, Avg: Average, AA: Above Average, GO: Good, EX: Excellent
        if score < 5:
            return "BA"
        if score == 5:
            return "Avg"
        if score == 6:
            return "AA"
        if score > 8:
            return "EX"
        return "GO"

    # Categorize and bin the Quality and Condition features
    for score_col in ("OverallQual", "OverallCond"):
        df[score_col] = df[score_col].apply(bin_score)

    return df

In [21]:
# Report the number of columns of the train set before and after applying
# both feature-engineering functions.
print("Number of features in train set before feature engineering: " + str(train.shape[1]))
print("-"*60)
# Both functions modify the frame they receive and return it, so new_train
# and train refer to the same, fully engineered frame after these two calls.
train = imp_feat_eng(train)
new_train = unimp_feat_eng(train)
print("Number of features in train set after feature engineering: " + str(new_train.shape[1]))

Number of features in train set before feature engineering: 80
------------------------------------------------------------
Number of features in train set after feature engineering: 53


In [22]:
# Report the number of columns of the test set before and after applying
# both feature-engineering functions.
print("Number of features in test set before feature engineering: " + str(test.shape[1]))
print("-"*60)
# Both functions modify the frame they receive and return it, so new_test
# and test refer to the same, fully engineered frame after these two calls.
test = imp_feat_eng(test)
new_test = unimp_feat_eng(test)
print("Number of features in test set after feature engineering: " + str(new_test.shape[1]))

Number of features in test set before feature engineering: 79
------------------------------------------------------------
Number of features in test set after feature engineering: 52


### One-Hot Encoding

In [23]:
# Tag each row with its origin so the two sets can be separated again after
# being encoded together (encoding the stacked frame gives both sets the same
# dummy columns).
new_train["train"] = 1
new_test["train"] = 0
combined = pd.concat([new_train, new_test], axis=0)
df = pd.get_dummies(combined, drop_first=True)

# Split back on the origin flag and remove the helper column. The test rows
# have no target, so SalePrice is removed from the test frame as well.
from_train = df["train"] == 1
from_test = df["train"] == 0
train_final = df[from_train].drop("train", axis=1)
test_final = df[from_test].drop(["train", "SalePrice"], axis=1)

In [24]:
# Report the column counts of both encoded frames. The train frame still
# contains SalePrice, which was dropped from the test frame.
print("Number of features in train set after one-hot encoding for train: " + str(train_final.shape[1]))
print("-"*70)
print("Number of features in test set after one-hot encoding for test: " + str(test_final.shape[1]))

Number of features in train set after one-hot encoding for train: 184
----------------------------------------------------------------------
Number of features in test set after one-hot encoding for test: 183


### Final Check for Missing Values

In [25]:
# Count the remaining NaNs in every column of the encoded train set.
train_final.isna().sum()

LotFrontage             0
LotArea                 0
MasVnrArea              0
GrLivArea               0
BedroomAbvGr            0
KitchenAbvGr            0
Fireplaces              0
GarageCars              0
WoodDeckSF              0
PoolArea                0
SalePrice               0
SecondFlr               0
PorchSF                 0
ExtraRoom               0
SinceRemod              0
FullBaths               0
HalfBaths               0
MSZoning_FV             0
MSZoning_RH             0
MSZoning_RL             0
MSZoning_RM             0
LotShape_Reg            0
LandContour_NotLvl      0
LotConfig_CulDSac       0
LotConfig_FR            0
LotConfig_Inside        0
Neighborhood_Blueste    0
Neighborhood_BrDale     0
Neighborhood_BrkSide    0
Neighborhood_ClearCr    0
Neighborhood_CollgCr    0
Neighborhood_Crawfor    0
Neighborhood_Edwards    0
Neighborhood_Gilbert    0
Neighborhood_IDOTRR     0
Neighborhood_MeadowV    0
Neighborhood_Mitchel    0
Neighborhood_NAmes      0
Neighborhood

In [26]:
# Count the remaining NaNs in every column of the encoded test set.
test_final.isna().sum()

LotFrontage             0
LotArea                 0
MasVnrArea              0
GrLivArea               0
BedroomAbvGr            0
KitchenAbvGr            0
Fireplaces              0
GarageCars              0
WoodDeckSF              0
PoolArea                0
SecondFlr               0
PorchSF                 0
ExtraRoom               0
SinceRemod              0
FullBaths               0
HalfBaths               0
MSZoning_FV             0
MSZoning_RH             0
MSZoning_RL             0
MSZoning_RM             0
LotShape_Reg            0
LandContour_NotLvl      0
LotConfig_CulDSac       0
LotConfig_FR            0
LotConfig_Inside        0
Neighborhood_Blueste    0
Neighborhood_BrDale     0
Neighborhood_BrkSide    0
Neighborhood_ClearCr    0
Neighborhood_CollgCr    0
Neighborhood_Crawfor    0
Neighborhood_Edwards    0
Neighborhood_Gilbert    0
Neighborhood_IDOTRR     0
Neighborhood_MeadowV    0
Neighborhood_Mitchel    0
Neighborhood_NAmes      0
Neighborhood_NPkVill    0
Neighborhood