# Data Preprocessing and Feature Engineering

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Plotting defaults: 9x6 figures, then seaborn's "whitegrid" style.
plt.rcParams.update({'figure.figsize': (9, 6)})
sns.set(style = "whitegrid")
# Let pandas display up to 500 columns and 500 rows before truncating.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

## Load Data

In [62]:
# Both CSVs are read with their "Id" column as the row index.
read_options = {"index_col": "Id"}
train = pd.read_csv("../train.csv", **read_options)
test = pd.read_csv("../test.csv", **read_options)

## Feature Transformation based on EDA

In [9]:
# TotalBsmtSF is a linear combination of BsmtFinSF1, BsmtFinSF2 and BsmtUnfSF,
# and (per the EDA) is also highly correlated with 1stFlrSF, so the whole
# basement square-footage group is removed in one call.
basement_sf_cols = ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF"]
train.drop(columns = basement_sf_cols, inplace = True)

In [56]:
# GrLivArea already combines 1stFlrSF, 2ndFlrSF and LowQualFinSF, so those three
# are dropped. Before dropping, keep a 0/1 SecondFlr flag that records whether
# the house has any second-floor area.
train["SecondFlr"] = train["2ndFlrSF"].gt(0).astype("int64")
train.drop(columns = ["1stFlrSF", "2ndFlrSF", "LowQualFinSF"], inplace = True)

In [23]:
# All porch columns except OpenPorchSF are zero-inflated, so the four areas are
# merged into a single PorchSF total and the individual columns are removed.
porch_cols = ["OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]
# skipna=False keeps the behaviour of chained `+`: a NaN in any part gives NaN.
train["PorchSF"] = train[porch_cols].sum(axis = 1, skipna = False)
train.drop(columns = porch_cols, inplace = True)

In [32]:
# Bedrooms are counted inside TotRmsAbvGrd, which makes the two columns highly
# correlated. ExtraRoom keeps only the non-bedroom rooms; both source columns
# are then dropped.
room_cols = ["TotRmsAbvGrd", "BedroomAbvGr"]
train["ExtraRoom"] = train["TotRmsAbvGrd"].sub(train["BedroomAbvGr"])
train.drop(columns = room_cols, inplace = True)

In [None]:
# Garage columns made redundant by a correlated feature (per the EDA):
#   GarageYrBlt - highly correlated with YearBuilt
#   GarageArea  - highly correlated with GarageCars, and GarageCars is the one
#                 more highly correlated with SalePrice
train.drop(columns = ["GarageYrBlt", "GarageArea"], inplace = True)

In [48]:
# SinceRemod: number of years between the last remodelling and the sale.
# YrSold and YearBuilt are dropped because they are correlated with SinceRemod,
# and SinceRemod is the one more highly correlated with SalePrice.
# YearRemodAdd is dropped because it is consumed by SinceRemod.
train["SinceRemod"] = train["YrSold"] - train["YearRemodAdd"]
# Fix: the YearBuilt drop used the misspelled keyword `aixs`, which raises
# TypeError. Using `columns=` removes the need for an axis argument entirely.
train.drop(columns = ["YearRemodAdd", "YrSold", "YearBuilt"], inplace = True)

In [60]:
# Add the basement bath counts to the main bath counts (full and half separately).
# Missing values must be imputed before this step: a NaN in either addend
# propagates into the total.
bath_totals = {
    "FullBaths": ("BsmtFullBath", "FullBath"),
    "HalfBaths": ("BsmtHalfBath", "HalfBath"),
}
for total_col, (bsmt_col, main_col) in bath_totals.items():
    train[total_col] = train[bsmt_col] + train[main_col]

In [70]:
# Merge the three irregular LotShape levels (IR1, IR2, IR3) into a single "IR"
# level; every other value is left as it is.
irregular_shapes = ["IR1", "IR2", "IR3"]
train["LotShape"] = train["LotShape"].mask(train["LotShape"].isin(irregular_shapes), "IR")

In [76]:
# Reduce GarageType towards Attached vs Detached:
# BuiltIn, Basment and 2Types become "Attchd"; CarPort becomes "Detchd".
# Values not listed in the mapping are left unchanged.
garagetype_ord = dict.fromkeys(["BuiltIn", "Basment", "2Types"], "Attchd")
garagetype_ord["CarPort"] = "Detchd"
train["GarageType"] = train["GarageType"].replace(garagetype_ord)

In [None]:
# TODO(review): columns noted here but not yet handled -
# MasVnrArea, LotFrontage, LotArea, LotConfig.
# (This was a bare list of words, which is a SyntaxError in a code cell and
# stops "Run All"; kept as a comment until the intended step is written.)

In [None]:
# MSSubClass is a combination of HouseStyle, BldgType, YearBuilt, YearRemodAdd,
# so it carries no information of its own and is dropped.
# Fix: the column label is "MSSubClass" (capital S); the previous "MsSubClass"
# does not exist in the frame and made drop() raise KeyError.
train.drop(columns = ["MSSubClass"], inplace = True)

In [86]:
# Columns that are zero-inflated or dominated by a single category are removed
# together in one call.
low_information_cols = [
    "MiscVal",
    "Street",
    "Alley",
    "Utilities",
    "Condition2",
    "RoofMatl",
    "BsmtFinType2",
    "Heating",
    "Fence",
    "LandSlope",
    "MiscFeature",
]
train.drop(columns = low_information_cols, inplace = True)


In [93]:
# Land contours that are not flat/level (Bnk, HLS, Low) are merged into one
# "NotLvl" level; other values are left unchanged.
landcont_ord = dict.fromkeys(["Bnk", "HLS", "Low"], "NotLvl")
train["LandContour"] = train["LandContour"].replace(landcont_ord)

In [98]:
# Binarise Condition1: keep "Norm", relabel every other value as "Abnorm".
train["Condition1"] = train["Condition1"].where(train["Condition1"] == "Norm", "Abnorm")

In [106]:
# Binarise Functional: keep "Typ", relabel every other value as "Nottyp".
train["Functional"] = train["Functional"].where(train["Functional"] == "Typ", "Nottyp")

In [104]:
# Binarise the two sale columns: keep the conventional level ("WD" for SaleType,
# "Normal" for SaleCondition) and relabel every other value as "Unconv".
for sale_col, conventional_level in (("SaleType", "WD"), ("SaleCondition", "Normal")):
    train[sale_col] = train[sale_col].where(train[sale_col] == conventional_level, "Unconv")

### Dealing with missing values

In [6]:
# Count NaNs per column, largest first, and display only the columns that
# actually have missing values.
# NOTE(review): `features` is not defined in any cell visible here - confirm
# where it is created so the notebook survives Restart & Run All.
nan_counts = features.isna().sum().sort_values(ascending = False)
missing = nan_counts.to_frame(name = "Count_NaN")
missing.loc[missing["Count_NaN"] > 0]

Unnamed: 0,Count_NaN
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
FireplaceQu,690
LotFrontage,259
GarageCond,81
GarageType,81
GarageYrBlt,81
GarageFinish,81


In [7]:
# PoolQC, MiscFeature, Alley and Fence are missing not at random: a NaN simply
# means the house does not have that feature. So many houses lack them that
# these variables are unlikely to have much predictive power, so they are dropped.
mostly_absent_cols = ["PoolQC", "MiscFeature", "Alley", "Fence"]
features.drop(columns = mostly_absent_cols, inplace = True)

#### Imputing Missing Values

In [8]:
# Constant fills, applied in one frame-level fillna.
# Fix: the per-column form `features[col].fillna(value, inplace=True)` is chained
# in-place assignment on a column. Recent pandas warns about it, and under
# Copy-on-Write it no longer updates `features`, so the fills are done on the
# frame itself.
constant_fills = {
    # non-existent fireplace
    "FireplaceQu": "No",
    # non-existent garage
    "GarageYrBlt": 0,
    "GarageCond": "No",
    "GarageType": "No",
    "GarageFinish": "No",
    "GarageQual": "No",
    # non-existent basement
    "BsmtExposure": "No",
    "BsmtFinType2": "No",
    "BsmtFinType1": "No",
    "BsmtCond": "No",
    "BsmtQual": "No",
    # non-existent masonry veneer
    "MasVnrArea": 0,
    "MasVnrType": "None",
    # Vast majority of the Electrical is SBrkr
    "Electrical": "SBrkr",
}
features.fillna(constant_fills, inplace = True)

# LotFrontage may be similar for houses in the same neighborhood, so each
# missing value is filled with the mean LotFrontage of its Neighborhood.
features["LotFrontage"] = features.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.mean()))

In [13]:
# Per-column flag: True if the column still contains at least one NaN.
features.isna().any(axis = 0)

MSSubClass       False
MSZoning         False
LotFrontage      False
LotArea          False
Street           False
LotShape         False
LandContour      False
Utilities        False
LotConfig        False
LandSlope        False
Neighborhood     False
Condition1       False
Condition2       False
BldgType         False
HouseStyle       False
OverallQual      False
OverallCond      False
YearBuilt        False
YearRemodAdd     False
RoofStyle        False
RoofMatl         False
Exterior1st      False
Exterior2nd      False
MasVnrType       False
MasVnrArea       False
ExterQual        False
ExterCond        False
Foundation       False
BsmtQual         False
BsmtCond         False
BsmtExposure     False
BsmtFinType1     False
BsmtFinSF1       False
BsmtFinType2     False
BsmtFinSF2       False
BsmtUnfSF        False
TotalBsmtSF      False
Heating          False
HeatingQC        False
CentralAir       False
Electrical       False
1stFlrSF         False
2ndFlrSF         False
LowQualFinS

# Data Description