# Data Preprocessing and Feature Engineering

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Plot defaults: 9x6 inch figures, then seaborn's "whitegrid" style.
plt.rcParams.update({'figure.figsize': (9, 6)})
sns.set(style="whitegrid")

# Let pandas display up to 500 columns and 500 rows before truncating.
pd.set_option('display.max_columns', 500, 'display.max_rows', 500)

## Load Data

In [2]:
# Load the training set, using the "Id" column as the row index.
train = pd.read_csv("train.csv", index_col="Id")
print(train.shape)
print(train.columns)

# Target, kept as a single-column DataFrame.
salesprice = train[["SalePrice"]]

# Predictors: every column except the target. The target is removed by name
# rather than by position, and drop() returns a new DataFrame, so the later
# edits to `features` never operate on a slice of `train`.
features = train.drop(columns=["SalePrice"])

(1460, 80)
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'Paved

### Dealing with missing values

In [6]:
# Count NaNs per column, largest first, as a one-column table.
nan_counts = features.isna().sum().sort_values(ascending=False)
missing = nan_counts.to_frame(name="Count_NaN")

# Display only the columns that actually contain missing values.
missing[missing["Count_NaN"] > 0]

Unnamed: 0,Count_NaN
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
FireplaceQu,690
LotFrontage,259
GarageCond,81
GarageType,81
GarageYrBlt,81
GarageFinish,81


In [7]:
# PoolQC, MiscFeature, Alley and Fence are not missing at random: a NaN here
# means the house simply does not have that feature.
# They are dropped because too many houses lack them (1,179 to 1,453 of the
# 1,460 rows), so these variables won't have much predictive power.
#
# The result is rebound instead of dropped in place, and errors="ignore" is
# passed, so re-running this cell does not raise a KeyError once the columns
# are already gone.
features = features.drop(columns=["PoolQC", "MiscFeature", "Alley", "Fence"], errors="ignore")

#### Imputing Missing Values

In [8]:
# Per-column fill values. Everything is applied in a single DataFrame.fillna
# call that rebinds `features`; the previous `features[col].fillna(..., inplace=True)`
# form mutates a temporary column selection, which is deprecated in recent
# pandas and has no effect on the frame under Copy-on-Write.
fill_values = {
    # non-existent fireplace
    "FireplaceQu": "No",
    # non-existent garage
    "GarageYrBlt": 0,
    "GarageCond": "No",
    "GarageType": "No",
    "GarageFinish": "No",
    "GarageQual": "No",
    # non-existent basement
    "BsmtExposure": "No",
    "BsmtFinType2": "No",
    "BsmtFinType1": "No",
    "BsmtCond": "No",
    "BsmtQual": "No",
    # non-existent masonry veneer
    "MasVnrArea": 0,
    "MasVnrType": "None",
    # Vast majority of the Electrical is SBrkr
    "Electrical": "SBrkr",
}
features = features.fillna(fill_values)

# LotFrontage may be similar for houses in the same neighborhood, so fill each
# missing value with the mean LotFrontage of its Neighborhood.
features["LotFrontage"] = (
    features.groupby("Neighborhood")["LotFrontage"]
    .transform(lambda x: x.fillna(x.mean()))
)

In [13]:
# Per-column flag: True if the column still contains at least one NaN.
features.isna().any(axis=0)

MSSubClass       False
MSZoning         False
LotFrontage      False
LotArea          False
Street           False
LotShape         False
LandContour      False
Utilities        False
LotConfig        False
LandSlope        False
Neighborhood     False
Condition1       False
Condition2       False
BldgType         False
HouseStyle       False
OverallQual      False
OverallCond      False
YearBuilt        False
YearRemodAdd     False
RoofStyle        False
RoofMatl         False
Exterior1st      False
Exterior2nd      False
MasVnrType       False
MasVnrArea       False
ExterQual        False
ExterCond        False
Foundation       False
BsmtQual         False
BsmtCond         False
BsmtExposure     False
BsmtFinType1     False
BsmtFinSF1       False
BsmtFinType2     False
BsmtFinSF2       False
BsmtUnfSF        False
TotalBsmtSF      False
Heating          False
HeatingQC        False
CentralAir       False
Electrical       False
1stFlrSF         False
2ndFlrSF         False
LowQualFinS

# Data Description