# Feature Engineering and Feature Selection

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)
sns.set(style = "whitegrid")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## Load Data

In [2]:
train = pd.read_csv("../train.csv", index_col = "Id")
test = pd.read_csv("../test.csv", index_col = "Id")

## Feature Transformation based on EDA

### Features from 10 critical factors

In [3]:
def imp_feat_eng(df):
    # TotalBsmtSF is a linear combination of BsmtFinSF1, BsmtFinSF2, and BsmtUnfSF
    # Also, TotalBsmtSF is highly correlated to 1stFlrSF
    df.drop("BsmtFinSF1", axis = 1, inplace = True)
    df.drop("BsmtFinSF2", axis = 1, inplace = True)
    df.drop("BsmtUnfSF", axis = 1, inplace = True)
    df.drop("TotalBsmtSF", axis = 1, inplace = True)

    # Drop 1st, 2nd Floor, and LowQualFin SF since there is GrLivArea(The combo of three) 
    #and create a dummy for 2nd Floor
    df["SecondFlr"] = df["2ndFlrSF"].apply(lambda x: 1 if x > 0 else 0)
    df.drop("1stFlrSF", axis = 1, inplace = True)
    df.drop("2ndFlrSF", axis = 1, inplace = True)
    df.drop("LowQualFinSF", axis = 1, inplace = True)

    # Combine the SF for Porch since, except OpenPorchSF, all three are zero inflated
    df["PorchSF"] = df["OpenPorchSF"]+df["EnclosedPorch"]+df["3SsnPorch"]+df["ScreenPorch"]
    df.drop("OpenPorchSF", axis = 1, inplace = True)
    df.drop("EnclosedPorch", axis = 1, inplace = True)
    df.drop("3SsnPorch", axis = 1, inplace = True)
    df.drop("ScreenPorch", axis = 1, inplace = True)

    # BedroomAbvGr is a part of TotRmsAbvGrd, hence highly correlated to TotRmsAbvGrd
    # Therefore, separate extra rooms from bedrooms 
    df["ExtraRoom"] = df["TotRmsAbvGrd"] - df["BedroomAbvGr"]
    df.drop("TotRmsAbvGrd", axis = 1, inplace = True)

    # GarageYrBlt is highly correlated to YearBuilt
    df.drop("GarageYrBlt", axis = 1, inplace = True)
    # GarageCars is highly correlated to GarageArea
    # Yet GarageCars is more highly correlated to SalePrice
    df.drop("GarageArea", axis = 1, inplace = True)

    # Create SinceRemod that takes into account how many years past since remodelling and before sold
    # Drop YearSold and YearBuilt, since they are correlated to SinceRemod
    # But SinceRemod is more highly correlated to SalePrice
    df["SinceRemod"] = df["YrSold"] - df["YearRemodAdd"]
    df.drop("YearRemodAdd", axis = 1, inplace = True)
    df.drop("YrSold", axis = 1, inplace = True)
    df.drop("YearBuilt", axis = 1, inplace = True)

    # Combine fullbaths and halfbaths
    # needs imputation of missing NaN before tho
    df["FullBaths"] = df["BsmtFullBath"] + df["FullBath"]
    df["HalfBaths"] = df["BsmtHalfBath"] + df["HalfBath"]
    df.drop("BsmtFullBath", axis = 1, inplace = True)
    df.drop("FullBath", axis = 1, inplace = True)
    df.drop("BsmtHalfBath", axis = 1, inplace = True)
    df.drop("HalfBath", axis = 1, inplace = True)

    # Combine the irregular categories of LotShape together
    df["LotShape"] = df["LotShape"].apply(lambda x: "IR" 
                            if ((x == "IR1") | (x == "IR2") | (x == "IR3")) 
                            else x)

    # Combine the LotConfig categories, FR2 and FR3, to FR (Frontage on 2>= sides)
    lotconf_ord = {"FR2":"FR", "FR3":"FR"}
    df["LotConfig"] = df["LotConfig"].replace(lotconf_ord)

    # Combine the categories into Attached and Detached Garages
    garagetype_ord = {"BuiltIn":"Attchd", "Basment": "Attchd", "CarPort": "Detchd", "2Types": "Attchd"}
    df["GarageType"] = df["GarageType"].replace(garagetype_ord)
    
    return df

### Features not from 10 critical factors

In [4]:
def unimp_feat_eng(df):
    # MSSubClass is a combination of HouseStyle, BdlgType, YearBuilt, YearRemodAdd
    df.drop("MSSubClass", axis= 1, inplace = True)

    # drop columns that are zero-inflated or has mostly one category
    df.drop("MiscVal", axis = 1, inplace = True)
    df.drop("Street", axis = 1, inplace = True)
    df.drop("Alley", axis = 1, inplace = True)
    df.drop("Utilities", axis = 1, inplace = True)
    df.drop("Condition2", axis = 1, inplace = True)
    df.drop("RoofMatl", axis = 1, inplace = True)
    df.drop("BsmtFinType2", axis = 1, inplace = True)
    df.drop("Heating", axis = 1, inplace = True)
    df.drop("Fence", axis = 1, inplace = True)
    df.drop("LandSlope", axis = 1, inplace = True)
    df.drop("MiscFeature", axis = 1, inplace = True)

    # Combine the land contours that are not flat/level
    landcont_ord = {"Bnk":"NotLvl","HLS":"NotLvl","Low":"NotLvl"}
    df["LandContour"] = df["LandContour"].replace(landcont_ord)

    # Combine Condition1 categories that are not normal
    df["Condition1"] = df["Condition1"].apply(lambda x: "Abnorm" if x != "Norm" else x)

    # Combine the Roofstyle categories with few samples
    df["RoofStyle"] = df["RoofStyle"].apply(lambda x: "Other" if x not in ["Gable", "Hip"] else x)

    # Combine Poor with Fair category and call it BA (Below Average) 
    # since there are only a couple of samples 
    df["HeatingQC"] = df["HeatingQC"].apply(lambda x: "BA" if ((x == "Fa") | (x == "Po")) else x)

    # Combine Functional categories that are not typical
    df["Functional"] = df["Functional"].apply(lambda x: "Nottyp" if x != "Typ" else x)

    # Combine SaleType and SaleCondition category that are not conventional
    df["SaleType"] = df["SaleType"].apply(lambda x: "Unconv" if x != "WD" else x)
    df["SaleCondition"] = df["SaleCondition"].apply(lambda x: "Unconv" if x != "Normal" else x)

    # Categorize and bin the Quality and Condition features
    # BA: Below Average, Avg: Average, AA: Above Average, GO: Good, EX: Excellent
    df["OverallQual"] = \
        df["OverallQual"].apply(lambda x: 
            "BA" if x < 5 else "Avg" if x == 5 else "AA" if x == 6 else "EX" if x > 8 else "GO")
    df["OverallCond"] = \
        df["OverallCond"].apply(lambda x: 
            "BA" if x < 5 else "Avg" if x == 5 else "AA" if x == 6 else "EX" if x > 8 else "GO")
    
    return df

In [5]:
print("Number of features in train set before feature engineering: " + str(train.shape[1]))
print("-"*50)
train = imp_feat_eng(train)
new_train = unimp_feat_eng(train)
print("Number of features in train set after feature engineering: " + str(new_train.shape[1]))

Number of features in train set before feature engineering: 80
--------------------------------------------------
Number of features in train set after feature engineering: 53


In [6]:
print("Number of features in test set before feature engineering: " + str(test.shape[1]))
print("-"*50)
test = imp_feat_eng(test)
new_test = unimp_feat_eng(test)
print("Number of features in test set after feature engineering: " + str(new_test.shape[1]))

Number of features in test set before feature engineering: 79
--------------------------------------------------
Number of features in test set after feature engineering: 52


### One-Hot Encoding

In [7]:
new_train["train"] = 1
new_test["train"] = 0
combined = pd.concat([new_train,new_test], axis = 0)
df = pd.get_dummies(combined, drop_first = True)
train_final = df[df["train"] == 1]
test_final = df[df["train"] == 0]
train_final.drop("train", axis = 1, inplace = True)
test_final.drop("train", axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
print("Number of features in train set after one-hot encoding for train: " + str(train_final.shape[1]))
print("-"*60)
print("Number of features in test set after one-hot encoding for test: " + str(test_final.shape[1]))

Number of features in train set after one-hot encoding for train: 164
------------------------------------------------------------
Number of features in test set after one-hot encoding for test: 164


#### Quality and Condition Features we might consider dropping since there are OverallQual and OverallCond