In [15]:
import pandas as pd
import numpy as np

In [16]:
# Read the raw CSV. `data` names the dataset and is reused below when the
# derived files are written.
data = 'train'
raw_path = f'../rawData/{data}.csv'
raw_df = pd.read_csv(raw_path)

In [17]:
# Following the advice in http://jse.amstat.org/v19n3/decock.pdf, rows whose
# GrLivArea is above 4000 are treated as outliers and removed.
within_area_limit = raw_df['GrLivArea'].le(4000)
raw_df = raw_df.loc[within_area_limit]

In [18]:
# These columns are removed from the raw frame itself, so neither `raw_df` nor
# the working copy contains them from here on.
# NOTE(review): presumably dropped because they are mostly missing - confirm.
dropped_cols = ['PoolQC', 'MiscFeature', 'Alley']
raw_df = raw_df.drop(columns=dropped_cols)

# `clean_df` is the frame that gets imputed and encoded; `raw_df` keeps the
# original (un-imputed) values.
clean_df = raw_df.copy()

In [19]:
# Columns that contain at least one missing value.
# The first three entries are the columns dropped from the frame in the cell
# above. This list is not referenced by the other cells visible in this notebook.
col_na = [
    'PoolQC', 'MiscFeature', 'Alley',
    'Fence', 'FireplaceQu', 'LotFrontage',
    'GarageQual', 'GarageFinish', 'GarageYrBlt', 'GarageType', 'GarageCond',
    'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
    'MasVnrArea', 'MasVnrType',
    'Electrical',
]

In [20]:
# Fill missing LotFrontage with the median LotFrontage of the house's
# neighborhood (median, not mean).

# Series indexed by Neighborhood -> median(LotFrontage), computed on the raw data.
neigh_median = raw_df.groupby('Neighborhood')['LotFrontage'].median()

# Look up each row's neighborhood median; fillna only uses it where the
# current value is missing.
frontage_fallback = raw_df['Neighborhood'].map(neigh_median)
clean_df['LotFrontage'] = clean_df['LotFrontage'].fillna(frontage_fallback)

In [21]:
# Where GarageYrBlt is missing, use the row's YearBuilt instead.
clean_df['GarageYrBlt'] = clean_df['GarageYrBlt'].fillna(clean_df['YearBuilt'])

# A missing MasVnrArea is replaced by 0.
clean_df['MasVnrArea'] = clean_df['MasVnrArea'].fillna(0)

In [22]:
# Every column that still has missing values gets them replaced by the
# column's most frequent observed value.
for col in clean_df.columns[clean_df.isna().any()]:
    is_missing = clean_df[col].isna()
    # value_counts() ignores NaN and sorts by descending frequency, so the
    # first index entry is the modal value.
    most_common = clean_df[col].value_counts().index[0]
    clean_df.loc[is_missing, col] = most_common

# Formatting

- Continuous features remain unchanged
- Ordinal categorical features have integer labels
- Non-ordinal categorical features are dummified

In [23]:
# Feature groups, following the formatting plan described above.

# Continuous features (left as they are).
contin = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
]

# Ordinal categorical features (represented by integer labels).
ord_cat = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'HouseStyle',
    'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
    'Functional',
    'Fireplaces', 'FireplaceQu',
    'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond',
    'Fence',
    'MoSold', 'YrSold',
]

# Non-ordinal categorical features (one-hot encoded further down).
cat = [
    'MSSubClass', 'MSZoning', 'Street', 'LotConfig', 'Neighborhood',
    'Condition1', 'Condition2',
    'BldgType',
    'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'CentralAir', 'Electrical',
    'GarageType', 'PavedDrive',
    'SaleType', 'SaleCondition',
]

## Ordinal categorical

In [24]:
# Label -> integer code tables for the ordinal features.
# Where a NaN key is present, a missing value is encoded as 0.
# NOTE: NaN never compares equal to itself, so a plain dict lookup on the NaN
# key only succeeds when the looked-up object is the very same `np.nan` object.

# ExterQual, ExterCond, BsmtQual, BsmtCond, HeatingQC, KitchenQual,
# FireplaceQu, GarageQual, GarageCond
mapping1 = {
    np.nan: 0,
    'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5,
}

# BsmtExposure
mapping2 = {
    np.nan: 0,
    'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4,
}

# LotShape
mapping3 = {
    np.nan: 0,
    'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4,
}

# LandContour
mapping4 = {
    np.nan: 0,
    'Lvl': 1, 'Bnk': 2, 'HLS': 3, 'Low': 4,
}

# Utilities
mapping5 = {
    np.nan: 0,
    'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4,
}

# LandSlope (no code for missing values)
mapping6 = {
    'Gtl': 1, 'Mod': 2, 'Sev': 3,
}

# HouseStyle
mapping7 = {
    np.nan: 0,
    '1Story': 1, '1.5Unf': 2, '1.5Fin': 3,
    '2Story': 4, '2.5Unf': 5, '2.5Fin': 6,
    'SFoyer': 7, 'SLvl': 8,
}

# BsmtFinType1, BsmtFinType2
mapping8 = {
    np.nan: 0,
    'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6,
}

# Functional
mapping9 = {
    np.nan: 0,
    'Typ': 1, 'Min1': 2, 'Min2': 3, 'Mod': 4,
    'Maj1': 5, 'Maj2': 6, 'Sev': 7, 'Sal': 8,
}

# GarageFinish
mapping10 = {
    np.nan: 0,
    'Unf': 1, 'RFn': 2, 'Fin': 3,
}

# Fence
mapping11 = {
    np.nan: 0,
    'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4,
}

In [25]:
def encode_ordinal(values, mapping):
    """Translate the labels in ``values`` into the integer codes of ``mapping``.

    Parameters
    ----------
    values : pandas.Series
        Labels to encode; may contain missing values.
    mapping : dict
        Label -> integer code. A NaN key, when present, supplies the code used
        for missing values.

    Returns
    -------
    numpy.ndarray
        int64 codes, in the same row order as ``values``.

    Raises
    ------
    KeyError
        If ``values`` contains a label that ``mapping`` does not cover, or a
        missing value while ``mapping`` has no NaN key.
    """
    # Separate the real labels from the NaN key: NaN != NaN, so relying on a
    # dict lookup to hit the NaN key only works by object identity.
    levels = {label: code for label, code in mapping.items() if not pd.isna(label)}
    codes = values.map(levels)  # labels absent from `levels` become NaN

    if len(levels) != len(mapping):
        nan_code = next(code for label, code in mapping.items() if pd.isna(label))
        codes = codes.mask(values.isna(), nan_code)

    unmapped = codes.isna()
    if unmapped.any():
        unknown = values.loc[unmapped].unique().tolist()
        raise KeyError(f'column {values.name!r}: no ordinal code for {unknown!r}')

    return codes.astype('int64').to_numpy()


# Columns that share the quality / condition scale of `mapping1`.
quality_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

# Which code table applies to which ordinal column.
ordinal_maps = {
    'LotShape': mapping3,
    'LandContour': mapping4,
    'Utilities': mapping5,
    'BsmtExposure': mapping2,
    'Functional': mapping9,
    'GarageFinish': mapping10,
    'Fence': mapping11,
    'LandSlope': mapping6,
    'HouseStyle': mapping7,
    'BsmtFinType1': mapping8,
    'BsmtFinType2': mapping8,
}
ordinal_maps.update(dict.fromkeys(quality_cols, mapping1))

# The labels are read from `raw_df`, not `clean_df`: the modal fill applied to
# `clean_df` earlier has replaced the missing values there, whereas here a
# missing value must be encoded with the mapping's NaN code.
# An unknown label now raises for every column instead of being printed and
# skipped, which used to leave the column un-encoded.
for col, mapping in ordinal_maps.items():
    clean_df[col] = encode_ordinal(raw_df[col], mapping)

In [26]:
# Snapshot of the frame before the one-hot encoding below.
# NOTE: the row index is written as the first CSV column here (index=True is
# the to_csv default), unlike the final `_cleaned` export, which omits it.
not_dum_path = f'../derivedData/{data}_NotDum.csv'
clean_df.to_csv(not_dum_path, index=True)

## Non-ordinal categorical

In [27]:
# One-hot encode every column listed in `cat`.
# Each encoded column is removed and its indicator columns, named
# '<column>.<level>', are appended after the remaining columns in `cat` order.
clean_df = pd.get_dummies(clean_df, columns=cat, prefix_sep='.')

# Pushing clean data

In [28]:
# Final export of the fully encoded frame, written without the row index.
cleaned_path = f'../derivedData/{data}_cleaned.csv'
clean_df.to_csv(cleaned_path, index=False)