In [55]:
import pandas as pd
import numpy as np

In [87]:
# Load the raw dataset: `data` names the CSV file (without extension) read from ../data/
data = 'train'
raw_df = pd.read_csv(f'../data/{data}.csv')

In [77]:
# Every column that contains at least one NA
col_na = (
    ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']
    + ['GarageQual', 'GarageFinish', 'GarageYrBlt', 'GarageType', 'GarageCond']
    + ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']
    + ['MasVnrArea', 'MasVnrType', 'Electrical']
)

# Formatting

- Continuous features remain unchanged
- Ordinal categorical features are encoded as integer labels
- Non-ordinal categorical features are dummified (one-hot encoded)

In [80]:
# Continuous features
contin = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
]

# Ordinal categorical features
ord_cat = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'HouseStyle',
    'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
    'Functional', 'Fireplaces', 'FireplaceQu',
    'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MoSold', 'YrSold',
]

# Non-ordinal categorical features; list order fixes the order of the dummy columns
cat = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotConfig', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'CentralAir', 'Electrical',
    'GarageType', 'PavedDrive', 'MiscFeature',
    'SaleType', 'SaleCondition',
]

## Ordinal categorical

In [172]:
def _ranked(levels, start=1):
    """Return {level: rank}, ranking `levels` in the given order from `start`."""
    return {level: rank for rank, level in enumerate(levels, start)}


# Each list is written from the lowest integer label to the highest.
# A leading np.nan gives missing values the label 0.
mapping1 = _ranked([np.nan, 'Po', 'Fa', 'TA', 'Gd', 'Ex'], start=0)
mapping2 = _ranked([np.nan, 'No', 'Mn', 'Av', 'Gd'], start=0)
mapping3 = _ranked(['IR3', 'IR2', 'IR1', 'Reg'])
mapping4 = _ranked(['Lvl', 'Bnk', 'HLS', 'Low'])
mapping5 = _ranked(['ELO', 'NoSeWa', 'NoSewr', 'AllPub'])
mapping6 = _ranked(['Gtl', 'Mod', 'Sev'])
mapping7 = _ranked(['1Story', '1.5Unf', '1.5Fin', '2Story', '2.5Unf', '2.5Fin',
                    'SFoyer', 'SLvl'])
mapping8 = _ranked([np.nan, 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], start=0)
mapping9 = _ranked(['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
                   start=0)
mapping10 = _ranked([np.nan, 'Unf', 'RFn', 'Fin'], start=0)
mapping11 = _ranked([np.nan, 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'], start=0)

In [194]:
# Re-read the CSV so the encoding below starts again from the original labels
raw_df = pd.read_csv(f'../data/{data}.csv')

In [195]:
def encode_ordinal(values, mapping):
    """Encode one ordinal column as integer labels.

    Parameters
    ----------
    values : pd.Series
        Column holding the raw level labels (missing entries as NaN).
    mapping : dict
        Level label -> integer label. A NaN key, when present, gives the
        integer assigned to missing entries.

    Returns
    -------
    pd.Series
        Integer labels aligned on the index of `values`.

    Raises
    ------
    ValueError
        If `values` holds a label that is not a key of `mapping`, or a missing
        entry while `mapping` has no NaN key.
    """
    # Handle the NaN key separately so the lookup never depends on how NaN
    # compares or hashes as a dict key.
    levels = {key: code for key, code in mapping.items() if not pd.isna(key)}
    na_codes = [code for key, code in mapping.items() if pd.isna(key)]

    unknown = set(values.dropna().unique()) - set(levels)
    if unknown:
        raise ValueError(
            f'{values.name}: levels without an integer label: '
            f'{sorted(unknown, key=str)}'
        )

    encoded = values.map(levels)
    if na_codes:
        encoded = encoded.fillna(na_codes[0])
    elif encoded.isna().any():
        raise ValueError(f'{values.name}: missing values but the mapping has no NaN key')
    return encoded.astype(int)


# One entry per ordinal column to encode. Each column appears exactly once:
# encoding a column a second time would fail because its labels are already integers.
ordinal_mappings = {
    'LotShape': mapping3,
    'LandContour': mapping4,
    'Utilities': mapping5,
    'BsmtExposure': mapping2,
    'Functional': mapping9,
    'GarageFinish': mapping10,
    'Fence': mapping11,
    'LandSlope': mapping6,
    'HouseStyle': mapping7,
}

# Columns graded on the Po/Fa/TA/Gd/Ex scale
for col in ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
    ordinal_mappings[col] = mapping1

# Basement finish types
for col in ['BsmtFinType1', 'BsmtFinType2']:
    ordinal_mappings[col] = mapping8

for col, mapping in ordinal_mappings.items():
    raw_df[col] = encode_ordinal(raw_df[col], mapping)


## Non-ordinal categorical

In [196]:
# Build the indicator columns of every non-ordinal feature, named '<feature>.<level>',
# then swap them in for the original columns with a single concat.
dummy_frames = []
for col in cat:
    dummy_frames.append(pd.get_dummies(raw_df[col]).add_prefix(f'{col}.'))

raw_df = pd.concat([raw_df.drop(columns=cat)] + dummy_frames, sort=False, axis=1)

## Writing to a CSV file

In [197]:
# Name the export after `data` so a run on another dataset does not overwrite
# the cleaned train file (with data = 'train' this is still ./train_cleaned.csv).
raw_df.to_csv(f'./{data}_cleaned.csv', index=False)

# Imputing NA

# Feature selection

## Drop quasi-empty columns

In [7]:
# Drop the quasi-empty features. 'MiscFeature' and 'Alley' are dummified earlier in
# the notebook into '<feature>.<level>' columns, so match those as well as the plain
# feature name; dropping only the plain names raises KeyError once the dummies
# have replaced them.
quasi_empty = ['PoolQC', 'MiscFeature', 'Alley']


def _derived_from(column, feature):
    """True when `column` is `feature` itself or one of its '<feature>.<level>' dummies."""
    column = str(column)
    return column == feature or column.startswith(feature + '.')


# Keep failing loudly, as a plain drop would, when a feature has no column at all.
unmatched = [feat for feat in quasi_empty
             if not any(_derived_from(name, feat) for name in raw_df.columns)]
if unmatched:
    raise KeyError(f'{unmatched} not found in raw_df columns')

quasi_empty_cols = [name for name in raw_df.columns
                    if any(_derived_from(name, feat) for feat in quasi_empty)]
raw_df = raw_df.drop(columns=quasi_empty_cols)

## Drop repeated information

In [8]:
# Display the remaining column names (presumably to spot features that repeat information)
raw_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodD