In [1]:
import pandas as pd

In [9]:
# Load the raw dataset. `data` names which CSV under ../data/ is read.
data = 'train'
csv_path = '../data/' + data + '.csv'
raw_df = pd.read_csv(csv_path)

In [10]:
# Every column that contains at least one NA value
col_na = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage',
    'GarageQual', 'GarageFinish', 'GarageYrBlt', 'GarageType', 'GarageCond',
    'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
    'MasVnrArea', 'MasVnrType', 'Electrical',
]

# Formatting

- Continuous features remain unchanged
- Ordinal categorical features are encoded as integer labels
- Non-ordinal categorical features are dummified

In [69]:
# ---------------------------- In progress -------------------------------
# Scratch comparison of LabelEncoder and OrdinalEncoder on the LotShape column.

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

le = LabelEncoder()
oe = OrdinalEncoder(categories='auto')

# Fit the label encoder and keep the classes it found.
le.fit(raw_df['LotShape'])
labels = le.classes_

# Fit the ordinal encoder on the same column, passed as a one-column frame,
# and display the categories it inferred (last expression = cell output).
oe.fit(raw_df[['LotShape']])
oe.categories_

# ---------------------------- In progress -------------------------------

[array(['IR1', 'IR2', 'IR3', 'Reg'], dtype=object)]

In [71]:
# Distinct values found in the LandContour column
raw_df.loc[:, 'LandContour'].unique()

array(['Lvl', 'Bnk', 'Low', 'HLS'], dtype=object)

In [78]:
# Feature groups used by the formatting step.

# Continuous features: left unchanged.
contin = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1',
        'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
        '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
        'ScreenPorch', 'PoolArea', 'MiscVal']

# Ordinal categorical features: to be given integer labels.
# The slope column is spelled 'LandSlope' (capital S) in the data; the former
# 'Landslope' spelling matched no column.
ord_cat = ['LotShape', 'LandContour', 'Utilities', 'LandSlope', 'HouseStyle',
           'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
           'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
           'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'BsmtFullBath',
           'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
           'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
           'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageQual',
           'GarageCond', 'PoolQC', 'Fence', 'MoSold', 'YrSold']

# Non-ordinal categorical features: to be dummified.
cat = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotConfig', 'Neighborhood',
      'Condition1', 'Condition2', 'BldgType', 'RoofStyle', 'RoofMatl',
      'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
      'CentralAir', 'Electrical', 'GarageType', 'PavedDrive', 'MiscFeature',
      'SaleType', 'SaleCondition']

In [40]:
# ------------------ Working on a better solution than doing by hand -----------------------

# Replace each ordinal variable with an integer code that increases with
# quality/condition:
#   0                       -> feature absent (missing value)
#   1 .. len(lower_levels)  -> the listed levels, worst to best
#   len(lower_levels) + 1   -> any other value, i.e. the best level
#
# Missing values are detected with pd.isna(): pd.read_csv turns the text 'NA'
# into a missing value by default, so comparing against the string 'NA' alone
# never matches and "feature absent" rows would wrongly receive the best code.


def _encode_ordinal(values, lower_levels):
    """Return a list of integer codes for ``values``.

    Parameters
    ----------
    values : iterable
        Raw level labels (strings) and/or missing values.
    lower_levels : list of str
        Every level except the best one, ordered from worst to best.

    Returns
    -------
    list of int
        0 for a missing value or the literal string 'NA', 1..len(lower_levels)
        for the listed levels, and len(lower_levels) + 1 for anything else.
    """
    best_code = len(lower_levels) + 1
    codes = {level: rank for rank, level in enumerate(lower_levels, start=1)}
    # Still honour the literal string in case the file was read without NA parsing.
    codes['NA'] = 0
    return [0 if pd.isna(value) else codes.get(value, best_code) for value in values]


# Levels below the best one, worst to best, for each column to encode.
ordinal_lower_levels = {
    # BsmtQual : 0 - No basement to 5 - Excellent
    'BsmtQual': ['Po', 'Fa', 'TA', 'Gd'],
    # BsmtCond : 0 - No basement to 5 - Excellent
    'BsmtCond': ['Po', 'Fa', 'TA', 'Gd'],
    # BsmtExposure : 0 - No basement to 4 - Good exposure
    'BsmtExposure': ['No', 'Mn', 'Av'],
    # BsmtFinType1
    'BsmtFinType1': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ'],
    # BsmtFinType2
    'BsmtFinType2': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ'],
    # GarageFinish
    'GarageFinish': ['Unf', 'RFn'],
    # GarageQual
    'GarageQual': ['Po', 'Fa', 'TA', 'Gd'],
    # GarageCond
    'GarageCond': ['Po', 'Fa', 'TA', 'Gd'],
    # Fence (quality)
    'Fence': ['MnWw', 'GdWo', 'MnPrv'],
}

for column, lower_levels in ordinal_lower_levels.items():
    # An integer column has already been encoded by a previous run of this
    # cell; encoding it again would push every value to the best code.
    if pd.api.types.is_integer_dtype(raw_df[column]):
        continue
    raw_df[column] = _encode_ordinal(raw_df[column], lower_levels)

# Imputing NA

# Feature selection

## Drop quasi-empty columns

In [7]:
# Drop the quasi-empty columns. errors='ignore' keeps the cell re-runnable:
# raw_df is reassigned from itself, so on a second run the columns are already
# gone and a plain drop would raise KeyError.
raw_df = raw_df.drop(columns=['PoolQC', 'MiscFeature', 'Alley'], errors='ignore')

## Drop repeated information

In [8]:
# Display the column labels currently present in raw_df
raw_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodD