<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup-and-Data-Import" data-toc-modified-id="Setup-and-Data-Import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup and Data Import</a></span></li><li><span><a href="#Cleaning-Missing-Values" data-toc-modified-id="Cleaning-Missing-Values-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Cleaning Missing Values</a></span><ul class="toc-item"><li><span><a href="#Examining-Columns-with-Missing-Values" data-toc-modified-id="Examining-Columns-with-Missing-Values-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Examining Columns with Missing Values</a></span></li><li><span><a href="#Dropping-Columns-with-Majority-NaN" data-toc-modified-id="Dropping-Columns-with-Majority-NaN-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Dropping Columns with Majority NaN</a></span></li><li><span><a href="#Imputing-Values-Not-Actually-Missing" data-toc-modified-id="Imputing-Values-Not-Actually-Missing-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Imputing Values Not Actually Missing</a></span></li><li><span><a href="#Imputing-Remaining-Missing-Values" data-toc-modified-id="Imputing-Remaining-Missing-Values-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Imputing Remaining Missing Values</a></span></li></ul></li><li><span><a href="#Feature-Selection" data-toc-modified-id="Feature-Selection-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Feature Selection</a></span><ul class="toc-item"><li><span><a href="#Numeric-Variables-Correlation-Matrix" data-toc-modified-id="Numeric-Variables-Correlation-Matrix-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Numeric Variables Correlation Matrix</a></span></li><li><span><a href="#Categorical-Variables-Feature-Importances" data-toc-modified-id="Categorical-Variables-Feature-Importances-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Categorical Variables Feature Importances</a></span></li><li><span><a href="#Separating-Target-and-Features" data-toc-modified-id="Separating-Target-and-Features-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Separating Target and Features</a></span></li></ul></li></ul></div>

## Setup and Data Import

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px

pd.options.display.max_rows = None

In [2]:
train_raw = pd.read_csv('../train.csv')
test_raw = pd.read_csv('../test.csv')

In [3]:
train_raw.set_index('Id', inplace=True)
test_raw.set_index('Id', inplace=True)

In [4]:
print('Train:', train_raw.shape)
print('Test:', test_raw.shape)

Train: (1460, 80)
Test: (1459, 79)


## Cleaning Missing Values

### Examining Columns with Missing Values

In [5]:
def missing_val_info(df):
    '''Accepts a dataframe and prints a Series containing column names and
    the sum of their NaN values.'''
    
    print('Number of rows with NaN:', len(df[df.isna().any(axis=1)]), '\n')
    cols_na = df.loc[:, df.isna().any()] # df with only columns that have missing values
    
    if (len(df[df.isna().any(axis=1)]) > 0):
        print('Columns with NaN:\n', cols_na.isna().sum())

In [6]:
missing_val_info(train_raw)
print('-' * 28)
missing_val_info(test_raw)

Number of rows with NaN: 1460 

Columns with NaN:
 LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
----------------------------
Number of rows with NaN: 1459 

Columns with NaN:
 MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     

### Dropping Columns with Majority NaN

In [7]:
def drop_cols_majority_nan(df):
    '''Accepts a dataframe and returns a copy of the dataframe minus columns
    that had more than 90% missing values.'''
    
    drop_thresh = df.shape[0] * 0.9
    new_df = df.dropna(axis=1, how='all', thresh=drop_thresh).copy()  
    return new_df

In [8]:
train = drop_cols_majority_nan(train_raw)
print('Train dropped columns:',
      train_raw.columns.difference(train.columns).tolist())
print('Train shape:', train.shape)

print('-' * 96)

test = drop_cols_majority_nan(test_raw)
print('Test dropped columns:',
      test_raw.columns.difference(test.columns).tolist())
print('Test shape:', test.shape)

Train dropped columns: ['Alley', 'Fence', 'FireplaceQu', 'LotFrontage', 'MiscFeature', 'PoolQC']
Train shape: (1460, 74)
------------------------------------------------------------------------------------------------
Test dropped columns: ['Alley', 'Fence', 'FireplaceQu', 'LotFrontage', 'MiscFeature', 'PoolQC']
Test shape: (1459, 73)


In [9]:
# as expected since MiscFeature dropped, Misc Val is mostly not applicable
print(train.MiscVal.value_counts()[0] / train.MiscVal.value_counts().sum())

train.drop('MiscVal', axis=1, inplace=True)
test.drop('MiscVal', axis=1, inplace=True)

0.9643835616438357


### Imputing Values Not Actually Missing

In [10]:
def impute(df, cols):
    '''Accepts a df and list of column names to fill NaN with the string N/A.'''
    
    df[cols] = df[cols].fillna('N/A')

# If observation has TotalBsmtSF > 0, NaN in any Bsmt column != No Basement
def impute_bsmt(df, cols):
    '''Accepts a df and list of column names to fill NaN with the string N/A,
       excluding cols that have TotalBsmtSF > 0.'''

    # Store rows where NaN != No Basement
    to_drop = df[cols][(df[cols].isna().any(axis=1)) & (df[cols].TotalBsmtSF > 0)]
    # After dropping rows, fill NaNs
    dframe = df[cols].drop(to_drop.index).fillna('N/A')
    df[cols] = pd.concat([dframe, to_drop])

In [11]:
impute(train, list(train.columns[train.columns.str.contains('Garage')]))
impute_bsmt(train, list(train.columns[train.columns.str.contains('Bsmt')]))

### Imputing Remaining Missing Values

In [12]:
def missing_cols(df):
    '''Accepts a dataframe and returns a list of its columns containing NaN.'''

    return df.columns[df.isna().any()].tolist()

def impute_categorical_mode(df, cols):
    '''Accepts a df and list of column names to fill NaN in categorical columns
    with the mode of each column.'''
    
    df[cols] = df[cols].fillna(df.mode().iloc[0])

impute_categorical_mode(train, missing_cols(train))
impute_categorical_mode(test, missing_cols(test))

In [13]:
missing_val_info(train)
print('-' * 27)
missing_val_info(test)

Number of rows with NaN: 0 

---------------------------
Number of rows with NaN: 0 



## Feature Selection

In [14]:
# # Grouping the features by topic
# addons = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
#           'ScreenPorch', 'PoolArea']

# basement = list(train.columns[train.columns.str.contains('Bsmt')])

# exterior = ['RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
#             'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation']

# garage = list(train.columns[train.columns.str.contains('Garage')])

# general = ['MSSubClass', 'BldgType', 'HouseStyle', 'YearBuilt', 'YearRemodAdd']

# interior = ['1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath',
#             'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
#             'TotRmsAbvGrd']

# lot = ['LotArea', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
#        'PavedDrive', 'Fence']

# neighborhood = ['MSZoning', 'Neighborhood', 'Condition1', 'Condition2']

# overall = ['OverallQual', 'OverallCond', 'Functional']

# sale = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']

# generated_sqft = list(train.columns[train.columns.str.contains('SF')])

# utilities = ['Utilities', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
#              'Fireplaces']

In [15]:
def num_to_cat_variable(df, cols):
    '''Accepts a df and list of columns to convert into dtype category.'''
    df[cols] = df[cols].astype('category')
    
date_cols = list(train.columns[train.columns.str.contains('Year')
                               | train.columns.str.contains('Yr')
                               | train.columns.str.contains('Mo')])

num_to_cat_variable(train, date_cols)
num_to_cat_variable(test, date_cols)

In [58]:
num_cols = train.select_dtypes(include=['int64', 'float64'])
cat_cols = train.select_dtypes(include=['object', 'category'])

In [22]:
num_cols.columns

Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'SalePrice'],
      dtype='object')

In [24]:
cat_cols.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'MoSold',
       'YrSold', 'SaleType', 'SaleCondition'],
      dtype='object')

### Numeric Variables Correlation Matrix

In [60]:
num_corr_matrix = num_cols.corr()[(num_cols.corr() > np.abs(0.5)) & (num_cols.corr() < 1)
                         ].dropna(how='all').dropna(axis=1, how='all')

px.imshow(img=num_corr_matrix, x=num_corr_matrix.index, y=num_corr_matrix.columns)

### Categorical Variables Feature Importances

In [55]:
from sklearn.ensemble import ExtraTreesClassifier

num_tree_model = ExtraTreesClassifier()
num_tree_model.fit(num_cols, train.SalePrice);

In [56]:
feat_importances = pd.Series(num_tree_model.feature_importances_, index=num_cols.columns
                            ).sort_values(ascending=True)

px.bar(x=feat_importances.values, y=feat_importances.index)

### Separating Target and Features

In [20]:
# def separate_variables(df):
#     '''Splits a training dataframe into target and feature dataframes.'''
    
#     target = df.SalePrice
#     features = df.drop('SalePrice', 1)
#     return target, features

# y_train, X_train = separate_variables(train)