<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup-and-Data-Import" data-toc-modified-id="Setup-and-Data-Import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup and Data Import</a></span></li><li><span><a href="#Organizing" data-toc-modified-id="Organizing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Organizing</a></span><ul class="toc-item"><li><span><a href="#ID-to-Index" data-toc-modified-id="ID-to-Index-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>ID to Index</a></span></li><li><span><a href="#Numerical-Variables-to-Categorical" data-toc-modified-id="Numerical-Variables-to-Categorical-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Numerical Variables to Categorical</a></span></li><li><span><a href="#Impute-LotFrontage" data-toc-modified-id="Impute-LotFrontage-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Impute LotFrontage</a></span></li><li><span><a href="#Columns-by-Data-Type" data-toc-modified-id="Columns-by-Data-Type-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Columns by Data Type</a></span></li></ul></li><li><span><a href="#Cleaning-Missing-Values" data-toc-modified-id="Cleaning-Missing-Values-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Cleaning Missing Values</a></span><ul class="toc-item"><li><span><a href="#See-Missing-Values" data-toc-modified-id="See-Missing-Values-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>See Missing Values</a></span></li><li><span><a href="#Drop-Columns-with-Majority-NaN" data-toc-modified-id="Drop-Columns-with-Majority-NaN-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Drop Columns with Majority NaN</a></span></li><li><span><a href="#Impute-Values-Not-Actually-Missing" data-toc-modified-id="Impute-Values-Not-Actually-Missing-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Impute Values Not Actually Missing</a></span></li><li><span><a href="#Impute-Remaining-Missing-Values" data-toc-modified-id="Impute-Remaining-Missing-Values-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Impute Remaining Missing Values</a></span></li></ul></li><li><span><a href="#Feature-Selection" data-toc-modified-id="Feature-Selection-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Feature Selection</a></span><ul class="toc-item"><li><span><a href="#Numeric-Variables-Correlation-Matrix" data-toc-modified-id="Numeric-Variables-Correlation-Matrix-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Numeric Variables Correlation Matrix</a></span></li><li><span><a href="#Numeric-Variables-Feature-Importances" data-toc-modified-id="Numeric-Variables-Feature-Importances-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Numeric Variables Feature Importances</a></span></li><li><span><a href="#Separating-Target-and-Features" data-toc-modified-id="Separating-Target-and-Features-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Separating Target and Features</a></span></li></ul></li></ul></div>

## Setup and Data Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

import plotly.express as px

pd.options.display.max_rows = None

import os
if not os.path.exists("images"):
    os.mkdir("images")

In [2]:
train_raw = pd.read_csv('../train.csv')
test_raw = pd.read_csv('../test.csv')

train = train_raw.copy()
test = test_raw.copy()

In [3]:
print('Train:', train.shape)
print('Test:', test.shape)

Train: (1460, 81)
Test: (1459, 80)


## Organizing

### ID to Index

In [4]:
def set_df_index(*dfs):
    '''
    Sets the index to the Id column.
    
    Arguments:
        dfs: One or more dataframes.
    
    Returns:
        df(s) altered in place.
    '''    
    
    for df in dfs:
        df.set_index('Id', inplace=True)

In [5]:
set_df_index(train, test)

### Numerical Variables to Categorical

In [6]:
def num_to_cat_variable(*dfs):
    '''
    Converts columns containing date keywords into dtype 'category'.
    
    Arguments:
        dfs: One or more dataframes.
    
    Returns:
        df(s) altered in place.
    '''
    for df in dfs:
        date_cols = list(df.columns[df.columns.str.contains('Year')
                                   | df.columns.str.contains('Yr')
                                   | df.columns.str.contains('Month')                                
                                   | df.columns.str.contains('Mo')])

        df[date_cols] = df[date_cols].astype('str')
        df[date_cols] = df[date_cols].astype('category')
        df[date_cols] = df[date_cols].apply(lambda x: x.cat.add_categories('N/A'))

In [7]:
num_to_cat_variable(train, test)

### Impute LotFrontage
- Assumption: No observation has zero LotFrontage because all properties have 
street access.
- LotFrontage needs to be imputed before separating numeric and categorical variables
(for use of groupby), and before removing columns that have majority NaN (so
that it does not get dropped prematurely).

In [8]:
def impute_lot_frontage(*dfs):
    '''
    Fills the missing values of the LotFrontage column to the means of
    LotFrontage grouped by Neighborhood from the training dataset.
    
    Arguments:
        dfs: One or more dataframes.
    
    Returns:
        df(s) altered in place.
    '''
    
    for df in dfs:
        train_hood_means = dict(train_raw.groupby('Neighborhood').LotFrontage.mean())
        df.LotFrontage = df.LotFrontage.fillna(df.Neighborhood.map(train_hood_means))

In [9]:
impute_lot_frontage(train, test)

### Columns by Data Type

In [10]:
# def separate_dtypes(df):
#     '''  
#     Assigns to two variables a dataframe's numeric columns and categorical
#     columns, respectively. You must assign this function to two variables -
#     the first representing the numeric columns, the second representing the
#     categorical columns.
    
#     Arguments:
#         df: A dataframe. If dataframe has either no numeric or no categorical
#         columns, the associated variable will return a df with only an index.
    
#     Returns:
#         num_cols: A copy of df containing only numerical columns.
#         cat_cols: A copy of df containing only categorical columns.
#     '''
    
#     num_cols = df.select_dtypes(include=[
#         'int', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32',
#         'uint64', 'float', 'float16', 'float32', 'float64']).copy()
    
#     cat_cols = df.select_dtypes(include=['object', 'category', 'bool']).copy()
    
#     return num_cols, cat_cols

In [11]:
# num_train, cat_train = separate_dtypes(train_raw)
# num_test, cat_test = separate_dtypes(test_raw)

## Cleaning Missing Values

### See Missing Values

In [12]:
def missing_val_info(*dfs):
    '''
    Prints the sum of rows with missing values and the names of columns
    containing NaNs with the sum of their NaN values.
    
    Arguments:
        dfs: One or more dataframes.
        
    Returns:
        Printed output.
    '''
    
    for df in dfs:   
        print('Number of rows with NaN:', len(df[df.isna().any(axis=1)]), '\n')
        cols_na = df.loc[:, df.isna().any()] # df with only columns that have missing values

        if (len(df[df.isna().any(axis=1)]) > 0):
            print('Columns with NaN:\n', cols_na.isna().sum())
        
        print('-'*30)

In [13]:
missing_val_info(train, test)

Number of rows with NaN: 1460 

Columns with NaN:
 Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
------------------------------
Number of rows with NaN: 1459 

Columns with NaN:
 MSZoning           4
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageFinish      78
GarageCars    

### Drop Columns with Majority NaN

In [14]:
def drop_cols_majority_nan(*dfs):
    '''
    Removes columns that have more than 90% missing values.
    
    Arguments:
        dfs: One or more dataframes.
    
    Returns:
        df(s) altered in place.
    '''
    
    for df in dfs:
        drop_thresh = df.shape[0] * 0.9
        df = df.dropna(axis=1, how='all', thresh=drop_thresh, inplace=True)

In [15]:
drop_cols_majority_nan(train, test)

In [16]:
# as expected since MiscFeature dropped, Misc Val is mostly not applicable
print(train.MiscVal.value_counts()[0] / train.MiscVal.value_counts().sum())

def drop_misc_val(*dfs):
    '''
    Removes the MiscVal column.
    
    Arguments:
        dfs: One or more dataframes.
    
    Returns:
        df(s) altered in place.
    '''
    
    for df in dfs:
        df.drop('MiscVal', axis=1, inplace=True)

0.9643835616438357


In [17]:
drop_misc_val(train, test)

### Impute Values Not Actually Missing

In [18]:
def impute_not_missing(df, cols):
    '''
    Fills NaN values with the string 'N/A' or the integer 0.
    
    Arguments:
        df: A dataframe.
        cols: A list of column names as strings.
    
    Returns:
        df altered in place.
    '''
    
    cat_dtypes = ['object', 'category']
    num_dtypes = ['int', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16',
                  'uint32', 'uint64', 'float', 'float16', 'float32', 'float64']
    
    cat_cols = df[cols].select_dtypes(cat_dtypes).columns.tolist()
    num_cols = df[cols].select_dtypes(num_dtypes).columns.tolist()
        
    df.fillna({x:'N/A' for x in cat_cols}, inplace=True)
    df.fillna({x:0 for x in num_cols}, inplace=True)

In [19]:
gar_cols = list(train.columns[train.columns.str.contains('Garage')])
impute_not_missing(train, gar_cols)
impute_not_missing(test, gar_cols)

In [20]:
# If observation has TotalBsmtSF > 0, NaN in any Bsmt column != No Basement
def impute_bsmt(df, cols):
    '''
    Fills NaN values with the string 'N/A' or the integer 0, excluding cols
    that have TotalBsmtSF > 0.
    
    Arguments:
        df: A dataframe.
        cols: A list of column names as strings.
    
    Returns:
        df altered in place.
    '''
    
    cat_dtypes = ['object', 'category']
    num_dtypes = ['int', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16',
                  'uint32', 'uint64', 'float', 'float16', 'float32', 'float64']

    cat_cols = df[cols].select_dtypes(cat_dtypes).columns.tolist()
    num_cols = df[cols].select_dtypes(num_dtypes).columns.tolist()

    # Store rows where NaN != No Basement
    to_drop = df[cols][(df[cols].isna().any(axis=1)) & (df[cols].TotalBsmtSF > 0)]
    dropped = df[cols].drop(to_drop.index)
    
    # After dropping rows, fill NaNs 
    dropped.fillna({x:'N/A' for x in cat_cols}, inplace=True)
    dropped.fillna({x:0 for x in num_cols}, inplace=True)
    
    df[cols] = pd.concat([dropped, to_drop])

In [21]:
bsmt = list(train.columns[train.columns.str.contains('Bsmt')])
impute_bsmt(train, bsmt)
impute_bsmt(test, bsmt)

In [22]:
missing_val_info(train, test)

Number of rows with NaN: 11 

Columns with NaN:
 MasVnrType      8
MasVnrArea      8
BsmtExposure    1
BsmtFinType2    1
Electrical      1
dtype: int64
------------------------------
Number of rows with NaN: 32 

Columns with NaN:
 MSZoning         4
Utilities        2
Exterior1st      1
Exterior2nd      1
MasVnrType      16
MasVnrArea      15
BsmtQual         2
BsmtCond         3
BsmtExposure     2
KitchenQual      1
Functional       2
SaleType         1
dtype: int64
------------------------------


### Impute Remaining Missing Values

In [23]:
def impute_mean_mode(*dfs):
    '''
    Fills NaN values with the mean or the mode of each column.
    
    Arguments:
        dfs: One or more dataframes.
    
    Returns:
        df(s) altered in place.
    '''
    
    for df in dfs:
        cat_dtypes = ['object', 'category']
        num_dtypes = ['int', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16',
                      'uint32', 'uint64', 'float', 'float16', 'float32', 'float64']

        cat_cols = df.select_dtypes(cat_dtypes).columns.tolist()
        num_cols = df.select_dtypes(num_dtypes).columns.tolist()

        imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

        df[cat_cols] = imp_mode.fit_transform(df[cat_cols])
        df[num_cols] = imp_mean.fit_transform(df[num_cols])

In [24]:
impute_mean_mode(train, test)

In [25]:
missing_val_info(train, test)

Number of rows with NaN: 0 

------------------------------
Number of rows with NaN: 0 

------------------------------


## Feature Selection

In [30]:
# # Grouping the features by topic
# addons = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
#           'ScreenPorch', 'PoolArea']

# basement = list(train.columns[train.columns.str.contains('Bsmt')])

# exterior = ['RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
#             'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation']

# garage = list(train.columns[train.columns.str.contains('Garage')])

# general = ['MSSubClass', 'BldgType', 'HouseStyle', 'YearBuilt', 'YearRemodAdd']

# interior = ['1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath',
#             'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
#             'TotRmsAbvGrd']

# lot = ['LotArea', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
#        'PavedDrive', 'Fence']

# neighborhood = ['MSZoning', 'Neighborhood', 'Condition1', 'Condition2']

# overall = ['OverallQual', 'OverallCond', 'Functional']

# sale = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']

# generated_sqft = list(train.columns[train.columns.str.contains('SF')])

# utilities = ['Utilities', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
#              'Fireplaces']

### Numeric Variables Correlation Matrix

In [31]:
num_corr_matrix = num_cols.corr()[(num_cols.corr() > np.abs(0.5)) & (num_cols.corr() < 1)
                         ].dropna(how='all').dropna(axis=1, how='all')

px.imshow(img=num_corr_matrix, x=num_corr_matrix.index, y=num_corr_matrix.columns)

NameError: name 'num_cols' is not defined

### Numeric Variables Feature Importances

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

num_tree_model = ExtraTreesClassifier()
num_tree_model.fit(num_cols, train.SalePrice);

In [None]:
feat_importances = pd.Series(num_tree_model.feature_importances_, index=num_cols.columns
                            ).sort_values(ascending=True)

px.bar(x=feat_importances.values, y=feat_importances.index)

In [None]:
len(train)

### Separating Target and Features

In [None]:
# def separate_variables(df):
#     '''Splits a training dataframe into target and feature dataframes.'''
    
#     target = df.SalePrice
#     features = df.drop('SalePrice', 1)
#     return target, features

# y_train, X_train = separate_variables(train)