<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup-and-Data-Import" data-toc-modified-id="Setup-and-Data-Import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup and Data Import</a></span><ul class="toc-item"><li><span><a href="#Exploring" data-toc-modified-id="Exploring-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Exploring</a></span></li><li><span><a href="#Organizing" data-toc-modified-id="Organizing-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Organizing</a></span></li><li><span><a href="#Imputing-Values-Not-Actually-Missing" data-toc-modified-id="Imputing-Values-Not-Actually-Missing-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Imputing Values Not Actually Missing</a></span></li><li><span><a href="#Cleaning-Missing-Values" data-toc-modified-id="Cleaning-Missing-Values-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Cleaning Missing Values</a></span></li></ul></li></ul></div>

## Setup and Data Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Lift pandas' row cap so displayed frames and series are never truncated.
pd.set_option('display.max_rows', None)

In [2]:
# Load the training data; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

FileNotFoundError: [Errno 2] File train.csv does not exist: 'train.csv'

### Exploring

In [None]:
# Show the frame's dimensions first, then the full set of column labels.
for _heading, _payload in (
    ('Dataframe shape:', df.shape),
    ('\nColumns:\n', df.columns),
):
    print(_heading, _payload)

### Organizing

In [None]:
# Column names bucketed by the aspect of the property they describe.
# Each list can be used directly as a column selector, e.g. df[basement].

# Basement quality, finish, areas and bathrooms
basement = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinSF1',
    'BsmtFinType2', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
]

# Roof, cladding, masonry veneer and foundation
exterior = [
    'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'MasVnrArea',
    'ExterQual', 'ExterCond',
    'Foundation',
]

# Decks, porches and pool
ext_addons = [
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'PoolQC',
]

# Garage
garage = [
    'GarageType', 'GarageYrBlt', 'GarageFinish',
    'GarageCars', 'GarageArea',
    'GarageQual', 'GarageCond',
]

# Dwelling class, style, age and miscellaneous features
general = [
    'MSSubClass', 'BldgType', 'HouseStyle',
    'YearBuilt', 'YearRemodAdd',
    'MiscFeature', 'MiscVal',
]

# Above-ground floor areas and room counts
interior = [
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
    'TotRmsAbvGrd',
]

# Lot geometry, access and fencing
lot = [
    'LotFrontage', 'LotArea',
    'Street', 'Alley',
    'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
    'PavedDrive', 'Fence',
]

# Zoning and surroundings
neighborhood = ['MSZoning', 'Neighborhood', 'Condition1', 'Condition2']

# Overall ratings
overall = ['OverallQual', 'OverallCond', 'Functional']

# Sale details, including the SalePrice column
sale = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']

# Utilities, heating, electrical and fireplaces
utilities = [
    'Utilities',
    'Heating', 'HeatingQC', 'CentralAir',
    'Electrical',
    'Fireplaces', 'FireplaceQu',
]

### Imputing Values Not Actually Missing

In [None]:
def impute(cols, frame=None):
    """Fill NaNs in the given columns with the string 'N/A', in place.

    Parameters
    ----------
    cols : list
        Column labels whose NaNs should be replaced.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used,
        which is what existing ``impute(cols)`` calls rely on.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    # Resolve the target lazily so the global `df` is only needed when no frame is passed.
    target = df if frame is None else frame
    target[cols] = target[cols].fillna('N/A')
# Columns where, per this section, a NaN is treated as "feature not present" rather than
# as a missing measurement.
# NOTE(review): GarageYrBlt looks like a numeric year column; filling it with a string
# presumably leaves it with mixed value types - confirm that is intended.
impute([
    'Alley', 'FireplaceQu',
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
])

def impute_bsmt(cols, frame=None):
    """Fill basement NaNs with 'N/A', except on rows that do have a basement.

    If an observation has TotalBsmtSF > 0, a NaN in any of ``cols`` does not
    mean "no basement", so that row is left exactly as it is. On every other
    row, NaNs in ``cols`` are replaced with the string 'N/A'.

    Parameters
    ----------
    cols : list-like
        Basement column labels to process.
    frame : pandas.DataFrame, optional
        Frame to modify; it must contain a ``TotalBsmtSF`` column. When
        omitted, the notebook-level ``df`` is used, which is what existing
        ``impute_bsmt(cols)`` calls rely on.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    # Resolve the target lazily so the global `df` is only needed when no frame is passed.
    target = df if frame is None else frame
    block = target[cols]

    # Rows with a basement area recorded but a gap in some basement column:
    # here NaN != No Basement. TotalBsmtSF is read from the full frame so the
    # check does not depend on it being listed in `cols`.
    has_basement_gap = block.isna().any(axis=1) & (target['TotalBsmtSF'] > 0)

    filled = block[~has_basement_gap].fillna('N/A')
    left_as_is = block[has_basement_gap]

    # Column assignment aligns rows on the index, so the order in which the
    # two pieces are stacked does not matter.
    target[cols] = pd.concat([filled, left_as_is])
# `basement` already holds the labels; pass it directly instead of building a
# throwaway sub-frame only to read its column index back.
impute_bsmt(basement)

### Cleaning Missing Values

In [None]:
# How many rows still contain at least one NaN?
print('Number of rows with NaN:', int(df.isna().any(axis=1).sum()), '\n')

# Sub-frame holding only the columns that still contain at least one NaN
cols_na = df[df.columns[df.isna().any()]]
print('Number of cols with NaN:', cols_na.shape[1], '\n')
print('Columns with NaN:\n', list(cols_na.columns))

# The gaps sit mostly in the basement and garage columns, so they are not missing at random:
# the values are absent because the house doesn't have a basement or garage.

In [None]:
# Only the last expression of a cell is rendered automatically, so each intermediate
# lookup is passed to display() (a builtin inside IPython/Jupyter kernels); otherwise
# its result is computed and silently thrown away.

# LotFrontage: no obvious influence, potential relationships with other variables should be explored.
# Also unlikely that the value should be 0; the lowest value in LotFrontage is 21.
display(len(df.loc[df.LotFrontage.isna(), lot]))

# MAR? Could these be None, since the same observations are also NaN in MasVnrArea (mostly 0 values)?
display(df.loc[df.MasVnrType.isna(), exterior])
display(df.loc[df.MasVnrArea.isna(), exterior])

# MCAR? Not obvious that any other features would influence reporting of this feature.
display(df.loc[df.BsmtExposure.isna(), basement])
# MCAR? BsmtFinType2 is missing but BsmtFinSF2 has a value, meaning BsmtFinType2 should have a rating.
display(df.loc[df.BsmtFinType2.isna(), basement])

# Electrical: the rows with a missing value (transposed for readability), then the
# distribution of the recorded categories.
display(df[df.Electrical.isna()].T)
display(df.Electrical.value_counts())

# Construction years of the houses whose Electrical value is 'FuseA'
df.loc[df.Electrical == 'FuseA', 'YearBuilt'].value_counts()


In [None]:
# Every column whose label contains 'Bsmt'
df.filter(like='Bsmt')