- Explore the distribution of missing values across features to determine a suitable imputation strategy.

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [17]:
# Load the train and test CSVs; the paths are relative to the notebook's working directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [70]:
# List every column name in the training frame.
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [69]:
# Columns that have missing values in the training data
# (the per-column null counts are inspected in later cells).
cols_missing = [
    'LotFrontage',
    'Alley',
    'MasVnrType',
    'MasVnrArea',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
    'GarageType',
    'GarageYrBlt',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]

# Report (rows, columns) for train, then test, one per line.
print(train.shape, test.shape, sep="\n")

(1460, 81)
(1459, 80)


In [7]:
# Dtypes of the columns listed in cols_missing (training frame).
train.dtypes[cols_missing]

LotFrontage     float64
Alley            object
MasVnrType       object
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
FireplaceQu      object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
dtype: object

In [28]:
# Shape of the training frame as (rows, columns).
train.shape

(1460, 81)

In [16]:
# Number of null entries per column of cols_missing in the training set.
train[cols_missing].isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [27]:
# Number of null entries per column of cols_missing in the test set.
test[cols_missing].isnull().sum()

LotFrontage      227
Alley           1352
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinType2      42
Electrical         0
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
dtype: int64

In [47]:
def create_missing_df(df):
    """Summarise the columns of ``df`` that contain at least one null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has any null, indexed by column
        name, with two columns: ``count`` (number of nulls) and ``dtype``
        (the column's dtype). Rows are sorted by ``count``, largest first.
    """
    has_null = df.isnull().any()
    incomplete = df.columns[has_null].tolist()

    null_counts = df[incomplete].isnull().sum()
    col_dtypes = df.dtypes[incomplete]

    # Both Series share the same index (the incomplete column names), so
    # concatenating along axis=1 lines them up side by side.
    summary = pd.concat(
        [null_counts, col_dtypes],
        axis=1,
        keys=['count', 'dtype'],
    )
    return summary.sort_values(by=['count'], ascending=False)
    
# Build and display the missing-value summary for the training set.
train_missing = create_missing_df(train)
train_missing

Unnamed: 0,count,dtype
PoolQC,1453,object
MiscFeature,1406,object
Alley,1369,object
Fence,1179,object
FireplaceQu,690,object
LotFrontage,259,float64
GarageType,81,object
GarageYrBlt,81,float64
GarageFinish,81,object
GarageQual,81,object


In [44]:
# Summarise missing values in the test set with the same helper used for the
# training set, so both tables are built and sorted by one implementation
# rather than a copy-pasted variant of it.
# cols_missing_test is kept (in original column order) in case later cells use it.
cols_missing_test = test.columns[test.isnull().any()].tolist()

# test_missing now holds the summary already sorted by null count, largest first.
test_missing = create_missing_df(test)
test_missing

Unnamed: 0,count,dtype
PoolQC,1456,object
MiscFeature,1408,object
Alley,1352,object
Fence,1169,object
FireplaceQu,730,object
LotFrontage,227,float64
GarageCond,78,object
GarageYrBlt,78,float64
GarageQual,78,object
GarageFinish,78,object


In [68]:
# Category frequencies for two garage columns; dropna=False keeps NaN as its
# own row so the missing count is visible next to the real categories.
for garage_col in ('GarageCond', 'GarageType'):
    print(train[garage_col].value_counts(dropna=False), end="\n\n")

TA     1326
NaN      81
Fa       35
Gd        9
Po        7
Ex        2
Name: GarageCond, dtype: int64

Attchd     870
Detchd     387
BuiltIn     88
NaN         81
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64



In [33]:
# NOTE(review): going by the name, these look like the columns that have nulls
# in test but not in train — confirm against the two missing-value summaries.
cols_test_not_train = [
    'MSZoning',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'KitchenQual',
    'Functional',
    'GarageCars',
    'GarageArea',
    'SaleType',
]

# Dtypes of those columns in the test frame.
test.dtypes[cols_test_not_train]

MSZoning         object
Utilities        object
Exterior1st      object
Exterior2nd      object
BsmtFinSF1      float64
BsmtFinSF2      float64
BsmtUnfSF       float64
TotalBsmtSF     float64
BsmtFullBath    float64
BsmtHalfBath    float64
KitchenQual      object
Functional       object
GarageCars      float64
GarageArea      float64
SaleType         object
dtype: object

# Imputing

In [1]:
import sklearn

In [3]:
# `import sklearn` alone does not load submodules, so attribute access such as
# `sklearn.impute` raises AttributeError; the submodule must be imported explicitly.
# `sklearn.impute` was also only added in scikit-learn 0.20, so fall back to the
# older `Imputer` class when running on 0.19.x.
try:
    from sklearn.impute import SimpleImputer  # scikit-learn >= 0.20
except ImportError:
    # NOTE(review): Imputer's options differ from SimpleImputer's (e.g. it takes an
    # `axis` argument and handles numeric data only) — upgrading scikit-learn is the
    # cleaner fix if categorical imputation is needed.
    from sklearn.preprocessing import Imputer as SimpleImputer

SimpleImputer

In [2]:
# Check the installed scikit-learn version (the sklearn.impute module only exists from 0.20 onward).
sklearn.__version__

'0.19.1'