In [2]:
import pandas as pd
# Load the training data from a CSV in the current working directory.
# pandas' default NA parsing turns the literal string 'NA' into NaN on read,
# so any categorical "NA" level comes back as a missing value.
house = pd.read_csv('./train.csv')

In [4]:
# Per the data description, NA in these features means "feature not present"
# rather than "value unknown".
has_na = [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
# All of them are string-valued categoricals, so give the missing entries an
# explicit 'None' level.
for col in has_na:
    house[col] = house[col].fillna('None')

In [7]:
# Show the labels of the columns that contain at least one missing value.
house.loc[:, house.isna().any()].columns

Index(['LotFrontage', 'MasVnrType', 'MasVnrArea', 'Electrical', 'GarageYrBlt'], dtype='object')

In [8]:
# LotFrontage
# A missing LotFrontage most likely means the lot has no frontage: the R
# version of the dataset has a comparable proportion of zeros and no NaNs.
# Impute the missing entries with 0.
house['LotFrontage'] = house['LotFrontage'].fillna(0)

In [14]:
# MasVnrArea is NA when MasVnrType is also NA. Probably best to use zero for
# the numeric area and the string 'None' for the type.
house['MasVnrArea'] = house['MasVnrArea'].fillna(0)
house['MasVnrType'] = house['MasVnrType'].fillna('None')

In [19]:
# Electrical has only one missing value, so impute it with the column's most
# frequent category. The mode is derived from the data instead of being
# hard-coded (the original analysis found it to be 'SBrkr'), which keeps the
# cell correct if the input file changes. Series.mode() ignores NaN and
# returns the modes sorted, so .iloc[0] is a deterministic choice even on ties.
electrical_mode = house['Electrical'].mode().iloc[0]
house['Electrical'] = house['Electrical'].fillna(electrical_mode)

In [23]:
# GarageYrBlt is NA on the rows where GarageType is 'None'.
garage_year_missing = house['GarageYrBlt'].isna()
# Inspection only: this is not the cell's last expression, so Jupyter does not
# display its result.
house.loc[garage_year_missing, 'GarageType'].value_counts()
# Encoding the gap as 0 would stretch the value range (0 next to e.g. 1985).
# Using the year the house was built keeps the column on a sensible scale.
house.loc[garage_year_missing, 'GarageYrBlt'] = house.loc[garage_year_missing, 'YearBuilt']

In [27]:
# Final check: list any columns that still contain missing values.
house.loc[:, house.isna().any()].columns

Index([], dtype='object')

In [None]:
# Persist the cleaned training data. pandas DataFrames have no `write_csv`
# method (calling it raises AttributeError); the CSV writer is `to_csv`.
# index=False skips the row index, which here is just the default positional
# index from read_csv and would otherwise be written as an extra unnamed column.
house.to_csv('./train_cleaned.csv', index=False)