In [14]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from scipy.stats import norm
from scipy import stats

In [15]:
# Load the training and test sets from CSV files under data/.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
# Stack train and test into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. sort=False keeps the column order
# of the first frame, matching what append did.
full_df = pd.concat([train, test], ignore_index=True, sort=False)

In [16]:
# Pool - count the rows where PoolQC is missing AND PoolArea is 0.
# If that covers every missing PoolQC, the missing quality simply means
# "no pool" and can safely be filled with 'No'.
no_pool_rows = train["PoolQC"].isnull() & train["PoolArea"].eq(0)
train.loc[no_pool_rows].shape

(1453, 81)

In [17]:
# Fill missing pool quality with 'No' and missing pool area with 0.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form warns on pandas 2.2 and is a
# silent no-op under Copy-on-Write (pandas 3).
train["PoolQC"] = train["PoolQC"].fillna("No")
train["PoolArea"] = train["PoolArea"].fillna(0)


In [18]:
# MiscFeature - replace NA with 'None' when no miscellaneous feature is recorded.
# Explicit assignment (rather than chained fillna(inplace=True)) so the fill
# actually reaches `train` under pandas Copy-on-Write.
train['MiscFeature'] = train['MiscFeature'].fillna('None')

In [19]:
# Alley - fill missing values with the explicit category 'None'.
# NOTE(review): presumably NA here means "no alley access" - confirm against
# the dataset's data description.
# Explicit assignment (rather than chained fillna(inplace=True)) so the fill
# actually reaches `train` under pandas Copy-on-Write.
train['Alley'] = train['Alley'].fillna('None')

In [20]:
# Fence - fill missing values with the explicit category 'None'.
# NOTE(review): presumably NA here means "no fence" - confirm against the
# dataset's data description.
# Explicit assignment (rather than chained fillna(inplace=True)) so the fill
# actually reaches `train` under pandas Copy-on-Write.
train['Fence'] = train['Fence'].fillna('None')

In [21]:
# FireplaceQu - fill missing values with the explicit category 'None'.
# NOTE(review): presumably NA here means "no fireplace" - confirm against the
# dataset's data description.
# Explicit assignment (rather than chained fillna(inplace=True)) so the fill
# actually reaches `train` under pandas Copy-on-Write.
train['FireplaceQu'] = train['FireplaceQu'].fillna('None')

In [22]:
# LotFrontage - fill missing values with 0.
# NOTE(review): 0 is used as a placeholder for unknown frontage; confirm this
# is preferable to imputing a typical value.
# Explicit assignment (rather than chained fillna(inplace=True)) so the fill
# actually reaches `train` under pandas Copy-on-Write.
train['LotFrontage'] = train['LotFrontage'].fillna(0)

In [23]:
# GarageYrBlt - where the garage year is missing, fall back to the year the
# house was built (values are taken row by row from YearBuilt).
# Explicit assignment (rather than chained fillna(inplace=True)) so the fill
# actually reaches `train` under pandas Copy-on-Write.
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['YearBuilt'])

In [24]:
# Garage descriptors - fill missing values with the explicit category "None".
# NOTE(review): presumably NA here means "no garage" - confirm against the
# dataset's data description.
# The columns are filled together and assigned back, because chained
# fillna(inplace=True) on a column selection does not update `train` under
# pandas Copy-on-Write.
garage_cols = ["GarageType", "GarageQual", "GarageCond", "GarageFinish"]
train[garage_cols] = train[garage_cols].fillna("None")

In [25]:
# Basement descriptors (BsmtFinType1/2, BsmtExposure, BsmtQual, BsmtCond) -
# fill missing values with the explicit category 'None'.
# NOTE(review): presumably NA here means "no basement" - confirm against the
# dataset's data description.
# The columns are filled together and assigned back, because chained
# fillna(inplace=True) on a column selection does not update `train` under
# pandas Copy-on-Write.
basement_cols = ['BsmtFinType1', 'BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond']
train[basement_cols] = train[basement_cols].fillna('None')



In [29]:
# MasVnrType / MasVnrArea - a missing veneer type becomes 'None' and a
# missing veneer area becomes 0.
# Explicit assignment (rather than chained fillna(inplace=True)) so the fill
# actually reaches `train` under pandas Copy-on-Write.
train['MasVnrType'] = train['MasVnrType'].fillna('None')
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)

In [32]:
# Electrical - fill missing values with the placeholder category 'None'.
# NOTE(review): 'None' becomes its own level once the frame is one-hot
# encoded; confirm this is preferred over imputing the most common value.
# Explicit assignment (rather than chained fillna(inplace=True)) so the fill
# actually reaches `train` under pandas Copy-on-Write.
train['Electrical'] = train['Electrical'].fillna('None')

In [33]:
# Summarise what is still missing: per-column null count and null fraction,
# each sorted from most to least missing, shown side by side.
null_mask = train.isnull()
total = null_mask.sum().sort_values(ascending=False)
# 'Percent' is the fraction of rows that are null (null count / row count);
# it is not scaled by 100.
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(20)

Unnamed: 0,Total,Percent
SalePrice,0,0.0
Heating,0,0.0
RoofStyle,0,0.0
RoofMatl,0,0.0
Exterior1st,0,0.0
Exterior2nd,0,0.0
MasVnrType,0,0.0
MasVnrArea,0,0.0
ExterQual,0,0.0
ExterCond,0,0.0


In [36]:
# Some numerically coded features are really categories, so their codes are
# turned into labelled strings.
def add_prefix(x):
    """Return the string form of ``x`` with the literal prefix 'SC' in front."""
    return "SC{!s}".format(x)

# Convert the numeric MSSubClass codes into 'SC<code>' strings so they are
# treated as categories. The dtype guard makes the cell safe to re-run:
# without it a second execution would produce 'SCSC60', 'SCSC20', ...
if pd.api.types.is_numeric_dtype(train["MSSubClass"]):
    train["MSSubClass"] = train["MSSubClass"].apply(add_prefix)

0    SC60
1    SC20
2    SC60
3    SC70
4    SC60
Name: MSSubClass, dtype: object

In [38]:
# MoSold is the month number; map it to a month abbreviation so the values
# are treated as unordered categories rather than ordered numbers.
# Note: calendar.month_abbr follows the current locale.

import calendar

# The dtype guard makes the cell safe to re-run: without it a second
# execution would index calendar.month_abbr with strings and raise TypeError.
if pd.api.types.is_numeric_dtype(train['MoSold']):
    train['MoSold'] = train['MoSold'].apply(lambda month: calendar.month_abbr[month])

0    Feb
1    May
2    Sep
3    Feb
4    Dec
Name: MoSold, dtype: object

In [41]:
# One-hot encode the frame and rebind `train` to the encoded result.
# No `columns` argument is given, so pandas picks the columns to encode by
# dtype (object/category); numeric columns pass through unchanged.
train = pd.get_dummies(data=train)

In [42]:
# Group the column names by their dtype and print each dtype followed by
# the columns that have it.
column_names = train.columns.to_series()
types = column_names.groupby(train.dtypes).groups
for dtype, dtype_columns in types.items():
    print(dtype, dtype_columns)

uint8 Index(['MSSubClass_SC120', 'MSSubClass_SC160', 'MSSubClass_SC180',
       'MSSubClass_SC190', 'MSSubClass_SC20', 'MSSubClass_SC30',
       'MSSubClass_SC40', 'MSSubClass_SC45', 'MSSubClass_SC50',
       'MSSubClass_SC60',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=294)
int64 Index(['Id', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'Yr