In [1]:
import os

import pandas as pd

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline

# Read the Kaggle home-price training and test CSVs into DataFrames.
# Per the pandas docs, parse_dates=True only asks pandas to try parsing the
# index as dates; it does not convert any data column by itself.
df_train = pd.read_csv(
    '~/git/homeprices_kaggle/train.csv',
    parse_dates=True,
)
df_test = pd.read_csv(
    '~/git/homeprices_kaggle/test.csv',
    parse_dates=True,
)



# Helper Functions

In [2]:
def rmse(x, y):
    """Return the root-mean-squared error between ``x`` and ``y`` as a float.

    ``x`` and ``y`` must support elementwise subtraction, and the squared
    difference must expose ``.mean()`` (e.g. NumPy arrays or pandas Series).

    The square root is taken with ``np.sqrt`` because ``math`` is not imported
    in the notebook's import cell, so the previous ``math.sqrt`` call raised
    ``NameError`` as soon as the function was used.
    """
    mean_squared_error = ((x - y) ** 2).mean()
    # float() keeps the plain-Python-float return that math.sqrt would have given.
    return float(np.sqrt(mean_squared_error))


# EDA

In [21]:
# Goal: predict the sale price of the homes in the test dataset.

# First step: list the training columns that contain at least one null,
# so we know which ones need handling.
df_train.columns[df_train.isna().any(axis=0)]

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence'],
      dtype='object')

In [4]:
# QUESTION: quite a few columns contain nulls -- how many nulls are in each?
# SOLUTION: copy all of those columns into their own frame so the missing
#           values can be analysed separately from df_train.

missingHousingData = df_train.loc[
    :,
    [
        'LotFrontage',
        'Alley',
        'MasVnrType',
        'MasVnrArea',
        'BsmtQual',
        'BsmtCond',
        'BsmtExposure',
        'BsmtFinType1',
        'BsmtFinType2',
        'Electrical',
        'FireplaceQu',
        'GarageType',
        'GarageYrBlt',
        'GarageFinish',
        'GarageQual',
        'GarageCond',
        'PoolQC',
        'Fence',
        'MiscFeature',
    ],
].copy()


In [6]:
# Preview the first five rows of the missingHousingData frame.
missingHousingData.head(n=5)

Unnamed: 0,LotFrontage,Alley,MasVnrType,MasVnrArea,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65.0,,BrkFace,196.0,Gd,TA,No,GLQ,Unf,SBrkr,,Attchd,2003.0,RFn,TA,TA,,,
1,80.0,,,0.0,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,1976.0,RFn,TA,TA,,,
2,68.0,,BrkFace,162.0,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,2001.0,RFn,TA,TA,,,
3,60.0,,,0.0,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,1998.0,Unf,TA,TA,,,
4,84.0,,BrkFace,350.0,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,2000.0,RFn,TA,TA,,,


# Examining the missing data -- MiscFeature and corresponding MiscVal columns.

In [10]:
# Count the null entries in each column of the subset.
missingHousingData.isna().sum(axis=0)

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [19]:
# Some columns are missing far more values than others; express the null
# counts as a fraction of the total number of rows.
missingHousingData.isnull().sum().div(len(missingHousingData))

LotFrontage     0.177397
Alley           0.937671
MasVnrType      0.005479
MasVnrArea      0.005479
BsmtQual        0.025342
BsmtCond        0.025342
BsmtExposure    0.026027
BsmtFinType1    0.025342
BsmtFinType2    0.026027
Electrical      0.000685
FireplaceQu     0.472603
GarageType      0.055479
GarageYrBlt     0.055479
GarageFinish    0.055479
GarageQual      0.055479
GarageCond      0.055479
PoolQC          0.995205
Fence           0.807534
MiscFeature     0.000000
dtype: float64

In [None]:
# Several columns have more than 25% of their data missing; let's see what's in those before we drop them.
# MiscFeature could be random and unique per row, e.g. something like 'english tea parlor room'.

In [16]:
# List every distinct value that appears in MiscFeature.
missingHousingData['MiscFeature'].unique()

array([nan, 'Shed', 'Gar2', 'Othr', 'TenC'], dtype=object)

In [17]:
# Are the values all one-offs? No: 'Shed' occurs 49 times, a decent amount.
# This column is probably worth keeping, since having it populated likely
# raises the sale price.
missingHousingData.loc[:, 'MiscFeature'].value_counts()

Shed    49
Gar2     2
Othr     2
TenC     1
Name: MiscFeature, dtype: int64

In [28]:
# From data_description.txt we know the original training set has a MiscVal
# column holding the value of the MiscFeature item. Those have to impact sale
# price, so MiscFeature stays in df_train and only leaves the
# missingHousingData working frame.
# TODO: fill the MiscFeature nulls in df_train with an 'NA' string for now:
#df_train['MiscFeature'].fillna('NA', inplace=True)

# Rebind rather than drop in place, and tolerate an already-removed column, so
# re-running this cell does not raise KeyError.
missingHousingData = missingHousingData.drop(columns=['MiscFeature'], errors='ignore')
missingHousingData.head()

Unnamed: 0,LotFrontage,Alley,MasVnrType,MasVnrArea,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond,PoolQC,Fence
0,65.0,,BrkFace,196.0,Gd,TA,No,GLQ,Unf,SBrkr,,Attchd,2003.0,RFn,TA,TA,,
1,80.0,,,0.0,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,1976.0,RFn,TA,TA,,
2,68.0,,BrkFace,162.0,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,2001.0,RFn,TA,TA,,
3,60.0,,,0.0,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,1998.0,Unf,TA,TA,,
4,84.0,,BrkFace,350.0,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,2000.0,RFn,TA,TA,,


# Execute Helper functions / transform data

# Hyperparameter and rf or regression execution

# Post Model analysis

# submission file generation