In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Loading the data
housing_data = pd.read_csv("train.csv")

#Target selection
y = housing_data.SalePrice

#For simplicity, only using numerical predictors
predictors = housing_data.drop("SalePrice", axis = 1)
X = housing_data.select_dtypes(exclude = ["object"])

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches for missing values
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# Basic Exploratory Data Analysis

In [61]:
X_train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
618,619,20,90.0,11694,9,5,2007,2007,452.0,48,...,0,108,0,0,260,0,0,7,2007,314813
870,871,20,60.0,6600,5,5,1962,1962,0.0,0,...,0,0,0,0,0,0,0,8,2009,109500
92,93,30,80.0,13360,5,7,1921,2006,0.0,713,...,0,0,44,0,0,0,0,8,2009,163500
817,818,20,,13265,8,5,2002,2002,148.0,1218,...,150,59,0,0,0,0,0,7,2008,271000
302,303,20,118.0,13704,7,5,2001,2002,150.0,0,...,468,81,0,0,0,0,0,1,2006,205000


To keep things simple

## Dealing with missing values

Three approaches that exist are:
- Drop values i.e. the rows/ columns with missing values:
This is the simplest option, however a lot of information that could be potentially useful is lost
- Imputation:
This fills in the missing values with some number e.g the mean value along each column. This usually leads to more accurate models than dropping the entire column
- Extension to Imputation: impute as before, and for each column with missing entries in the original data, add a new column that shows the location of imputed entries

In [57]:
#number of missing data points per column
missing_value_count = X_train.isna().sum()

percentage_missing = missing_value_count.sum()/np.product(X_train.shape)*100

print("The percentage of missing data is:", percentage_missing)

#Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns if X_train[col].isna().any()]
print(len(cols_with_missing))

The percentage of missing data is: 0.6218457101658255
3


In [60]:
# As a starting point, lets drop columns with missing values, as all rows have 
# missing values

reduced_X_train = X_train.dropna(axis = 1)
reduced_X_valid = X_valid.dropna(axis = 1)

#or

"""
reduced_housing_data = housing_data.drop(cols_with_missing, axis = 1)
"""

# just how much data did we lose?
print("Columns in original dataset: %d \n" % X_train.shape[1])
print("Columns with na's dropped: %d" % reduced_X_train.shape[1])

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

Columns in original dataset: 38 

Columns with na's dropped: 35
MAE from Approach 1 (Drop columns with missing values):
1093.0359589041097


In [41]:
# could use housing_data.fillna(method='bfill', axis=0).fillna(0) to
# replace all NA's the value that comes directly after it in the same column, 
# then replace all the remaining na's with 0