In [14]:
import pandas as pd

# Path of the file to read
iowa_file_path = 'train.csv'

# load data
home_data = pd.read_csv(iowa_file_path)


from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

iow_target = home_data.SalePrice
iow_predictors=home_data.drop(['SalePrice'],axis=1)

# For the sake of keeping the example simple, we'll use only numeric predictors.

iow_numeric_predictors = iow_predictors.select_dtypes(exclude=['object'])





In [35]:
# Split into validation and training data
X_train, X_test, y_train, y_test = train_test_split(iow_numeric_predictors, iow_target,train_size=0.7,test_size=0.3,random_state=0)


#function score_dataset(X_train, X_test, y_train, y_test) to compare the quality of diffrent approaches to missing values. This function reports the out-of-sample MAE score from a RandomForest.
def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

In [41]:
#Get Model Score from Dropping Columns with Missing Values

cols_with_missing = [col for col in X_train.columns 
                                 if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test  = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

Mean Absolute Error from dropping columns with Missing Values:
18740.637442922372


In [43]:
#Get Model Score from Imputation

from sklearn.preprocessing import Imputer


my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))


Mean Absolute Error from Imputation:
19006.417123287672


In [28]:
#Get Score from Imputation with Extra Columns Showing What Was Imputed

imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

cols_with_missing = (col for col in X_train.columns 
                                 if X_train[col].isnull().any())
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()

# Imputation
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

print("Mean Absolute Error from Imputation while Track What Was Imputed:")
print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))

Mean Absolute Error from Imputation while Track What Was Imputed:
19311.100913242008
