# Data setup

In [40]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Load data
train = pd.read_csv('../data/train.csv')
train_y = train.SalePrice
train_x = train.drop(['SalePrice'], axis=1)
train_x = train_x.select_dtypes(exclude=['object'])


train_x, test_x, train_y, test_y = train_test_split(train_x, 
                                                    train_y,
                                                    train_size=0.7, 
                                                    test_size=0.3, 
                                                    random_state=0)

def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

#### Get Model Score from Dropping Columns with Missing Values

In [39]:
cols_with_missing = [col for col in train_x.columns if train_x[col].isnull().any()]
r_train_x = train_x.drop(cols_with_missing, axis=1)
r_test_x  = test_x.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(r_train_x, r_test_x, train_y, test_y))

Mean Absolute Error from dropping columns with Missing Values:
19199.778082191777


#### Get Model Score from Imputation

In [49]:
from sklearn.preprocessing import Imputer

imputer = Imputer()
imp_train_x = imputer.fit_transform(train_x)
imp_test_x = imputer.transform(test_x)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imp_train_x, imp_test_x, train_y, test_y))

Mean Absolute Error from Imputation:
19002.674200913243


#### Get Score from Imputation with Extra Columns Showing What Was Imputed

In [51]:
imp_train_x_plus = train_x.copy()
imp_test_x_plus = test_x.copy()

cols_with_missing = (col for col in train_x.columns 
                                 if train_x[col].isnull().any())
for col in cols_with_missing:
    imp_train_x_plus[col + '_was_missing'] = imp_train_x_plus[col].isnull()
    imp_test_x_plus[col + '_was_missing'] = imp_test_x_plus[col].isnull()

# Imputation
imputer = Imputer()
imp_train_x_plus = imputer.fit_transform(imp_train_x_plus)
imp_test_x_plus = imputer.transform(imp_test_x_plus)

print("Mean Absolute Error from Imputation while Track What Was Imputed:")
print(score_dataset(imp_train_x_plus, imp_test_x_plus, train_y, test_y))

Mean Absolute Error from Imputation while Track What Was Imputed:
19280.078310502286
