In [1]:
import pandas as pd

In [2]:
# Load the dataset from melb_data.csv; the path is relative to the notebook's working directory.
data = pd.read_csv('data/melb_data.csv')

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(13580, 21)

In [5]:
# List every column name so the target and candidate features can be chosen.
data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [6]:
# Target: the Price column
y = data['Price']

# Candidate features: every column except the target, keeping only non-object dtypes
melb_predictor = data.drop(columns=['Price'])
X = melb_predictor.select_dtypes(exclude=['object'])

In [7]:
# Confirm which feature columns remain after excluding object-dtype columns.
X.columns

Index(['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude',
       'Propertycount'],
      dtype='object')

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Define Function to Measure Quality of Each Approach

In [10]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [11]:
def score_mae(X_train, X_test, y_train, y_test):
    """Return the test-set mean absolute error of a random forest fit on the training set.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target used to evaluate the fitted model.

    Returns
    -------
    The mean absolute error between ``y_test`` and the model's predictions
    for ``X_test``.
    """
    # Fixed random_state so every approach is scored with the same forest configuration.
    forest = RandomForestRegressor(random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_test, forest.predict(X_test))

In [1]:
# Count missing values per column in the training features.
# Requires X_train from the train/test split cell; the saved NameError output comes
# from running this cell before X_train was defined -- re-run it after the split.
X_train.isnull().sum()

NameError: name 'X_train' is not defined

**Approach 1 (Drop Columns with Missing Values)**

In [12]:
# Names of the training columns that contain at least one missing value
col_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both splits so they keep the same feature set
reduced_X_train = X_train.drop(columns=col_with_missing)
reduced_X_test = X_test.drop(columns=col_with_missing)

print('MAE score for Approach 1:')
print(score_mae(reduced_X_train,reduced_X_test,y_train, y_test))


MAE score for Approach 1:
179830.71756552003


**Approach 2 (Imputation)**

We use `SimpleImputer` to replace each missing value with the mean of its column.

In [13]:
import sklearn

In [14]:
# Record the scikit-learn version used for this run.
print(sklearn.__version__)

0.23.2


In [19]:
from sklearn.impute import SimpleImputer

In [20]:
# Mean imputation (SimpleImputer's default strategy, spelled out for readers)
my_imputer = SimpleImputer(strategy='mean')

In [48]:
# Fit the imputer on the training split only, then apply the same fill values to
# the test split. The imputer output is wrapped in a DataFrame; pass columns and
# index explicitly so the result keeps the original feature names and the same
# row labels as y_train / y_test instead of a fresh 0..n-1 index.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [49]:
# The DataFrames built from the imputer output do not carry the original column
# names, so copy them over from the corresponding un-imputed splits.
imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns

In [50]:
# Score the mean-imputed features with the same scoring function and targets as Approach 1.
print('MAE score for Approach 2:')
print(score_mae(imputed_X_train,imputed_X_test,y_train, y_test))

MAE score for Approach 2:
172995.99074417911


We see that `Approach 2` has lower `MAE` than `Approach 1`, so `Approach 2` performed better on this dataset.

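**Approach 3 (An Extension to Imputation)**

We impute missing values as in Approach 2, but first add a boolean `<column>_was_missing` column for every column that had missing values, so the model can see which entries were imputed.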

In [28]:
# Work on copies so that adding indicator columns leaves X_train / X_test untouched.
X_train_plus, X_test_plus = X_train.copy(), X_test.copy()

In [30]:
# Add one boolean "<column>_was_missing" indicator per column listed in
# col_with_missing, recording which entries are missing before imputation.
X_train_plus = X_train_plus.assign(
    **{col + '_was_missing': X_train_plus[col].isnull() for col in col_with_missing}
)
X_test_plus = X_test_plus.assign(
    **{col + '_was_missing': X_test_plus[col].isnull() for col in col_with_missing}
)

In [31]:
# Verify that the *_was_missing indicator columns were appended.
X_train_plus.columns

Index(['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude',
       'Propertycount', 'Car_was_missing', 'BuildingArea_was_missing',
       'YearBuilt_was_missing'],
      dtype='object')

In [51]:
# Impute missing values in the extended feature set. A fresh imputer is created
# because the column set differs from Approach 2 (indicator columns were added).
my_imputer = SimpleImputer()

# Fit on the training split only and reuse the fitted fill values for the test
# split. Pass columns and index explicitly so the resulting frames keep the
# feature names and the same row labels as y_train / y_test instead of a fresh
# 0..n-1 index.
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)
imputed_X_test_plus = pd.DataFrame(
    my_imputer.transform(X_test_plus),
    columns=X_test_plus.columns,
    index=X_test_plus.index,
)

In [52]:
# The DataFrames built from the imputer output do not carry the original column
# names, so copy them over from the corresponding un-imputed frames.
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_test_plus.columns = X_test_plus.columns

In [53]:
# Score imputation plus the missing-value indicator columns. In the saved run this came
# out slightly worse than plain imputation (Approach 2), so the indicators did not help here.
print('MAE score for Approach 3:')
print(score_mae(imputed_X_train_plus,imputed_X_test_plus,y_train, y_test))

MAE score for Approach 3:
173354.4780920822
