In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv('melb_data.csv')

# Select target
y = data.Price

# use only numerical predictors
melb_predictors = data.drop(['Price'], axis=1)
X = melb_predictors.select_dtypes(exclude=['object'])

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

print(X_valid)

       Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize   
8505       4       8.0    3016.0       4.0       2.0  2.0     450.0  \
5523       2       6.6    3011.0       2.0       1.0  0.0     172.0   
12852      3      10.5    3020.0       3.0       1.0  1.0     581.0   
4818       3       4.5    3181.0       2.0       2.0  1.0     128.0   
12812      3       8.5    3044.0       3.0       2.0  2.0     480.0   
...      ...       ...       ...       ...       ...  ...       ...   
2664       2       6.4    3011.0       2.0       1.0  1.0      47.0   
8513       4       8.0    3016.0       4.0       2.0  4.0     551.0   
12922      3      10.8    3105.0       3.0       1.0  1.0     757.0   
10761      4       6.2    3039.0       4.0       1.0  3.0     478.0   
2110       2       1.6    3066.0       2.0       1.0  2.0     159.0   

       BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
8505          190.0     1910.0  -37.86100   144.89850         6380.0  
5523 

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


In [6]:
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop columns with missing values):
183550.22137772635


In [9]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print(imputed_X_train)

       Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize   
0        1.0       5.0    3182.0       1.0       1.0  1.0       0.0  \
1        2.0       8.0    3016.0       2.0       2.0  1.0     193.0   
2        3.0      12.6    3020.0       3.0       1.0  1.0     555.0   
3        3.0      13.0    3046.0       3.0       1.0  1.0     265.0   
4        3.0      13.3    3020.0       3.0       1.0  2.0     673.0   
...      ...       ...       ...       ...       ...  ...       ...   
10859    3.0       5.2    3056.0       3.0       1.0  2.0     212.0   
10860    3.0      10.5    3081.0       3.0       1.0  1.0     748.0   
10861    4.0       6.7    3058.0       4.0       2.0  2.0     441.0   
10862    3.0      12.0    3073.0       3.0       1.0  1.0     606.0   
10863    4.0       6.4    3011.0       4.0       2.0  1.0     319.0   

       BuildingArea    YearBuilt  Lattitude  Longtitude  Propertycount  
0        153.764119  1940.000000  -37.85984   144.98670        13240.0  
1

In [11]:
print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from Approach 2 (Imputation):
178166.46269899711


In [12]:
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

In [18]:
# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# X_train_plus.info()
# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE from Approach 3 (An Extension to Imputation):
178927.503183954


In [23]:
# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])
sum(missing_val_count_by_column[missing_val_count_by_column > 0])

(10864, 12)
Car               49
BuildingArea    5156
YearBuilt       4307
dtype: int64


9512