In [None]:
# to measure the quality of the model using 3 different approaches of dealing missing data
# 1) Dropping the missing data
# 2) Using imputated values of the missing data
# 3) Extended imputation, additional column showcasing the loction of the imputed data

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv(r"C:\Users\VI00592\GitHub\kaggle_learn\Intermediate Machine Learning\Missing numbers\melb_data.csv\melb_data.csv")

#select target
y = data.Price

#assigning predictor(features) restricting to only numerical predictors
mel_predictors = data.drop(["Price"], axis = 1)
X = mel_predictors.select_dtypes(exclude = ["object"])

#divide data into training and validation sets
X_t, X_v ,y_t ,y_v = train_test_split(X, y, train_size= 0.8, test_size=0.2, random_state=0)

X_t.head()


Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [59]:
# function to measure the mean absolute error of the models
# randomforestregression model

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

#function to compare different approaches
def score_dataset(X_t, X_v, y_t, y_v):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_t,y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)
    

In [61]:
#Score from Approach 1 (Drop Columns with Missing Values)

cols_with_missing = [col for col in X_t.columns
                     if X_t[col].isnull().any()]
X_t.head()
# #drop from training data
reduced_X_t = X_t.drop(cols_with_missing, axis=1)
reduced_X_v = X_v.drop(cols_with_missing, axis=1)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_t, reduced_X_v, y_t, y_v))
# y_v.head()

MAE from Approach 1 (Drop columns with missing values):
183550.22137772635


In [64]:
#score of Approach 2 - imputation

from sklearn.impute import SimpleImputer

#Imputation
my_imputer = SimpleImputer() 
imputed_X_t = pd.DataFrame(my_imputer.fit_transform(X_t))
imputed_X_v = pd.DataFrame(my_imputer.fit_transform(X_v))

#imputation removes the columns names, add the column from the original test and validation data

imputed_X_t.columns = X_t.columns
imputed_X_v.columns = X_v.columns

print("MAE from Approach 2 (imputation):")
print(score_dataset(imputed_X_t, imputed_X_v, y_t, y_v))

                           

MAE from Approach 2 (imputation):
179816.89508731329


In [73]:
#score of Approach 3 - Extention of imputation

#making copy of orginal data to avoid changes
X_t_plus = X_t.copy()
X_v_plus = X_v.copy()

#adding column to indicate what will be imputed
for col in cols_with_missing:
    X_t_plus[col + "_was_missing"] = X_t_plus[col].isnull()
    X_v_plus[col + "_was_missing"] = X_v_plus[col].isnull()

#imputation
my_imputer = SimpleImputer()
imputed_X_t_plus = pd.DataFrame(my_imputer.fit_transform(X_t_plus))
imputed_X_v_plus = pd.DataFrame(my_imputer.fit_transform(X_v_plus))

#imputation removes the column names, so put it back ---
imputed_X_t_plus.columns = X_t_plus.columns
imputed_X_v_plus.columns = X_v_plus.columns

print("MAE from Approach 3 (extension of imputation):")
print(score_dataset(imputed_X_t_plus,imputed_X_v_plus,y_t,y_v))

imputed_X_t_plus.head()

MAE from Approach 3 (extension of imputation):
179986.2708570026


Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount,Car_was_missing,BuildingArea_was_missing,YearBuilt_was_missing
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0,0.0,1.0,0.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0,0.0,1.0,1.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0,0.0,1.0,1.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0,0.0,1.0,0.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0,0.0,0.0,0.0


In [74]:
imputed_X_t.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0
