In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the melb_data CSV from the input directory into a DataFrame
data = pd.read_csv('../input/melb-data/melb_data.csv')

# The Price column is the value the models will predict
y = data['Price']

In [None]:
# Preview the first rows of the freshly loaded data
data.head()

In [None]:
#len(data.Regionname.unique())

In [None]:
# Show each column's dtype to see which columns are text (object) and which are numeric
data.dtypes

In [None]:
import missingno as msno
%matplotlib inline

In [None]:
# Nullity matrix: shows, row by row, where each column has missing values
msno.matrix(data)

In [None]:
# Nullity correlation heatmap: how strongly missingness in one column goes with missingness in another
msno.heatmap(data)

In [None]:
# Bar chart of how complete (non-missing) each column is
msno.bar(data)

In [None]:
from sklearn import preprocessing

# Replace each categorical text column with integer codes, in place.
# One LabelEncoder is fitted per column and kept, so the code <-> label
# mapping of every column stays available.
label_encoders = {}
for column in ('Suburb', 'Type', 'Method', 'SellerG', 'Regionname'):
    encoder = preprocessing.LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Bind the fitted encoders to the individual names this notebook uses.
label_encoder_Suburb = label_encoders['Suburb']
label_encoder_Type = label_encoders['Type']
label_encoder_Method = label_encoders['Method']
label_encoder_SellerG = label_encoders['SellerG']
label_encoder_RegionName = label_encoders['Regionname']

In [None]:
# Re-check dtypes after label encoding: the five encoded columns should no longer be object
data.dtypes

In [None]:
# Preview the data again, now with integer codes in the encoded columns
data.head()

In [None]:
# Predictors are every non-object column except the target. After the label
# encoding above this covers the originally numeric columns plus the five
# encoded ones; columns still stored as text are left out.
melb_predictors = data.drop(columns=['Price'])
X = melb_predictors.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=1000, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, X_valid
        Training and validation feature tables, passed to the model as-is.
    y_train, y_valid
        Matching targets for the two splits.
    n_estimators : int, default 1000
        Number of trees in the forest.
    random_state : int or None, default 0
        Seed for the forest. A fixed seed means two calls on the same data
        return the same score, so differences between approaches reflect the
        preprocessing and not the model's own randomness. Pass ``None`` for
        an unseeded forest.

    Returns
    -------
    float
        Mean absolute error of the predictions on ``X_valid`` against ``y_valid``.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=-1,  # build the trees in parallel on all available cores
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

Score from Approach 1 (Drop Columns with Missing Values)

In [None]:
# Names of the training columns that have at least one missing value
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those same columns from both splits (the choice is made on the training data)
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

Score from Approach 2 (Imputation)

In [None]:
from sklearn.impute import SimpleImputer

# Fill gaps with SimpleImputer's default strategy. The fill values are learned
# from the training split only and then applied unchanged to the validation split.
# The imputer returns plain arrays, so the column names are re-attached when the
# DataFrames are rebuilt (the row index becomes a default 0..n-1 range).
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid), columns=X_valid.columns)

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

Score from Approach 3 (An Extension to Imputation)

In [None]:
# Work on copies so the extra indicator columns never end up in X_train / X_valid
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For every column that has gaps, record which rows were missing before filling them
for col in cols_with_missing:
    X_train_plus[f'{col}_was_missing'] = X_train_plus[col].isna()
    X_valid_plus[f'{col}_was_missing'] = X_valid_plus[col].isna()

# Impute on the widened tables: fit on training, reuse on validation, and
# re-attach the column names that the imputer's array output drops
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus), columns=X_train_plus.columns
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus), columns=X_valid_plus.columns
)

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

So, why did imputation perform better than dropping the columns?
The training data has 10864 rows and 17 columns (the 12 originally numeric columns plus the 5 label-encoded categorical columns), where three columns contain missing data. For each column, less than half of the entries are missing. Thus, dropping the columns removes a lot of useful information, and so it makes sense that imputation would perform better.


 Conclusion
As is common, imputing missing values (in Approach 2 and Approach 3) yielded better results, relative to when we simply dropped columns with missing values (in Approach 1).

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Approach 4: mean imputation with the strategy spelled out.
# This cell reuses X_train, X_valid, y_train and y_valid from the split cell
# earlier in the notebook instead of re-loading, re-encoding and re-splitting
# the CSV: the split is seeded (random_state=0), so repeating the pipeline
# would produce exactly the same tables.
# NOTE: SimpleImputer() already defaults to strategy='mean', so this score is
# expected to match Approach 2 up to any randomness in the model.
imp = SimpleImputer(strategy='mean')
imputed_X_train = pd.DataFrame(imp.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(imp.transform(X_valid), columns=X_valid.columns)

print("MAE from Approach 4 (Imputation with mean):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Approach 5: fill each missing value with the most frequent value of its column.
# This cell reuses X_train, X_valid, y_train and y_valid from the split cell
# earlier in the notebook instead of re-loading, re-encoding and re-splitting
# the CSV: the split is seeded (random_state=0), so repeating the pipeline
# would produce exactly the same tables.
imp = SimpleImputer(strategy='most_frequent')
imputed_X_train = pd.DataFrame(imp.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(imp.transform(X_valid), columns=X_valid.columns)

# Labelled Approach 5 so the output is not confused with the mean-imputation run
print("MAE from Approach 5 (Imputation with most frequent):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

In [None]:
import numpy as np
from sklearn.impute import KNNImputer

# Approach 6: fill each missing value from the 10 nearest training rows,
# all neighbours weighted equally.
# This cell reuses X_train, X_valid, y_train and y_valid from the split cell
# earlier in the notebook instead of re-loading, re-encoding and re-splitting
# the CSV: the split is seeded (random_state=0), so repeating the pipeline
# would produce exactly the same tables.
# NOTE(review): the features are passed in unscaled, so columns with large
# numeric ranges weigh more in the neighbour distance, and label-encoded
# categories are treated as ordinary numbers. Consider scaling first.
imputer = KNNImputer(n_neighbors=10, weights="uniform")
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)

# Labelled Approach 6 so the output is not confused with the SimpleImputer runs
print("MAE from Approach 6 (Imputation by using KNN):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))