## Import libraries and Read the data set 

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
# read data from train and test csv files
X_full = pd.read_csv('train.csv',na_values=['','NA'],keep_default_na=False,index_col='Id')
X_test_full = pd.read_csv('test.csv',na_values=['','NA'],keep_default_na=False,index_col='Id')

# remove rows with missing target, separate target from predictors
X_full.dropna(axis=0,subset=['SalePrice'],inplace=True)
y= X_full.SalePrice
X_full.drop(['SalePrice'],axis=1,inplace=True)

# to keep things simple, we will use numerical predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# break off validation set from training data
xtrain,xtest,ytrain,ytest = train_test_split(X,y,train_size=0.8,test_size=0.2,random_state=0)

## Handling Missing Values  : Preliminary Investigation

In [9]:
print(xtrain.shape) 

# missing value count by column
s = xtrain.isnull().sum()
print(s[s>0])

(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


### To compare different approaches to dealing with missing values, you'll use the score_dataset() function. This function reports the mean absolute error (MAE) from a random forest model.

``` python
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
 
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
 
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

## Drop columns with missing values

In [14]:
# get names of columns with missing values
col_miss = [col for col in xtrain.columns
            if xtrain[col].isnull().any()]

# drop columns in training and validation data
reduced_xtrain = xtrain.drop(col_miss,axis=1)
reduced_xtest = xtest.drop(col_miss,axis=1)

In [13]:
print('MAE of Drop columns method : ')
print(score_dataset(reduced_xtrain,reduced_xtest,ytrain,ytest))

MAE of Drop columns method : 
17837.82570776256


## Imputation

In [24]:
from sklearn.impute import SimpleImputer
# imputation
my_imputer = SimpleImputer()
imputed_xtrain = pd.DataFrame(my_imputer.fit_transform(xtrain))
imputed_xtest = pd.DataFrame(my_imputer.fit_transform(xtest))

# imputation removed column names; put them back
imputed_xtrain.columns = xtrain.columns
imputed_xtest.columns = xtest.columns

In [26]:
print('MAE of Imputation method')
print(score_dataset(imputed_xtrain,imputed_xtest,ytrain,ytest))

MAE of Imputation method
18056.85163242009


## If we look at the MAE results, MAE for dropping columns is less than Imputation. Lets proceed with dropping columns method

In [27]:
final_xtrain = xtrain.drop(col_miss,axis=1)
final_xtest = xtest.drop(col_miss,axis=1)

## Define and fit model

In [29]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100,random_state=0)
model.fit(final_xtrain,ytrain)

# get predications and calculate MAE
ypreds= model.predict(final_xtest)
print(f'MAE : {mean_absolute_error(ytest,ypreds)}')

MAE : 17837.82570776256
