# ML Model Pipeline: Model-Building

**Predicting House Sale Price**

The aim of the project is to develop a machine learning model to predict house sale prices using features that describe various aspects of a house.

**Goals**

- To be able to understand the features that affect the price of houses in the Market.
- To determine the significant features on which the house price depend on.
- To develop a model that predicts the price of a house using the selected/important factors.

**Data Source:**
- Ames Housing dataset 
    - This dataset originates from https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
    
**Credits:**
- www.kaggle.com - This project wouldnt have been possible without Ames dataset from www.kaggle.com

# Import Libraries

In [1]:
#libs for data processing
import pandas as pd
import numpy as np

#libs for plotting
import matplotlib.pyplot as plt
import seaborn as sns

#size plot/graph dimensions for matplotlib
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

#Display all the columns
pd.pandas.set_option('display.max_columns',None)

#Display matplotlib output inline.
%matplotlib inline

#Model training
from sklearn.linear_model import Lasso

#suppress warnings
import warnings
warnings.simplefilter(action='ignore')

#Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

#Serializing model using pickle
import pickle


# Read Data

In [2]:
try:
    X_train = pd.read_csv('my_Xtrain.csv')
    X_test = pd.read_csv('my_Xtest.csv')
    print(f'[SUCCESS] Done loading the dataset...')
    
except:
    print(f'Unable to load the dataset!')

[SUCCESS] Done loading the dataset...


##### Visualize the table/Dataframe

In [3]:
#Visualize the X_train table/Dataframe
X_train.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,136,0.0,0.75,0.495064,0.406983,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.541667,0.375,0.6,0.75,0.571429,0.666667,0.625,0.279412,0.639344,0.8,0.333333,0.692308,0.666667,0.5,0.208999,0.333333,1.0,1.0,0.5,0.75,0.25,0.833333,0.0,0.666667,0.0,0.558219,0.213421,1.0,0.75,1.0,1.0,0.611775,0.0,0.0,0.571872,0.0,0.0,0.666667,0.0,0.375,0.5,0.333333,0.416667,1.0,0.333333,0.8,0.833333,0.35514,0.333333,0.5,0.373766,0.6,1.0,1.0,0.114352,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.363636,0.5,0.625,0.6,12.066811
1,1453,0.941176,0.25,0.189077,0.203387,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.166667,0.375,0.6,1.0,0.714286,0.444444,0.5,0.007353,0.032787,0.2,0.333333,0.846154,0.733333,0.5,0.058055,0.333333,1.0,1.0,0.75,0.75,1.0,1.0,0.096917,0.666667,0.0,0.0,0.089525,1.0,0.75,1.0,1.0,0.441306,0.0,0.0,0.412522,0.333333,0.0,0.333333,0.0,0.25,0.5,0.333333,0.25,1.0,0.0,0.0,0.666667,0.009346,1.0,0.5,0.37024,0.6,1.0,1.0,0.0,0.051188,0.0,0.0,0.0,0.0,0.0,0.75,0.5,0.0,0.363636,0.0,0.625,0.6,11.884489
2,763,0.235294,1.0,0.456066,0.370696,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.791667,0.375,0.6,0.75,0.857143,0.666667,0.5,0.007353,0.032787,0.2,0.333333,0.846154,0.733333,0.25,0.0,0.333333,1.0,1.0,0.75,0.75,0.5,1.0,0.004252,0.666667,0.0,0.313356,0.123732,1.0,1.0,1.0,1.0,0.313126,0.379177,0.0,0.542275,0.0,0.0,0.666667,0.5,0.375,0.5,0.666667,0.416667,1.0,0.0,0.0,0.833333,0.009346,0.333333,0.5,0.433004,0.6,1.0,1.0,0.1972,0.082267,0.0,0.0,0.0,0.0,0.0,0.75,0.5,0.0,0.454545,1.0,1.0,0.6,12.279323


In [4]:
#Visualize the X_train table/Dataframe
X_test.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,893,0.0,0.75,0.445638,0.365508,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.291667,0.375,0.6,0.75,0.571429,0.555556,0.875,0.316176,0.065574,0.8,0.333333,0.538462,0.6,0.25,0.0,0.333333,1.0,0.4,0.5,0.75,0.25,1.0,0.11747,0.666667,0.0,0.169521,0.173322,1.0,0.5,1.0,1.0,0.439892,0.0,0.0,0.4112,0.0,0.5,0.333333,0.0,0.375,0.5,0.333333,0.333333,1.0,0.0,0.0,0.833333,0.401869,0.666667,0.25,0.186178,0.6,1.0,1.0,0.224037,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.090909,0.0,0.625,0.6,11.947949
1,1106,0.235294,0.75,0.57018,0.439121,1.0,1.0,0.333333,0.333333,1.0,0.25,0.0,1.0,0.375,0.6,0.75,0.857143,0.777778,0.5,0.117647,0.262295,0.2,0.333333,0.538462,0.6,0.5,0.2627,0.666667,1.0,1.0,1.0,0.75,0.75,1.0,0.182849,0.666667,0.0,0.184503,0.239444,1.0,1.0,1.0,1.0,0.568437,0.543341,0.0,0.728921,0.333333,0.0,0.666667,0.5,0.375,0.5,0.666667,0.583333,1.0,0.666667,0.6,0.833333,0.149533,0.666667,0.5,0.502116,0.6,1.0,1.0,0.217036,0.058501,0.0,0.0,0.0,0.0,0.0,0.75,0.5,0.0,0.272727,1.0,0.625,0.6,12.69158
2,414,0.058824,0.25,0.363044,0.377814,1.0,0.0,0.0,0.333333,1.0,0.0,0.0,0.208333,0.0,0.6,0.75,0.571429,0.444444,0.625,0.610294,1.0,0.2,0.333333,0.384615,0.4,0.25,0.0,0.333333,1.0,0.4,0.5,0.75,0.25,0.833333,0.0,0.666667,0.0,0.431507,0.164975,1.0,0.75,1.0,0.5,0.425446,0.0,0.0,0.397696,0.0,0.0,0.333333,0.0,0.25,0.5,0.333333,0.25,1.0,0.333333,0.8,0.333333,0.775701,0.333333,0.5,0.253879,0.6,1.0,1.0,0.0,0.0,0.235507,0.0,0.0,0.0,0.0,0.75,0.5,0.0,0.181818,1.0,0.625,0.6,11.652687


##### Capture the dependent variable
- Recall: The dependent variable is log transformed.

In [5]:
y_train = X_train['SalePrice']
y_test = X_test['SalePrice']

##### Load Selected features from Feature selection phase

In [6]:
selected_features = pd.read_csv('selected_features.csv')
selected_features = selected_features['0'].to_list()#tolist() is used to convert a series to list
selected_features

['MSSubClass',
 'MSZoning',
 'LotArea',
 'LandContour',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'Exterior1st',
 'ExterQual',
 'Foundation',
 'BsmtQual',
 'BsmtExposure',
 'BsmtUnfSF',
 'HeatingQC',
 'CentralAir',
 '1stFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'FullBath',
 'HalfBath',
 'KitchenQual',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageCars',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'ScreenPorch',
 'Fence',
 'SaleCondition']

##### Reduce the X_train and X_test using  set the selected features

In [7]:
print(f'X_train shape:{X_train.shape}')
print(f'X_test shape:{X_test.shape}')

X_train shape:(1022, 81)
X_test shape:(438, 81)


In [8]:
#Reduce the X_train and X_test using set the selected features
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [9]:
print(f'X_train shape:{X_train.shape}')
print(f'X_test shape:{X_test.shape}')

X_train shape:(1022, 38)
X_test shape:(438, 38)


In [10]:
print(f'Features have been reduced from 81 to {X_train.shape[1]}.')

Features have been reduced from 81 to 38.


# Build Model

Use the optimal value saved in Feature selection notebook.
The optimal values is 0.001.

In [11]:
#initialize model. 
lasso_mdl = Lasso(alpha=0.001, random_state=42)
#fit model
lasso_mdl.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False)

# Predicting using X_test dataset


In [12]:
#prediction 
preds = lasso_mdl.predict(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
#create a scatterplot for real test values vs predicted values
plt.scatter(y_test, preds)
plt.xlabel('Y test')
plt.ylabel('Predictions')

There is a linear positive relationship between real test and predicted values.

# Evaluating the Model

In [None]:
# Recall:We log-transformed the dependent variable(SalePrice)in feature engineering notebook.
# To get the true performance of the Lasso, we need to transform both the target and the predictions back to the original house price values.

# make predictions for X_train dataset
preds_train = lasso_mdl.predict(X_train)
# determine mse, rmse, and r2 for the train dataset
print(f'Train set MSE : {int(mean_squared_error(np.exp(y_train), np.exp(preds_train)))}')
print(f'Train set RMSE : {int(sqrt(mean_squared_error(np.exp(y_train), np.exp(preds_train))))}')
print(f'Train set r2 : {r2_score(np.exp(y_train), np.exp(preds_train))}')
print('\n')

# make predictions for X_test dataset
preds_test = lasso_mdl.predict(X_test)
# determine mse, rmse, and r2  for the Test dataset
print(f'Test set MSE : {int(mean_squared_error(np.exp(y_test), np.exp(preds_test)))}')
print(f'Test set RMSE : {int(sqrt(mean_squared_error(np.exp(y_test), np.exp(preds_test))))}')
print(f'Test set r2 : {r2_score(np.exp(y_test), np.exp(preds_test))}')
print('\n')

#print median sale price for easier interpretation of RMSE.
print('Average house price: ', int(np.exp(y_train).median()))

# Residuals
 - evaluate the distribution of the residuals/errors.

In [None]:
#plot histogram of residuals and ensure. For an ideal situation, the plot should be normally distributed
sns.distplot((y_test-preds_test),bins=30)

Residual plot above follows gaussian distributions and hence the lasso model prediction is "good".

# What are the coefficeints for the selected variable that were used for predictions.

In [None]:
#Create a df using coeeficeints
model_coefficients = pd.DataFrame(lasso_mdl.coef_,X_train.columns)
model_coefficients.columns = ['Coefficient']
model_coefficients

# Feature Importance

In [None]:
feat_importance = pd.Series(np.abs(lasso_mdl.coef_.ravel()))
feat_importance.index = selected_features
feat_importance.sort_values(inplace=True, ascending=False)
feat_importance.plot.bar(figsize=(18,6))
plt.ylabel('Lasso Model Coefficients')
plt.title('Feature Importance')
plt.show()

In [None]:
 print(f'GrLivArea and Foundation are the most and the least important features respectively.')

# Serialize/Save the Model

In [None]:
#serializing/saving model
model = pickle.dump(lasso_mdl, open('lasso_regression_model.pkl', 'wb'))