# HOUSE PRICE PREDICTION

IMPORTING LIBRARIES

In [1]:
# import sklearn librarys
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import recall_score,precision_score,confusion_matrix, classification_report

from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import  OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import SelectKBest,chi2

from matplotlib import pyplot as plt

from pandas_profiling import ProfileReport as profile

import pandas as pd
import numpy as np
import os
import seaborn as sns

from sklearn.impute import SimpleImputer

from sklearn.metrics import precision_recall_curve, roc_curve, auc

from xgboost import XGBRegressor 

from sklearn.compose import make_column_transformer

from sklearn.pipeline import make_pipeline

# READING DATASET

In [None]:
os.chdir("c:/Users/User/Desktop/kaggle datasets/housing dataset/")
df=pd.read_csv('house_train_data.csv')
df.head()

# DATA EXPLORATION - pandas profiling

In [None]:
# profile=df.profile_report(minimal=False,progress_bar=False, explorative=True, pool_size=0, samples=None,
#                                 missing_diagrams=None, duplicates=None);
# profile.to_file("output.html")

# HANDLING MISSING DATASET

In [None]:
print(df.isnull().mean().sort_values(ascending=False))

 Droping columns with more than 40% missing dataset

In [None]:
df=df.loc[: ,  df.isnull().mean()<0.40]

# Dropping the id columns because it has no predictive power
df.drop(columns=['Id'] ,axis=1, inplace=True)

print(df.isnull().mean().sort_values(ascending=False))

In [None]:
numerical_columns=list(df.select_dtypes(include=['int64','float64']).columns)

In [None]:
imp_mean=SimpleImputer(missing_values=np.nan,strategy='mean')
imp_frequent=SimpleImputer(missing_values=np.nan,strategy="most_frequent")

transformed_columns= make_column_transformer  (
                                                ( imp_mean, list(df.select_dtypes(include=['int64','float64']).columns) ),
                                                ( imp_frequent,list(df.select_dtypes(exclude=['int64','float64']).columns) ),
                                                 remainder = 'passthrough'
                                              )

list_columns=list(df.select_dtypes(include=['int64','float64']).columns) + list(df.select_dtypes(exclude=['int64','float64']).columns)

df=pd.DataFrame(transformed_columns.fit_transform(df), columns=list_columns)

df[numerical_columns]=df.loc[:,numerical_columns].astype(float)
df.isnull().sum()

In [None]:
df.head(3)

# CORRELATION ANALYSIS

In [None]:
# Features and their correlation with the target variable
ans=df.iloc[:,:].corr()['SalePrice'].abs()     # using the absolute values
ans.sort_values(ascending=False).head(15)

In [None]:
ans=df.corr().abs().unstack().sort_values(ascending=False).drop_duplicates()
ans.head(48)

Dropping features with correlation >=60%

In [None]:
# GrLivArea was left out because not all houses were story buildings
df.drop(columns=['GarageArea','GarageYrBlt','TotalBsmtSF','2ndFlrSF','BedroomAbvGr','BsmtFinSF1',
                'FullBath','HalfBath'], inplace=True)

# TRANSFORMING FEATURES - dates

In [None]:
df['YearRemodAdd']=2020-df['YearRemodAdd']
df['YearBuilt']=2020-df['YearBuilt']
df.head(3)

# SPLITTING DATA INTO TARGET VARIABLE AND FEATURE SETS

In [None]:
y=df.loc[: , ['SalePrice']]
X=df.drop(columns=['SalePrice'])

In [None]:
X_CAT=X.select_dtypes(exclude=['int64','float64'])
X_NUM=X.select_dtypes(include=['int64','float64'])

In [None]:
scaler=MinMaxScaler()
ohe=OneHotEncoder(handle_unknown='error',sparse=False, dtype='float32' , drop='first')
X_CAT_TRANSFORM=pd.DataFrame(ohe.fit_transform(X_CAT))
X_CAT_TRANSFORM.columns=ohe.get_feature_names(X_CAT.columns)

X_NUM_TRANSFORM_SCALED=pd.DataFrame(scaler.fit_transform(X_NUM),columns=X_NUM.columns)

#  concatenating to create the feature set
X=pd.concat(
           [X_NUM_TRANSFORM_SCALED.reset_index(drop=True), 
            X_CAT_TRANSFORM.reset_index(drop=True)],
            axis=1
           )

In [None]:
X.head()

# SHAPE OF CLEANED DATASET TO BE USED FOR THE MODEL TRAINING

In [None]:
X.shape

# SPLITTING DATASET INTO TRAIN AND TEST SETS

In [None]:
# splitting Dataset into train set and test set ( ratio 80 : 20 )
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y, random_state=15,test_size=0.20)

# FEATURE IMPORTANCE SELECTION

In [None]:
%%time
parameter_random_forest=dict({ 'max_features': ['auto'] ,
                               'random_state':[0] ,
                               'n_estimators':[20]
                             })

grid_random_forest_feat=GridSearchCV (   RandomForestRegressor(), 
                                         cv=2, 
                                         param_grid=parameter_random_forest,
                                         scoring='r2', 
                                         n_jobs=-1
                                      )


grid_random_forest_feat.fit(X_train,y_train)

y_predicted_random=grid_random_forest_feat.predict(X_train)            # using the gridsearchcv object for prediction

print('\n R2_score \t', grid_random_forest_feat.score(X_train , y_train),'\n')
pd.DataFrame(grid_random_forest_feat.cv_results_)                      # tabulating the outcome of gridsearchcv object

In [None]:
clf_rand_forest=grid_random_forest_feat.best_estimator_  # passing the instance of best search
feat_importance=pd.DataFrame(clf_rand_forest.feature_importances_).sort_values(by=0,ascending=False)


feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(X_train.columns,clf_rand_forest.feature_importances_):
    feats[feature] = importance #add the name/value pair 

ans=pd.DataFrame(feats,index=np.arange(len(feats)))
ans=(ans.T)
ans=ans.sort_values(by=0,ascending=False)
feat_importance=pd.DataFrame(ans.loc[:,0])
feat_importance.columns=['features']
print('List of features according to their importance in descending order')
feat_importance.head(18)

# FEATURE IMPORTANCE RANKING  - GRAPHICAL REPRESENTATION

Following is the code snippet to plot the n most important features of a random forest model.

In [None]:
%matplotlib inline

feat_importance=feat_importance.iloc[0:20,:]
# X_train=X_train.loc[:,feat_importance.index]    #  X_train 30 most important features

# plot the 50 most important features 
plt.figure(figsize=(6,8))
plt.barh(y=feat_importance.index,width=feat_importance['features']);
plt.title(' Featureimportance - TOP 20 , fontsize=16')
plt.box(None)

# DISTRIBUTION OF THE TARGET VARIABLE - Price of house y

In [None]:
sns.distplot(y, bins=None, hist=True, kde=True, rug=False, fit=None, hist_kws=None)

In [None]:
sns.scatterplot(data=y)

# COMPARING VARIOUS MACHINE LEARNING MODELS

In [None]:
# ESTABLISHING A BASELINE  -  DUMMY CLASSIFIER

In [None]:
%%time
# DUMMY CLASSIFIER to serve as the baseline for comparism with the more advanced classification models below
grid_dummy=GridSearchCV(DummyRegressor( ), 
                        param_grid={'strategy':['median']},          # Using the median because the target variable is skewed
                        cv=2,
                        scoring='r2',
                        n_jobs=-1
                         )
grid_dummy.fit(X_train,y_train)
y_predicted_dummy=grid_dummy.predict(X_train)

print('\n R2  score \t', grid_dummy.score(X_train , y_train),'\n')
pd.DataFrame(grid_dummy.cv_results_) 

# LASSO REGRESSION

In [None]:
%%time

parameter_Lasso=            dict({ 
                                            'alpha':np.arange(1,50),
                                            'random_state':[0],               
                                          })

grid_Lasso=GridSearchCV (    
                                 Lasso(), 
                                 cv=2, 
                                 param_grid=parameter_Lasso,
                                 scoring= 'r2',
                                 n_jobs= -1
                                )

grid_Lasso.fit(X_train,y_train)

print('R-squared score (training): {:.3f}'.format(grid_Lasso.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'.format(grid_Lasso.score(X_test, y_test)))

y_test_predicted_Lasso=grid_Lasso.predict(X_test)     # using the gridsearchcv object for prediction
print('Test RMSE \t:',np.sqrt(mean_squared_error( y_test, y_test_predicted_Lasso)))

# pd.DataFrame(grid_Lasso.cv_results_)       # tabulating the outcome of gridsearchcv object

# SUPPORT VECTOR REGRESSOR - SVR

In [None]:
%%time

parameter_SVR=            dict({ 
                                            'degree':[2,3,4],   
                                            'gamma': ['scale','auto'],
                                            'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
                                             'C' : [0.01,0.1,1,2,3,5,10]
                                          })

grid_SVR=GridSearchCV (    
                                 SVR(), 
                                 cv=2, 
                                 param_grid=parameter_SVR,
                                 scoring= 'r2',
                                 n_jobs= -1 
                                )

grid_SVR.fit(X_train,y_train)

print('R-squared score (training): {:.3f}'.format(grid_SVR.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'.format(grid_SVR.score(X_test, y_test)))

y_test_predicted_svr=grid_SVR.predict(X_test)     # using the gridsearchcv object for prediction
print('Test RMSE \t:',np.sqrt(mean_squared_error( y_test, y_test_predicted_svr)))

# pd.DataFrame(grid_SVR.cv_results_)       # tabulating the outcome of gridsearchcv object

# RANDOM FOREST REGRESSOR

In [None]:
%%time

parameter_random_forest=            dict({ 
                                            'n_estimators':[100,150,200,300,400,500],
                                            'criterion': ['mse'],
                                            'max_features': ['auto'],
                                            'bootstrap': [True],
                                            'oob_score': [True],
                                            'n_jobs' : [-1],
                                            'random_state':[0],
                                            'warm_start':[False],                
                                          })


grid_random_forest=GridSearchCV (    
                                 RandomForestRegressor(), 
                                 cv=2, 
                                 param_grid=parameter_random_forest,
                                 scoring= 'r2'
                                )

grid_random_forest.fit(X_train,y_train)

print('R-squared score (training): {:.3f}'.format(grid_random_forest.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'.format(grid_random_forest.score(X_test, y_test)))

y_test_predicted_forest=grid_random_forest.predict(X_test)
print('Test RMSE \t:',np.sqrt(mean_squared_error(y_test, y_test_predicted_forest)))

y_predicted=grid_random_forest.predict(X_train)     # using the gridsearchcv object for prediction
# pd.DataFrame(grid_random_forest.cv_results_)       # tabulating the outcome of gridsearchcv object

In [None]:
predicted=pd.DataFrame(y_test_predicted_forest,columns=['predicted'])
actual=pd.DataFrame(y_test.astype('float64'))
com=pd.concat(
           [actual.reset_index(drop=True), 
            predicted.reset_index(drop=True)],
            axis=1
           )
com['Difference']=(com['SalePrice']-com['predicted'])

com.head(10)

# ADA BOOST REGRESSOR

In [None]:
%%time

parameter_AdaBoost=            dict({ 
                                            'n_estimators':[100,200,300],
                                            'random_state':[0],    
                                            'learning_rate': [0.001,0.01,0.1],
                                            'loss': ['linear','square','exponential']
                                          })

grid_AdaBoost=GridSearchCV (    
                                 AdaBoostRegressor(), 
                                 cv=2, 
                                 param_grid=parameter_AdaBoost,
                                 scoring= 'r2',
                                 n_jobs= -1 
                                )

grid_AdaBoost.fit(X_train,y_train)

print('R-squared score (training): {:.3f}'.format(grid_AdaBoost.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'.format(grid_AdaBoost.score(X_test, y_test)))

y_test_predicted_AdaBoost=grid_AdaBoost.predict(X_test)      # using the gridsearchcv object for prediction
print('Test RMSE \t:',np.sqrt(mean_squared_error( y_test, y_test_predicted_AdaBoost)))

# pd.DataFrame(grid_AdaBoost.cv_results_)       # tabulating the outcome of gridsearchcv object

# GRADIENT BOOSTING REGRESSOR

In [None]:
%%time

parameter_grid_GBR=            dict({ 
                                            'n_estimators':[100,200,300,600],
                                            'criterion': ['friedman_mse'],
                                            'max_features': ['auto'],
                                            'random_state':[0],    
                                            'subsample': [0.85,1.0],
                                            'learning_rate': [0.001,0.01,0.1,]
                                          })


grid_GBR=GridSearchCV (    
                                 GradientBoostingRegressor(), 
                                 cv=2, 
                                 param_grid=parameter_grid_GBR,
                                 scoring= 'r2',
                                 n_jobs= -1
                                )

grid_GBR.fit(X_train,y_train)

print('R-squared score (training): {:.3f}'.format(grid_GBR.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'.format(grid_GBR.score(X_test, y_test)))

y_test_predicted_GBR=grid_GBR.predict(X_test)
print('Mean Square Error \t:',np.sqrt(mean_squared_error(y_test, y_test_predicted_GBR)))

# pd.DataFrame(grid_GBR.cv_results_)       # tabulating the outcome of gridsearchcv object

# XGBOOSTING REGRESSOR

In [None]:
%%time
parameter_XGBoost=            dict({ 
                                    'objective':['reg:squarederror'],
                                    'max_depth':[3,4,5,8,10],
                                    'learning_rate' : [0.05,0.01,0.1],
                                    'n_estimators':[100,200,300],
                                    'gamma':[0,1,2,3],
                                    'subsample': [0.9],
                                    'colsample_bytree': [1],
                                    'reg_alpha' : [0,1,2,3,5,7,10],
                                    'scale_pos_weight':[1],
                                    'random_state':[0],
                                  })

grid_XGBoost=GridSearchCV (    
                                 XGBRegressor(), 
                                 cv=2, 
                                 param_grid=parameter_XGBoost,
                                 scoring= 'r2',
                                 n_jobs= -1 
                                )

grid_XGBoost.fit(X_train,y_train)

print('R-squared score (training): {:.3f}'.format(grid_XGBoost.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'.format(grid_XGBoost.score(X_test, y_test)))

y_test_predicted_XGBoost=grid_XGBoost.predict(X_test)     # using the gridsearchcv object for prediction
print('Test RMSE \t:',np.sqrt(mean_squared_error( y_test, y_test_predicted_XGBoost)))

df_XGBoost=pd.DataFrame(grid_XGBoost.cv_results_)       # tabulating the outcome of gridsearchcv object

In [None]:
ans=df_XGBoost.sort_values(by=['rank_test_score'])
ans.head(5)

Using the ideal parameter values to build the final XGBOOTED model

In [None]:
best_parameter=grid_XGBoost.best_params_
best_parameter

.

.

# BUILDING THE OPTIMISED XGBOOSTED MODEL

In [2]:
# Re-reading the entire dataset for the to train the optimised model
os.chdir("c:/Users/User/Desktop/kaggle datasets/housing dataset/")
df=pd.read_csv('house_train_data.csv')

In [3]:
# Transforming the two date columns into years 
df['YearRemodAdd']=2020-df['YearRemodAdd']
df['YearBuilt']=2020-df['YearBuilt']

# Selecting the top 18 features according to their importance

In [4]:
df=df[['OverallQual','GrLivArea','1stFlrSF','GarageCars','LotArea','MasVnrArea','YearBuilt','YearRemodAdd','LotFrontage',
       'BsmtUnfSF','TotRmsAbvGrd','MoSold','Fireplaces','WoodDeckSF','OpenPorchSF','CentralAir','OverallCond','SalePrice']]

print(df.shape)
df.head(2)

Unnamed: 0,OverallQual,GrLivArea,1stFlrSF,GarageCars,LotArea,MasVnrArea,YearBuilt,YearRemodAdd,LotFrontage,BsmtUnfSF,TotRmsAbvGrd,MoSold,Fireplaces,WoodDeckSF,OpenPorchSF,CentralAir,OverallCond,SalePrice
0,7,1710,856,2,8450,196.0,17,17,65.0,150,8,2,0,0,61,Y,5,208500
1,6,1262,1262,2,9600,0.0,44,44,80.0,284,6,5,1,298,0,Y,8,181500


In [28]:
# Dropping the rows with missing values for MasVnrArea
df=df.loc[df['MasVnrArea'].notnull(), : ]

# converting LotFrontage to a numeric dtypes
df['LotFrontage']=pd.to_numeric(df['LotFrontage'], errors='coerce')
print(df.isnull().sum())

print('\n',df.dtypes)

OverallQual     0
GrLivArea       0
1stFlrSF        0
GarageCars      0
LotArea         0
MasVnrArea      0
YearBuilt       0
YearRemodAdd    0
LotFrontage     0
BsmtUnfSF       0
TotRmsAbvGrd    0
MoSold          0
Fireplaces      0
WoodDeckSF      0
OpenPorchSF     0
CentralAir      0
OverallCond     0
House_Price     0
dtype: int64

 OverallQual       int64
GrLivArea         int64
1stFlrSF          int64
GarageCars        int64
LotArea           int64
MasVnrArea      float64
YearBuilt         int64
YearRemodAdd      int64
LotFrontage     float64
BsmtUnfSF         int64
TotRmsAbvGrd      int64
MoSold            int64
Fireplaces        int64
WoodDeckSF        int64
OpenPorchSF       int64
CentralAir       object
OverallCond       int64
House_Price       int64
dtype: object


In [9]:
df['LotFrontage']=df['LotFrontage'].replace('.','')
df['LotFrontage'].fillna(df['LotFrontage'].mean(),inplace=True)
df.isnull().sum()

# Separating the target variable from the independent variable

In [None]:
y=df['SalePrice']
X=df.drop(labels=['SalePrice'], axis=1)

# Splitting Dataset into train set and test set ( ratio 95 : 5 )

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y, random_state=15,test_size=5)

# Transforming the Dataset 

In [13]:
columns_tran=['OverallQual', 'GrLivArea', '1stFlrSF', 'GarageCars', 'LotArea',
                                               'MasVnrArea', 'YearBuilt', 'YearRemodAdd', 'LotFrontage', 'BsmtUnfSF',
                                               'TotRmsAbvGrd', 'MoSold', 'Fireplaces', 'WoodDeckSF', 'OpenPorchSF','OverallCond','CentralAir']

transformed_columns= make_column_transformer (
                                               (MinMaxScaler(), 
                                               ['OverallQual', 'GrLivArea', '1stFlrSF', 'GarageCars', 'LotArea',
                                               'MasVnrArea', 'YearBuilt', 'YearRemodAdd', 'LotFrontage', 'BsmtUnfSF',
                                               'TotRmsAbvGrd', 'MoSold', 'Fireplaces', 'WoodDeckSF', 'OpenPorchSF','OverallCond'] ),
                                                
                                                (OneHotEncoder(handle_unknown='error',sparse=False, dtype='float32' , drop='first'),['CentralAir']),
                                                remainder = 'passthrough'
                                              )

# transformer=transformed_columns.fit_transform(X_train)

# X_train=pd.DataFrame(transformer,columns=columns_tran)
# X_train.head()

# THE OPTIMISED XGBOOSTED MODEL

In [16]:
XGBRegressor=XGBRegressor (
                             colsample_bytree= 1,
                             gamma= 0,
                             learning_rate= 0.05,
                             max_depth= 3,
                             n_estimators= 300,
                             objective='reg:squarederror',
                             random_state=0,
                             reg_alpha= 3,
                             scale_pos_weight= 1,
                             subsample=0.9
                          )

# THE PIPELINE

In [17]:
pipe=make_pipeline(transformed_columns, XGBRegressor)
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['OverallQual', 'GrLivArea',
                                                   '1stFlrSF', 'GarageCars',
                                                   'LotArea', 'MasVnrArea',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'LotFrontage', 'BsmtUnfSF',
                                                   'TotRmsAbvGrd', 'MoSold',
                                                   'Fireplaces', 'WoodDeckSF',
                                                   'OpenPorchSF',
                                                   'OverallCond']),
                                                 ('onehotencoder',
                      

# TEST ON THE OPTIMISED MODEL

In [18]:
print('R-squared score (training): {:.3f}'.format(pipe.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'.format(pipe.score(X_test, y_test)))

y_predicted_train=pipe.predict(X_train)
print('Train RMSE \t:',np.sqrt(mean_squared_error( y_train, y_predicted_train)))

y_test_predicted=pipe.predict(X_test)                           
print('Test RMSE \t:',np.sqrt(mean_squared_error( y_test, y_test_predicted)))

# SAVING THE MODEL 

In [19]:
# import pickle
# import os

# os.chdir('c://Users/user/.spyder-py3/templates/pickle/')

# file=open('model_housePricePrediction_xgboost2020.pkl','wb')
# pickle.dump(pipe,file)
# file.close()

# The Test Dataset

In [26]:
X_test['House_Price']=y_test
df=X_test
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,OverallQual,GrLivArea,1stFlrSF,GarageCars,LotArea,MasVnrArea,YearBuilt,YearRemodAdd,LotFrontage,BsmtUnfSF,TotRmsAbvGrd,MoSold,Fireplaces,WoodDeckSF,OpenPorchSF,CentralAir,OverallCond,House_Price
1086,4,1092,546,1,1974,0.0,47,47,70.030126,212,6,5,0,120,96,Y,5,83500
1136,6,1252,1032,1,9600,0.0,70,70,80.0,752,6,4,0,0,0,Y,5,119000
336,9,1922,1922,3,14157,200.0,15,14,86.0,673,8,7,1,178,51,Y,5,377426
211,6,1212,1212,2,10420,0.0,11,11,83.0,1176,6,3,0,100,22,Y,5,186000
377,8,2466,1580,2,11143,0.0,16,15,102.0,1580,8,12,1,159,214,Y,5,340000
