### House Prices - Advanced Regression Techniques
##### Predict sales prices and practice feature engineering, RFs, and gradient boosting

##### https://www.kaggle.com/c/house-prices-advanced-regression-techniques/

###### Sulaiman Binkhamis 
###### March 11, 2022

In [1]:
# House Prices - Advanced Regression Techniques
# Predict sales prices and practice feature engineering, RFs, and gradient boosting



In [2]:
# Works with pycaret and pycaret 2
#!pip install pycaret==2.0
from pycaret.regression import *
import pandas as pd

In [3]:
# check version
from pycaret.utils import version
version()

'2.3.4'

In [4]:
data = pd.read_csv(r'C:\Users\Sulaiman\Downloads\house-prices-advanced-regression-techniques\train2.csv')

test_data = pd.read_csv(r'C:\Users\Sulaiman\Downloads\house-prices-advanced-regression-techniques\test.csv')

print(data.shape, test_data.shape)

(1444, 81) (1459, 80)


In [5]:

data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Ignoring features with high null values 

demo = setup(data = data, target = 'SalePrice', 
                   ignore_features = ['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','Utilities'],normalize = True,
                   transformation= True, transformation_method = 'yeo-johnson', 
                   transform_target = True, remove_outliers= True,
                   remove_multicollinearity = True,
                   ignore_low_variance = True, combine_rare_levels = True) 

Unnamed: 0,Description,Value
0,session_id,3367
1,Target,SalePrice
2,Original Data,"(1444, 81)"
3,Missing Values,True
4,Numeric Features,19
5,Categorical Features,55
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(959, 243)"


In [7]:
# Auto sort on R2 
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,13657.9011,416337325.5451,20195.837,0.9179,0.113,0.0808,0.014
huber,Huber Regressor,13492.4464,416642275.2406,20156.956,0.9175,0.1126,0.08,0.086
ridge,Ridge Regression,14112.3798,444014238.2646,20864.431,0.9114,0.1161,0.0834,0.16
par,Passive Aggressive Regressor,16780.0104,557654630.4995,23324.5345,0.8897,0.1318,0.0989,0.009
omp,Orthogonal Matching Pursuit,16106.0929,563085502.648,23581.6675,0.8879,0.1279,0.0936,0.167
gbr,Gradient Boosting Regressor,15404.3464,575456474.1895,23783.24,0.8868,0.1278,0.0898,0.097
lightgbm,Light Gradient Boosting Machine,16634.2367,635646080.3173,25001.1949,0.8735,0.1317,0.096,0.123
rf,Random Forest Regressor,17320.2956,778237997.2972,27489.2892,0.8493,0.141,0.0993,0.254
knn,K Neighbors Regressor,20288.9634,964325022.5688,30762.5609,0.8122,0.1583,0.1147,0.012
ada,AdaBoost Regressor,22810.922,1137757368.0826,33350.2398,0.7805,0.1686,0.1283,0.065


PowerTransformedTargetRegressor(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
                                compute_score=False, copy_X=True,
                                fit_intercept=True, lambda_1=1e-06,
                                lambda_2=1e-06, lambda_init=None, n_iter=300,
                                normalize=False,
                                power_transformer_method='box-cox',
                                power_transformer_standardize=True,
                                regressor=BayesianRidge(alpha_1=1e-06,
                                                        alpha_2=1e-06,
                                                        alpha_init=None,
                                                        compute_score=False,
                                                        copy_X=True,
                                                        fit_intercept=True,
                                                        lambda_1=1e-06,
               

In [8]:
# Creating models for the best estimators 
br = create_model('br')
huber = create_model('huber')
ridge = create_model('ridge')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,14875.734,607714990.4622,24651.876,0.8971,0.1207,0.0835
1,14126.0755,342396879.6633,18503.9693,0.9206,0.1125,0.086
2,15315.4804,498305129.4722,22322.7491,0.8929,0.1074,0.0834
3,14943.6151,429655413.4394,20728.131,0.9314,0.1124,0.0868
4,12713.3998,288288048.1339,16979.0473,0.937,0.1246,0.0873
5,15014.7777,470032136.4787,21680.2245,0.9143,0.1247,0.0888
6,12053.9056,300534908.7755,17335.9427,0.8842,0.1081,0.0761
7,15030.6029,710295626.3325,26651.3719,0.8684,0.1251,0.0858
8,12745.8406,428011365.41,20688.4355,0.9274,0.1171,0.0781
9,14304.3661,364907884.4785,19102.5622,0.941,0.108,0.0777


In [9]:
# Tuning the created models 
br = tune_model(br)
huber = tune_model(huber)
ridge = tune_model(ridge)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,14274.4199,567032138.3533,23812.4366,0.904,0.114,0.0792
1,12995.4301,287904918.115,16967.7611,0.9332,0.0979,0.0758
2,14847.6363,473490233.756,21759.8307,0.8982,0.1076,0.0817
3,15631.703,455522265.4322,21342.9676,0.9273,0.1176,0.0904
4,11813.3408,255445290.8811,15982.6559,0.9442,0.1197,0.0823
5,15127.452,468426166.3361,21643.1552,0.9146,0.1262,0.0917
6,10902.9813,259414069.8986,16106.3363,0.9001,0.1063,0.0713
7,14073.2302,594606961.6253,24384.564,0.8899,0.122,0.0824
8,12340.8857,412324495.2792,20305.7749,0.93,0.1115,0.0756
9,14388.246,369746142.83,19228.7842,0.9402,0.1061,0.0771


In [10]:
# Blending models
blender = blend_models(estimator_list = [br, huber, ridge])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,13754.6295,562501240.5881,23717.1086,0.9047,0.1111,0.0754
1,12248.8952,259007672.9352,16093.7153,0.9399,0.0916,0.0713
2,14766.9867,482670536.1274,21969.7641,0.8962,0.1071,0.0807
3,14790.7893,408612342.327,20214.1619,0.9348,0.1137,0.0868
4,11421.5981,242850000.1391,15583.6453,0.9469,0.1191,0.0807
5,14436.8026,436100072.8084,20883.0092,0.9205,0.1238,0.0881
6,10722.4807,257202156.6675,16037.5234,0.9009,0.1071,0.0707
7,13929.1803,573060122.0704,23938.6742,0.8939,0.1204,0.0813
8,11734.4901,362737231.6638,19045.6618,0.9385,0.1089,0.0737
9,14096.8795,352787776.0975,18782.6456,0.943,0.1042,0.0755


In [11]:
# Finaliszing model for predictions 
model = finalize_model(blender)
predictions = predict_model(model, data = test_data)

In [12]:
# Generating CSV for Kaggle Submissions 
sub = pd.DataFrame({
        "Id": predictions['Id'],
        "SalePrice": predictions['Label']
    })

sub.to_csv(r'C:\Users\Sulaiman\Downloads\house-prices-advanced-regression-techniques\submission.csv', index=False)