In [1]:
# import libraries
import pandas as pd
import numpy as np
import sklearn

In [2]:
# import dataset
dataset = pd.read_csv('final_data.csv')

In [3]:
print(dataset.shape)
dataset.head()

(1459, 23)


Unnamed: 0,Id,SalePrice,LotShape,BldgType,OverallQual,YearBuilt,YearRemodAdd,ExterQual,BsmtQual,BsmtExposure,...,1stFlrSF,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,GarageType,GarageFinish,GarageCars,PavedDrive,SaleCondition
0,1,12.247694,1.0,0.0,0.666667,0.036765,0.098361,0.666667,0.5,1.0,...,0.356155,0.577712,0.333333,0.666667,0.0,0.166667,0.666667,0.5,1.0,0.8
1,2,12.109011,1.0,0.0,0.555556,0.227941,0.52459,1.0,0.5,0.25,...,0.503056,0.470245,0.0,1.0,0.333333,0.166667,0.666667,0.5,1.0,0.8
2,3,12.317167,0.0,0.0,0.666667,0.051471,0.114754,0.666667,0.5,0.75,...,0.383441,0.593095,0.333333,0.666667,0.333333,0.166667,0.666667,0.5,1.0,0.8
3,4,11.849398,0.0,0.0,0.666667,0.669118,0.606557,1.0,1.0,1.0,...,0.399941,0.579157,0.333333,0.666667,0.333333,0.833333,1.0,0.75,1.0,0.0
4,5,12.429216,0.0,0.0,0.777778,0.058824,0.147541,0.666667,0.5,0.0,...,0.466237,0.666523,0.333333,0.666667,0.333333,0.166667,0.666667,0.75,1.0,0.8


In [4]:
# separate independent and dependent features
X = dataset.iloc[:, 2:].values
y = dataset.iloc[:, 1].values

In [5]:
# split train and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(1313, 21) (1313,) (146, 21) (146,)


In [6]:
# define rmse function for validation
from sklearn.metrics import mean_squared_error
def rmse(actual, predicted):
    actual = np.log(actual)
    predicted = np.log(predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    
    return rmse

# Train and validate model

### XGBRegressor

In [7]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)
print('score :',rmse(y_test, xgb_pred))

score : 0.011961876600382284


### RandomForestRegressor

In [8]:
from sklearn.ensemble import RandomForestRegressor
rnd = RandomForestRegressor()
rnd.fit(x_train, y_train)
rnd_pred = rnd.predict(x_test)
print('score :',rmse(y_test, rnd_pred))

score : 0.011987825177110674


### DecisionTreeRegressor

In [9]:
from sklearn.tree import DecisionTreeRegressor
dcs = DecisionTreeRegressor()
dcs.fit(x_train, y_train)
dcs_pred = dcs.predict(x_test)
print('score :',rmse(y_test, dcs_pred))

score : 0.017120039440872103


## Hyperparameter Tunning

In [10]:
# define tuning parameters
n_estimator = [100,500,900,1100,1500]
max_depth = [2,3,5,10,15]
booster = ['gbtree','gblinear']
learning_rate = [0.05,0.1,0.15,0.20]
min_child_weight = [1,2,3,4]
base_score=[0.25,0.5,0.75,1]

hyperparameter_grid = {'n_estimator':n_estimator,
                      'max_depth':max_depth,
                      'learning_rate':learning_rate,
                      'min_child_weight':min_child_weight,
                      'booster':booster,
                      'base_score':base_score}

### RandomizedSearchCV

In [13]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

regressor = XGBRegressor()

random_cv = RandomizedSearchCV(estimator=regressor,
                            param_distributions=hyperparameter_grid,
                            cv=5, n_iter=50,
                            scoring = 'neg_mean_absolute_error', n_jobs=4,
                            verbose=5, return_train_score =True,
                            random_state=42)

random_cv.fit(x_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    4.3s




[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:    5.9s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha=...
                   iid='deprecated', n_iter=50, n_jobs=4,
                   param_distributions={'base_score': [0.25, 0.5, 0.75, 1],
                                        'booster': ['g

In [17]:
random_cv.best_estimator_

XGBRegressor(base_score=0.75, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=2, missing=None, n_estimator=1100,
             n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [18]:
regressor = XGBRegressor(base_score=0.75, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=2, missing=None, n_estimator=1100,
             n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [19]:
regressor.fit(x_train, y_train)



XGBRegressor(base_score=0.75, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=2, missing=None, n_estimator=1100,
             n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [20]:
tuned_xgb_pred = regressor.predict(x_test)
print('score :',rmse(y_test, tuned_xgb_pred))

score : 0.011904043353031203
