In [2]:
import pandas as pd

df = pd.read_csv("../challenge_houses-prices.csv")

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from bayes_opt import BayesianOptimization, UtilityFunction
import warnings
warnings.filterwarnings("ignore")

In [3]:
df.head()

Unnamed: 0,property_area,house_age,house_style,neighborhood,overall_quality,overall_condition,spaciousness,liv_lot_ratio,remodel_age,bath_area,bsmt_area,garage_area,garage_age,has_2ndfloor,has_porch,has_pool,has_multiple_kitchen,sale_price
0,2501,12,2Story,Gilbert,6,5,207.0,0.0,15,2.5,775,386,12.0,1,1,0,0,110000
1,1704,87,1.5Fin,OldTown,6,7,225.0,0.0,19,1.5,666,261,82.0,1,1,0,0,145203
2,2253,92,1.5Fin,OldTown,5,7,203.0,1.0,57,2.0,625,57,,1,1,0,0,120359
3,1788,75,1Story,BrkSide,5,7,165.0,0.0,59,1.0,838,228,89.0,0,1,0,0,92758
4,1361,42,1Story,Edwards,5,5,191.0,0.0,52,1.0,645,457,50.0,0,1,0,0,114073


In [4]:
df.shape

(50000, 18)

In [3]:
df.dtypes

property_area             int64
house_age                 int64
house_style              object
neighborhood             object
overall_quality           int64
overall_condition         int64
spaciousness            float64
liv_lot_ratio           float64
remodel_age               int64
bath_area               float64
bsmt_area                 int64
garage_area               int64
garage_age              float64
has_2ndfloor              int64
has_porch                 int64
has_pool                  int64
has_multiple_kitchen      int64
sale_price                int64
dtype: object

In [5]:
df_house_style = pd.get_dummies(df['house_style'])
df_neighborhood = pd.get_dummies(df['neighborhood'])

In [6]:
df = pd.concat([df, df_house_style, df_neighborhood], axis=1)

In [7]:
df.drop(['house_style','neighborhood'], axis=1, inplace=True)

In [8]:
for column in df.columns:
    df[column] = df[column].astype(float)

In [9]:
df.shape

(50000, 46)

In [12]:
df.head()

Unnamed: 0,property_area,house_age,overall_quality,overall_condition,spaciousness,liv_lot_ratio,remodel_age,bath_area,bsmt_area,garage_area,...,NWAmes,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber
0,2501.0,12.0,6.0,5.0,207.0,0.0,15.0,2.5,775.0,386.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1704.0,87.0,6.0,7.0,225.0,0.0,19.0,1.5,666.0,261.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2253.0,92.0,5.0,7.0,203.0,1.0,57.0,2.0,625.0,57.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1788.0,75.0,5.0,7.0,165.0,0.0,59.0,1.0,838.0,228.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1361.0,42.0,5.0,5.0,191.0,0.0,52.0,1.0,645.0,457.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
y = df['sale_price']
X = df.drop('sale_price', axis=1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state = 42)

In [12]:
import xgboost as xgb

In [63]:
def XGB_CV(
        learning_rate,
        max_leaves,
        lambda1,
        lambda2,
        max_bin,
        max_depth,
        gamma,
        min_child_weight,
        max_delta_step,
        subsample,
        colsample_bytree,
        ):

    global rmse_best
    global mae_best
    global iter_best

    paramt = {
              'nthread' : 4,
              'seed' : 1001,
              'objective' : 'reg:squarederror',
              'eval_metric': ['rmse', 'mae'],
              'booster' : 'gbtree',
              'eta' : learning_rate,
              'gamma' : gamma,
              'max_depth' : int(max_depth),
              'min_child_weight' : min_child_weight,
              'max_delta_step' : int(max_delta_step),
              'subsample' : max(min(subsample, 1), 0),
              'colsample_bytree' : max(min(colsample_bytree, 1), 0),
              'alpha': lambda1,
              'lambda': lambda2,
              'max_leaves': int(max_leaves),
              'max_bin': int(max_bin),
              }

    folds = 5
    cv_score = 0

    print("\n Search parameters (%d-fold validation):\n %s" % (folds, paramt), file=log_file )
    log_file.flush()

    xgbc = xgb.cv(
        paramt,
        dtrain,
        num_boost_round = 100,
        nfold = folds,
        verbose_eval = 10,
        #early_stopping_rounds = 100,
        metrics = ['rmse','mae'],
        show_stdv = True
    )

    val_score = xgbc['test-rmse-mean'].iloc[-1]
    train_score = xgbc['train-rmse-mean'].iloc[-1]
    print(
        'Stopped after %d iterations with train-rmse = %f val-rmse = %f' % ( 
            len(xgbc), 
            train_score,
            val_score
        ) 
    )

    if ( val_score < rmse_best ):
        rmse_best = val_score
        iter_best = len(xgbc)

    return -val_score

In [64]:
# Define the log file. If you repeat this run, new output will be added to it
log_file = open('challenge-rmse-5fold-XGB-run-01-v1-full.log', 'a')
rmse_best = -1.
iter_best = 0

In [65]:
dtrain = xgb.DMatrix(X_train, label = y_train)

In [66]:
bounds = {
    'learning_rate': (0.001, 0.2),
    'max_leaves': (8, 1024),
    'lambda1': (0.1, 100),
    'lambda2': (0.1, 100),
    'max_bin': (3, 1023),
    'max_depth': (-1, 20),
    'gamma': (0.001, 10.0),
    'min_child_weight': (0, 20),
    'max_delta_step': (0, 10),
    'subsample': (0.1, 1.0),
    'colsample_bytree' :(0.1, 1.0)
}

In [67]:
XGB_BO = BayesianOptimization(
    f=XGB_CV, 
    pbounds=bounds,
)

In [70]:
print('-'*130)
print('-'*130, file=log_file)
log_file.flush()

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    XGB_BO.maximize(init_points=2, n_iter=50)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | colsam... |   gamma   |  lambda1  |  lambda2  | learni... |  max_bin  | max_de... | max_depth | max_le... | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Stopped after 100 iterations with train-rmse = 191160.843750 val-rmse = 191159.087500
| [0m 25      [0m | [0m-1.912e+0[0m | [0m 0.3627  [0m | [0m 3.695   [0m | [0m 64.86   [0m | [0m 70.71   [0m | [0m 0.1122  [0m | [0m 755.2   [0m | [0m 1.536   [0m | [0m 8.471   [0m | [0m 282.9   [0m | [0m 1.521   [0m | [0m 0.1096  [0m |
Stopped after 100 iterations with train-rmse = 191161.403125 val-rmse = 191159.681250
| [0m 26      [0m | [0m-1.912e+0[0m | [0m 0.734   [0m | [0m 8.678   [0m | [0m 17.37   [0m | [0m 14.34   [0m

In [75]:
XGB_BO.max

{'target': -34906.491406400004,
 'params': {'colsample_bytree': 0.8832635679721044,
  'gamma': 1.7659450508306334,
  'lambda1': 70.27689014791974,
  'lambda2': 41.17989096578065,
  'learning_rate': 0.09192742232542225,
  'max_bin': 484.929853363818,
  'max_delta_step': 0.4252788214274317,
  'max_depth': 6.223921658901861,
  'max_leaves': 932.853965644163,
  'min_child_weight': 6.997838154971065,
  'subsample': 0.13915515897442735}}

In [13]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=100,
    objective='reg:squarederror',
    eval_metric=['rmse', 'mae'],
    booster='gbtree',
    eta=0.09192742232542225,
    gamma=1.7659450508306334,
    max_depth=6,
    min_child_weight=6.997838154971065,
    max_delta_step=0,
    subsample=0.13915515897442735,
    colsample_bytree=0.8832635679721044,
    reg_alpha=70.27689014791974,
    reg_lambda=41.17989096578065,
    max_leaves=932,
    max_bin=484,
    n_jobs=-1,
    random_state=1,
)

In [17]:
eval_set = [(X_train, y_train), (X_test, y_test)]

In [19]:
model.fit(X_train, y_train, eval_set=eval_set, verbose=0)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8832635679721044,
             eta=0.09192742232542225, eval_metric=['rmse', 'mae'],
             gamma=1.7659450508306334, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.0919274241,
             max_bin=484, max_delta_step=0, max_depth=6, max_leaves=932,
             min_child_weight=6.997838154971065, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=-1,
             num_parallel_tree=1, random_state=1, reg_alpha=70.27689014791974,
             reg_lambda=41.17989096578065, scale_pos_weight=1,
             subsample=0.13915515897442735, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [21]:
results = model.evals_result()

In [24]:
results['validation_1']['mae']

[162878.03125,
 148418.984375,
 135116.140625,
 123135.203125,
 112133.71875,
 102231.867188,
 93239.976563,
 85040.9375,
 77612.21875,
 70917.171875,
 64914.796875,
 59553.160156,
 54783.039063,
 50650.484375,
 46993.878906,
 43822.734375,
 41150.757813,
 38819.40625,
 36810.257813,
 35068.769531,
 33626.105469,
 32420.560547,
 31410.076172,
 30573.570313,
 29882.048828,
 29309.111328,
 28816.722656,
 28382.675781,
 28041.828125,
 27741.628906,
 27503.880859,
 27305.314453,
 27130.074219,
 26990.896484,
 26865.769531,
 26772.226563,
 26675.951172,
 26602.314453,
 26535.138672,
 26489.101563,
 26448.884766,
 26416.359375,
 26388.576172,
 26358.128906,
 26327.576172,
 26316.451172,
 26300.5625,
 26287.878906,
 26270.453125,
 26248.962891,
 26237.851563,
 26236.564453,
 26230.96875,
 26232.582031,
 26228.443359,
 26226.923828,
 26222.376953,
 26223.265625,
 26216.765625,
 26219.339844,
 26209.582031,
 26208.886719,
 26205.333984,
 26200.912109,
 26202.183594,
 26190.835938,
 26190.818359

In [91]:
y_pred = model.predict(X_test)

In [86]:
model.feature_importances_

array([0.032383  , 0.0224488 , 0.5891712 , 0.00595525, 0.01227575,
       0.00658045, 0.00673699, 0.10797799, 0.00944525, 0.01488979,
       0.00758247, 0.00825444, 0.00323331, 0.        , 0.        ,
       0.00411792, 0.00921421, 0.        , 0.00565385, 0.        ,
       0.00535795, 0.        , 0.        , 0.00695626, 0.        ,
       0.01600392, 0.00910971, 0.00478263, 0.0084412 , 0.00634724,
       0.00454606, 0.0016045 , 0.00740993, 0.0070185 , 0.        ,
       0.00760337, 0.01187306, 0.01184839, 0.00526642, 0.01023314,
       0.        , 0.00995217, 0.00602392, 0.00545086, 0.00825016],
      dtype=float32)

In [94]:
sorted_idx = (-model.feature_importances_).argsort()
cols = np.array(X.columns)[sorted_idx]
imp_values = model.feature_importances_[sorted_idx]

importance = dict(zip(cols, imp_values))