In [39]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor

from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Restore every variable previously saved with %store (no names are given, so all are reloaded).
# NOTE(review): train_dummy, test_dummy and test are used below but never defined in this
# notebook -- presumably they come from this store; confirm against the previous notebook.
%store -r 
# Name of the column to predict; used throughout to split features from labels.
target = 'SalePrice'

In the previous notebook we saved the train and test dataframes with dummy variables; here we restore them. Also remember that several features, including SalePrice, are log-transformed.

### XGBoost
Here we train a gradient boosting model using the XGBoost implementation; the sole purpose of this notebook is to tune the parameters of the XGBoost model.

In [77]:
def modelfit(alg, dtrain, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally tune ``n_estimators`` by cross-validation, then fit ``alg`` on ``dtrain``.

    When ``useTrainCV`` is true, ``xgb.cv`` is run with early stopping and
    ``alg.n_estimators`` is overwritten with the number of rows in the
    returned frame. ``alg`` is then fitted on the whole of ``dtrain`` and
    the training MSE is printed.

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``predict`` (e.g. ``XGBRegressor``). It is modified in place.
    dtrain : DataFrame containing the feature columns plus the column named
        by the module-level ``target``.
    useTrainCV : bool, run the cross-validated search for ``n_estimators``.
    cv_folds : int, number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, patience passed to ``xgb.cv``.

    Returns
    -------
    The frame returned by ``xgb.cv``, or ``None`` when ``useTrainCV`` is false.
    """
    # Split once instead of re-dropping the target column for every call below.
    features = dtrain.drop([target], axis=1)
    labels = dtrain[target]

    # Bind the result up front so the function can return cleanly when the
    # CV step is skipped (previously this raised UnboundLocalError).
    cvresult = None
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features, label=labels.values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='rmse',
            early_stopping_rounds=early_stopping_rounds,
        )
        # Use the number of rounds reported by the CV run as the tree count.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the full training data.
    # NOTE(review): recent xgboost releases may reject eval_metric in fit();
    # confirm against the installed version.
    alg.fit(features, labels, eval_metric='rmse')

    # Predict on the training set and report the (optimistic) training error.
    dtrain_predictions = alg.predict(features)
    print ("MSE train : {0}".format(metrics.mean_squared_error(labels.values, dtrain_predictions)))

    return cvresult
    

1. First we try a high learning rate and choose the optimal n_estimators for this learning rate. 
2. Then we tune tree specific parameters like max_depth, min_child_weight etc for the decided learning_rate and n_estimators. 
3. Tune regularization parameters
4. Lower learning rate and increase n_estimators. 

### Step 1 : Fix learning_rate and n_estimators for tuning tree base parameters

Let's first set some initial parameters so that we can fix learning_rate and n_estimators:
1. max_depth=5
2. min_child_weight=1
3. gamma=0.1
4. subsample=0.8
5. scale_pos_weight=1
6. colsample_bytree=0.8


In [34]:
# Baseline model with a high learning rate; modelfit overwrites n_estimators
# with the round count found by cross-validation.
xgb1 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    silent=False,
)
cvresult = modelfit(xgb1, train_dummy)

RMSE train : 0.005961450553124035


In [76]:
# Number of boosting rounds that modelfit stored on the estimator.
print ("No estimators : {0}".format(xgb1.get_params()['n_estimators']))
# Select the held-out metric by column name: the column order of the frame
# returned by xgb.cv is not the same in every xgboost release, so position 0
# is not guaranteed to be the test RMSE.
print ("RMSE test : {0}".format(cvresult['test-rmse-mean'].iloc[-1]))

No estimators : 706
RMSE test : 0.12152080000000001


So n_estimators=706 is the optimal value for learning_rate=0.1.
Now that learning_rate and n_estimators are fixed, let's tune the other parameters. 

#### Tune max_depth and min_child_weight



In [47]:
# Coarse grid over tree complexity: 4 depths x 3 child weights.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
estimator = XGBRegressor(
    learning_rate=0.1,
    n_estimators=706,
    max_depth=5,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    silent=False,
)
gsearch1 = GridSearchCV(estimator=estimator, param_grid=param_test1, cv=5, verbose=True)
gsearch1.fit(
    train_dummy.drop([target], axis=1),
    train_dummy[target],
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 26.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_child_weight': range(1, 6, 2), 'max_depth': range(3, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

In [50]:
# Display the estimator carrying the best parameter combination found by the first grid search.
gsearch1.best_estimator_

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8)

So max_depth=5 and min_child_weight=3 are the best values on the coarse grid. Let's search the neighbouring values to refine them.

In [51]:
# Fine grid: the neighbours of the best coarse-grid values.
param_test2 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [2, 3, 4],
}
estimator = XGBRegressor(
    learning_rate=0.1,
    n_estimators=706,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    silent=False,
)
gsearch2 = GridSearchCV(estimator=estimator, param_grid=param_test2, cv=5, verbose=True)
gsearch2.fit(
    train_dummy.drop([target], axis=1),
    train_dummy[target],
)
gsearch2.best_estimator_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  9.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_child_weight': [2, 3, 4], 'max_depth': [4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8)

So the optimal values are max_depth=5 and min_child_weight=3. 
#### Gamma

In [54]:
# Candidate gamma values 0.0 .. 0.4 in steps of 0.1, with the tree parameters held fixed.
param_test3 = {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]}
estimator = XGBRegressor(
    learning_rate=0.1,
    n_estimators=706,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    silent=False,
)
gsearch3 = GridSearchCV(estimator=estimator, param_grid=param_test3, cv=5, verbose=True)
gsearch3.fit(
    train_dummy.drop([target], axis=1),
    train_dummy[target],
)

gsearch3.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 15.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8)

So optimal gamma=0.1

####  Tune subsample and colsample_bytree

In [63]:
# Coarse grid over the row and column sampling fractions (0.6 .. 0.9 each).
param_test4 = {
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
}
estimator = XGBRegressor(
    learning_rate=0.1,
    n_estimators=706,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    scale_pos_weight=1,
    silent=False,
)
gsearch4 = GridSearchCV(estimator=estimator, param_grid=param_test4, cv=5, verbose=True)
gsearch4.fit(
    train_dummy.drop([target], axis=1),
    train_dummy[target],
)

gsearch4.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 16.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.7)

subsample=0.7, colsample_bytree=0.8

Now we can try values within 0.05 of the optimal values found above. 

In [64]:
# Fine grid: values within 0.05 of the best sampling fractions from the coarse search.
param_test5 = {
    'subsample': [0.65, 0.68, 0.7, 0.72, 0.75],
    'colsample_bytree': [0.75, 0.78, 0.8, 0.82, 0.85],
}
estimator = XGBRegressor(
    learning_rate=0.1,
    n_estimators=706,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    scale_pos_weight=1,
    silent=False,
)
gsearch5 = GridSearchCV(estimator=estimator, param_grid=param_test5, cv=5, verbose=True)
gsearch5.fit(
    train_dummy.drop([target], axis=1),
    train_dummy[target],
)

gsearch5.best_estimator_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 28.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'subsample': [0.65, 0.68, 0.7, 0.72, 0.75], 'colsample_bytree': [0.75, 0.78, 0.8, 0.82, 0.85]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.75,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.72)

subsample=0.72, colsample_bytree=0.75

#### Tuning regularization parameters


In [65]:
# L1 regularization strengths spanning several orders of magnitude.
param_test6 = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
estimator = XGBRegressor(
    learning_rate=0.1,
    n_estimators=706,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.72,
    colsample_bytree=0.75,
    scale_pos_weight=1,
    silent=False,
)
gsearch6 = GridSearchCV(estimator=estimator, param_grid=param_test6, cv=5, verbose=True)
gsearch6.fit(
    train_dummy.drop([target], axis=1),
    train_dummy[target],
)

gsearch6.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 12.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.75,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.72),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'reg_alpha': [1e-05, 0.01, 0.1, 1, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.75,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=706, nthread=-1,
       objective='reg:linear', reg_alpha=1e-05, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.72)

reg_alpha = 1e-5
This may be because most of the complexity is already controlled by max_depth and gamma. 
Now let's first find the optimal n_estimators for the updated parameters; then we will reduce the learning rate and increase n_estimators. 

In [78]:
# Re-run the CV-based search for n_estimators with the tuned tree, sampling
# and regularization parameters; learning_rate is still 0.1.
xgb2 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.72,
    colsample_bytree=0.75,
    reg_alpha=0.00001,
    scale_pos_weight=1,
    silent=False,
)
cvresult2 = modelfit(xgb2, train_dummy)
print ("No estimators : {0}".format(xgb2.get_params()['n_estimators']))
# Select the held-out metric by column name: the column order of the frame
# returned by xgb.cv is not the same in every xgboost release, so position 0
# is not guaranteed to be the test RMSE.
print ("RMSE test : {0}".format(cvresult2['test-rmse-mean'].iloc[-1]))

No estimators : 885
RMSE test : 0.12089839999999999


Now let's reduce the learning rate by a factor of 10 and increase n_estimators proportionally (885 × 10).
learning_rate=0.01, n_estimators=8850

In [68]:
# Ten times lower learning rate with a ten times larger round budget;
# early stopping inside modelfit picks the number of rounds actually kept.
xgb3 = XGBRegressor(
    learning_rate=0.01,
    n_estimators=8850,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.72,
    colsample_bytree=0.75,
    reg_alpha=0.00001,
    scale_pos_weight=1,
    silent=False,
)
cvresult3 = modelfit(xgb3, train_dummy)
print ("No estimators : {0}".format(xgb3.get_params()['n_estimators']))
# Select the held-out metric by column name: the column order of the frame
# returned by xgb.cv is not the same in every xgboost release, so position 0
# is not guaranteed to be the test RMSE.
print ("RMSE test : {0}".format(cvresult3['test-rmse-mean'].iloc[-1]))

RMSE train : 0.006505656325214961


No estimators : 3602
RMSE test : 0.1184584


Let's halve our learning_rate and double the optimal n_estimators.
1. learning_rate=0.005
2. n_estimators=7204

In [80]:
# Half the previous learning rate with twice the previously selected round count as budget.
xgb4 = XGBRegressor(
    learning_rate=0.005,
    n_estimators=7204,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.72,
    colsample_bytree=0.75,
    reg_alpha=0.00001,
    scale_pos_weight=1,
    silent=False,
)
cvresult4 = modelfit(xgb4, train_dummy)
print ("No estimators : {0}".format(xgb4.get_params()['n_estimators']))
# Select the held-out metric by column name: the column order of the frame
# returned by xgb.cv is not the same in every xgboost release, so position 0
# is not guaranteed to be the test RMSE.
print ("RMSE test : {0}".format(cvresult4['test-rmse-mean'].iloc[-1]))

MSE train : 0.006641313921785242
No estimators : 5255
RMSE test : 0.119139


Here the test error is higher than in the previous run, where learning_rate was 0.01. Let's try an intermediate value and see if we can reduce it. 

In [82]:
# Intermediate learning rate between the two previous runs (0.005 and 0.01).
xgb5 = XGBRegressor(
    learning_rate=0.009,
    n_estimators=4000,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.72,
    colsample_bytree=0.75,
    reg_alpha=0.00001,
    scale_pos_weight=1,
    silent=False,
)
cvresult5 = modelfit(xgb5, train_dummy)
print ("No estimators : {0}".format(xgb5.get_params()['n_estimators']))
# Select the held-out metric by column name: the column order of the frame
# returned by xgb.cv is not the same in every xgboost release, so position 0
# is not guaranteed to be the test RMSE.
print ("RMSE test : {0}".format(cvresult5['test-rmse-mean'].iloc[-1]))

MSE train : 0.006536573940670113
No estimators : 3672
RMSE test : 0.11846940000000002


So even with learning_rate=0.009 the test RMSE (0.11847) is slightly higher than with learning_rate=0.01 (0.11846).
So our optimal parameters are n_estimators=3602 and learning_rate=0.01. Let's fit and predict with the final parameters.


In [83]:
# Final model: the tuned parameters with the learning rate / round count chosen above,
# fitted on the full training frame.
xgb_final = XGBRegressor(
    learning_rate=0.01,
    n_estimators=3602,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.72,
    colsample_bytree=0.75,
    reg_alpha=0.00001,
    scale_pos_weight=1,
    silent=False,
)
xgb_final.fit(
    train_dummy.drop([target], axis=1),
    train_dummy[target],
    verbose=True,
)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.75,
       gamma=0.1, learning_rate=0.01, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=3602, nthread=-1,
       objective='reg:linear', reg_alpha=1e-05, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.72)

In [88]:
# Predict on the test features and map the predictions back with expm1.
# NOTE(review): this assumes SalePrice was transformed with log1p upstream -- confirm.
y_pred = np.expm1(xgb_final.predict(test_dummy))

# One SalePrice column indexed like the original test frame, written as the submission file.
df_xbg = pd.DataFrame(
    y_pred,
    index=test.index,
    columns=['SalePrice'],
)
df_xbg.to_csv('./submissions/xgb.csv')