**This kernel is based on the article [Complete Guide to Parameter Tuning in XGBoost with codes in Python](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/) by Aarshay Jain.**

In [None]:
import numpy as np 
import pandas as pd 

import xgboost as xgb
from xgboost.sklearn import XGBRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
# Default size for every figure drawn in this notebook.
rcParams.update({'figure.figsize': (12, 5)})

In [None]:
# Label column name and the random seed handed to the XGBoost models.
target, seed = 'target', 42

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
# NOTE: `dirname` and `filenames` keep the values from the last directory
# visited; the data-loading cells read them afterwards.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

In [None]:
# Load the training data, using the `id` column as the index.
# NOTE(review): this picks the second entry of `filenames` left over from the
# os.walk listing and presumes it is the training file; os.walk does not
# promise any ordering, so confirm against the printed listing.
train_path = '/'.join((dirname, filenames[1]))
train = pd.read_csv(train_path, index_col='id')
train.head()

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally tune the number of boosting rounds by CV, then fit ``alg``.

    When ``useTrainCV`` is true, ``xgb.cv`` is run on ``dtrain`` with RMSE as
    the metric and early stopping; the number of rounds it keeps is written
    back into ``alg`` as ``n_estimators``. In every case the estimator is then
    fitted on the whole frame, its training RMSE is printed and its feature
    importances are plotted as a bar chart.

    Parameters
    ----------
    alg : estimator
        Must provide ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit``, ``predict`` and ``feature_importances_`` (an
        ``XGBRegressor`` in this notebook).
    dtrain : pandas.DataFrame
        Training data holding the predictor columns and the column named by
        the module-level ``target``.
    predictors : list of str
        Names of the feature columns.
    useTrainCV : bool
        Whether to run the cross-validated search for the round count.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    int
        The ``n_estimators`` value the fitted model uses (the number of
        rounds kept by ``xgb.cv`` when ``useTrainCV`` is true).
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=10)
        # With early stopping, cvresult has one row per round that was kept.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit outside the CV branch so useTrainCV=False still trains the model.
    # `eval_metric` is not passed to fit(): without an eval_set it has no
    # effect, and recent XGBoost releases reject it as a fit() argument.
    alg.fit(dtrain[predictors], dtrain[target])

    dtrain_predictions = alg.predict(dtrain[predictors])
    train_rmse = np.sqrt(np.mean((dtrain[target].values - dtrain_predictions) ** 2))

    print("\nModel Report")
    print("RMSE (train) : {:.4f}".format(train_rmse))

    # Keep the feature names as the index so the bars are labelled.
    feat_imp = pd.Series(alg.feature_importances_, index=list(predictors))
    feat_imp = feat_imp.sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    return alg.get_params()['n_estimators']

In [None]:
# Every column except the label is a feature.
predictors = [column for column in train.columns if column != target]

# Starting hyper-parameters; n_estimators is an upper bound that modelfit
# replaces with the round count kept by cross-validation.
initial_params = dict(
    learning_rate=0.5,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    nthread=4,
    scale_pos_weight=1,
    seed=seed,
)

xgb1 = XGBRegressor(**initial_params)
num_boosts1 = modelfit(xgb1, train, predictors)

In [None]:
# Carry the round count from the first modelfit run (plus one) into the grid searches.
initial_params.update(n_estimators=num_boosts1 + 1)

In [None]:
# Coarse grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 11, 2)
}
# `iid` is not passed: the argument was removed from GridSearchCV in
# scikit-learn 0.24, where supplying it raises a TypeError.
gsearch1 = GridSearchCV(XGBRegressor(**initial_params),
                        param_grid=param_test1,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        cv=5,
                        verbose=10)
gsearch1.fit(train[predictors], train[target])
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Fine grid around the best depth / child weight from the coarse search.
# The best value itself is included so this refinement can never end up
# worse than the coarse result it starts from.
best_depth = gsearch1.best_params_['max_depth']
best_child_weight = gsearch1.best_params_['min_child_weight']
param_test2 = {
    'max_depth': [best_depth - 1, best_depth, best_depth + 1],
    'min_child_weight': [best_child_weight - 1,
                         best_child_weight,
                         best_child_weight + 1]
}
# `iid` is not passed: the argument was removed from GridSearchCV in
# scikit-learn 0.24, where supplying it raises a TypeError.
gsearch2 = GridSearchCV(XGBRegressor(**initial_params),
                        param_grid=param_test2,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        cv=5,
                        verbose=10)
gsearch2.fit(train[predictors], train[target])
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Adopt the depth and minimum child weight selected by the second grid search.
initial_params.update(
    max_depth=gsearch2.best_params_['max_depth'],
    min_child_weight=gsearch2.best_params_['min_child_weight'],
)

In [None]:
# Search the minimum split loss (gamma) from 0 up to, but excluding, 5 in steps of 0.1.
param_test3 = {
    'gamma': np.arange(0, 5, 0.1)
}
# `iid` is not passed: the argument was removed from GridSearchCV in
# scikit-learn 0.24, where supplying it raises a TypeError.
gsearch3 = GridSearchCV(XGBRegressor(**initial_params),
                        param_grid=param_test3,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        cv=5,
                        verbose=10)
gsearch3.fit(train[predictors], train[target])
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Adopt the gamma chosen by the third grid search; without this the result of
# that search is never used by any later model.
initial_params['gamma'] = float(gsearch3.best_params_['gamma'])

# Re-run the cross-validated round search with the updated tree parameters,
# starting again from a generous upper bound.
initial_params['n_estimators'] = 1000

xgb2 = XGBRegressor(**initial_params)
num_boosts2 = modelfit(xgb2, train, predictors)

initial_params['n_estimators'] = num_boosts2 + 1

In [None]:
# Coarse grid over row and column sampling fractions (0.5 to 0.9).
# Rounding removes np.arange float noise (e.g. 0.7999999999999999) so the
# reported best parameters are clean values.
param_test4 = {
    'subsample': np.round(np.arange(0.5, 1, 0.1), 1),
    'colsample_bytree': np.round(np.arange(0.5, 1, 0.1), 1)
}
# `iid` is not passed: the argument was removed from GridSearchCV in
# scikit-learn 0.24, where supplying it raises a TypeError.
gsearch4 = GridSearchCV(XGBRegressor(**initial_params),
                        param_grid=param_test4,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        cv=5,
                        verbose=10)
gsearch4.fit(train[predictors], train[target])
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_

In [None]:
def _fraction_grid(center, half_width=0.2, step=0.05):
    """Return sampling fractions around ``center``, restricted to (0, 1]."""
    candidates = np.round(np.arange(center - half_width, center + half_width, step), 2)
    # subsample / colsample_bytree must lie in (0, 1]; a best value of 0.9
    # would otherwise produce candidates above 1.
    return np.unique(candidates[(candidates > 0) & (candidates <= 1)])


# Fine grid (step 0.05) around the best sampling fractions from the coarse search.
param_test5 = {
    'subsample': _fraction_grid(gsearch4.best_params_['subsample']),
    'colsample_bytree': _fraction_grid(gsearch4.best_params_['colsample_bytree'])
}
# `iid` is not passed: the argument was removed from GridSearchCV in
# scikit-learn 0.24, where supplying it raises a TypeError.
gsearch5 = GridSearchCV(XGBRegressor(**initial_params),
                        param_grid=param_test5,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        cv=5,
                        verbose=10)
gsearch5.fit(train[predictors], train[target])
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Adopt the row and column sampling fractions selected by the fifth grid search.
initial_params.update(
    subsample=gsearch5.best_params_['subsample'],
    colsample_bytree=gsearch5.best_params_['colsample_bytree'],
)

In [None]:
# Search the L1 regularisation strength over several orders of magnitude.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
# `iid` is not passed: the argument was removed from GridSearchCV in
# scikit-learn 0.24, where supplying it raises a TypeError.
gsearch6 = GridSearchCV(XGBRegressor(**initial_params),
                        param_grid=param_test6,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        cv=5,
                        verbose=10)
gsearch6.fit(train[predictors], train[target])
gsearch6.cv_results_['mean_test_score'], gsearch6.best_params_, gsearch6.best_score_

In [None]:
# Adopt the reg_alpha value selected by the sixth grid search.
initial_params.update(reg_alpha=gsearch6.best_params_['reg_alpha'])

In [None]:
# Re-run the cross-validated round search with the tuned parameters,
# starting again from an upper bound of 1000 rounds.
initial_params.update(n_estimators=1000)
xgb3 = XGBRegressor(**initial_params)
num_boosts3 = modelfit(xgb3, train, predictors)

# Store the round count found above (plus one) back into the parameter set.
initial_params.update(n_estimators=num_boosts3 + 1)

In [None]:
# Final model: a lower learning rate with a larger round budget; modelfit
# trims the budget through cross-validated early stopping.
initial_params.update(n_estimators=5000, learning_rate=0.2)

xgb4 = XGBRegressor(**initial_params)
modelfit(xgb4, train, predictors)

In [None]:
# Load the test set and the sample submission, both indexed by `id`.
# NOTE(review): the positions 2 and 0 in `filenames` come from the os.walk
# listing, which does not promise any ordering; confirm they match the files
# intended here.
test_path = '/'.join((dirname, filenames[2]))
submission_path = '/'.join((dirname, filenames[0]))
test = pd.read_csv(test_path, index_col='id')
sample_submission = pd.read_csv(submission_path, index_col='id')

In [None]:
# Score the test set with the final model, using the training feature columns.
test_features = test[predictors]
test_predictions = xgb4.predict(test_features)

In [None]:
# Put the predictions into the submission frame's target column and write it out.
sample_submission[target] = test_predictions
output_path = 'tuned_xgb_av.csv'
sample_submission.to_csv(output_path)