# Hyper-parameter Tuning. Model: XGBoost

### Load libraries

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import xgboost as xgb

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Import pickle
import pickle

# Import warnings
import warnings
warnings.filterwarnings('ignore')

### Source data location and data dictionary

### Load data from github/ local

In [2]:
# Load Concrete Data file from local file system
# concrete_data = pd.read_csv("data/concrete.csv")

# Load Concrete Data file from github
concrete_data = pd.read_csv("https://raw.githubusercontent.com/socratesk/YHatSchoolOfAI/master/data/concrete.csv")

# Print the shape
print (concrete_data.shape)

# Print few rows to visualize the data
concrete_data.head(8)

(1030, 9)


Unnamed: 0,cement,blast,flyash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,365,43.7
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45


### Split the dataset into Train and Test sets

In [4]:
SPLIT_RATIO = 0.2

# Split data into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(concrete_data.drop(["compressive_strength"], axis=1), 
                                                    concrete_data["compressive_strength"], 
                                                    test_size = SPLIT_RATIO, 
                                                    random_state = 230)

## Let us consider XGBoost algorithm to train the model and tune its hyper-parameters

### Create a XGBoost Model

In [5]:
# Generate XGBoost Model
model = XGBRegressor(random_state=999)

### Train the model with Train and Test dataset

In [7]:
# Train the XGBoost model using train dataset
model.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=999,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

### Predict the class using generated model with Test dataset

In [8]:
# Make predictions for test data
y_pred = model.predict(X_test)

### Print first few predicted values

In [9]:
# Print first 25 predicted values
print(y_pred[0:25])

[19.199282 48.96149  32.501865 34.468933 25.885956 52.59113  21.84642
 19.465195 48.688038 22.476915 41.423183 53.783    42.99973  31.512367
 40.679497 23.050293 34.574608 36.860527 55.422882 53.818497 44.517254
 23.361845 22.398268 38.04239  31.134315]


### Print the RMSE score of basic model

In [10]:
# Compute the RMSE score
rmse_initial = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# Print RMSE score
print ("RMSE of XGBoost model: ", rmse_initial)

RMSE of XGBoost model:  5.549236368916562


# <center>Parameter Tuning </center>

### Parameters to be tuned:
### - <code>n_estimators</code>
### - <code>max_depth</code>
### - <code>min_child_weight</code>
### - <code>subsample</code>
### - <code>colsample_bytree</code>
### - <code>learning_rate</code>
### - <code>reg_alpha</code>

### Tune <code>n_estimators</code>

In [17]:
# Set range of parameters for n_estimator
param_test0 = {
    'n_estimators':list(range(100, 200, 5))
}

# Build the XGBoost model for the range of n_estimator values
gridSearch = GridSearchCV(XGBRegressor(random_state=999),
                 param_grid = param_test0, 
                 scoring = 'r2',
                 n_jobs = 4,
                 iid = False,  
                 return_train_score=True,
                 cv = 5)

# Fit the train dataset
gridSearch.fit(X_train, y_train)

# Print scores for each parameter.
# REMEMBER the scores are based on train dataset only and NOT on test dataset
gridSearch.best_params_, gridSearch.best_score_   # gridSearch.cv_results_



({'n_estimators': 195}, 0.9063647320602183)

In [18]:
# Extract best scores from Grid search
estimatorvalue = gridSearch.best_params_['n_estimators']  # 195

### Tune <code>max_depth</code> and <code>min_child_weight</code>

In [22]:
# Set range of parameters for max_depth and min_child_weight
param_test1 = {
    'max_depth':list(range(3, 8, 1)),
    'min_child_weight':list(range(1, 6, 1))
}

# Build the XGBoost model for the range of max_depth and min_child_weight values
gridSearch = GridSearchCV(XGBRegressor(n_estimators=estimatorvalue, 
                                   random_state=999),
                 param_grid = param_test1, 
                 scoring = 'r2',
                 n_jobs = 4,
                 iid = False,  return_train_score=True,
                 cv = 5)

# Fit the train dataset
gridSearch.fit(X_train, y_train)

# Print scores for each parameter.
# REMEMBER the scores are based on train dataset only and NOT on test dataset
gridSearch.best_params_, gridSearch.best_score_   # gridSearch.cv_results_



({'max_depth': 5, 'min_child_weight': 1}, 0.9150314269732434)

In [23]:
# Extract best scores from Grid search
maxdepthvalue = gridSearch.best_params_['max_depth']  # 10
minchildvalue = gridSearch.best_params_['min_child_weight']  # 5

### Tune <code>subsample</code> and <code>colsample_bytree</code>

In [24]:
# Set range of parameters for subsample and colsample_bytree
param_test2 = {
 'subsample':[0.7, 0.75, 0.8, 0.85, 0.9],
 'colsample_bytree':[0.4, 0.45, 0.5, 0.55, 0.6]
}

# Build the XGBoost model for the range of subsample and colsample_bytree values
gridSearch = GridSearchCV(XGBRegressor(n_estimators = estimatorvalue, 
                                   max_depth = maxdepthvalue,
                                   min_child_weight = minchildvalue,
                                   random_state=999),
                 param_grid = param_test2, 
                 scoring = 'r2',
                 n_jobs = 4,
                 iid = False, 
                 cv = 5)

# Fit the train dataset
gridSearch.fit(X_train, y_train)

# Print scores for each parameter.
# REMEMBER the scores are based on train dataset only and NOT on test dataset
gridSearch.best_params_, gridSearch.best_score_



({'colsample_bytree': 0.5, 'subsample': 0.75}, 0.9246842873920856)

In [25]:
# Extract best scores from Grid search
colsamplevalue = gridSearch.best_params_['colsample_bytree'] # 0.5
subsamplevalue = gridSearch.best_params_['subsample'] # 0.8

### Tune <code>learning_rate</code>

In [26]:
# Set range of parameters for learning_rate
param_test3 = {
    #'learning_rate':[0.3, 0.2, 0.1, 0.01, 0.02, 0.03, 0.001, 0.002, 0.003]
    #'learning_rate':[0.08, 0.09, 0.1, 0.11, 0.12]
    'learning_rate':[0.085, 0.09, 0.095]
}

# Build the XGBoost model for the range of learning_rate values
gridSearch = GridSearchCV(XGBRegressor(n_estimators = estimatorvalue, 
                                   max_depth = maxdepthvalue,
                                   min_child_weight = minchildvalue, 
                                   subsample = subsamplevalue, 
                                   colsample_bytree = colsamplevalue,
                                   random_state=999),
                 param_grid = param_test3, 
                 scoring = 'r2',
                 n_jobs = 4,
                 iid = False, 
                 cv = 5)

# Fit the train dataset
gridSearch.fit(X_train, y_train)

# Print scores for each parameter.
# REMEMBER the scores are based on train dataset only and NOT on test dataset
gridSearch.best_params_, gridSearch.best_score_



({'learning_rate': 0.085}, 0.9258870234656745)

In [27]:
# Extract best scores from Grid search
learning_rate = gridSearch.best_params_['learning_rate'] # 0.09

### Tune  <code>reg_alpha</code>

In [28]:
# Set range of parameters for reg_alpha
param_test4 = {
 'reg_alpha':[0.9, 0.95, 1, 1.05, 1.1]
}

# Build the XGBoost model for the range of reg_alpha values
gridSearch = GridSearchCV(XGBRegressor(learning_rate = learning_rate, 
                                   n_estimators = estimatorvalue, 
                                   max_depth = maxdepthvalue,
                                   min_child_weight = minchildvalue, 
                                   subsample = subsamplevalue, 
                                   colsample_bytree = colsamplevalue,
                                   random_state=999),
                 param_grid = param_test4, 
                 scoring = 'r2',
                 n_jobs = 4,
                 iid = False, 
                 cv = 5)

# Fit the train dataset
gridSearch.fit(X_train, y_train)

# Print scores for each parameter.
# REMEMBER the scores are based on train dataset only and NOT on test dataset
gridSearch.best_params_, gridSearch.best_score_



({'reg_alpha': 1.05}, 0.9247388173408172)

In [29]:
# Extract best scores from Grid search
regalphavalue = gridSearch.best_params_['reg_alpha'] # 0.95

### Combine all the tuned parameters

In [30]:
final_model = XGBRegressor(
                  learning_rate = learning_rate,
                  n_estimators = estimatorvalue, 
                  max_depth = maxdepthvalue,
                  min_child_weight= minchildvalue, 
                  subsample = subsamplevalue, 
                  colsample_bytree = colsamplevalue,
                  reg_alpha = regalphavalue,
                  metrics = 'rmse',
                  random_state=999,
                  silent=True
                 )

### With combined parameters train the model using Train dataset

In [31]:
# Train the final model
final_model.fit(X_train, y_train)

# Make predictions for test data
y_pred = final_model.predict(X_test)

# Print the predicted values
print(y_pred[0:25])

[20.855257 51.29979  32.139202 34.521748 24.75205  51.81381  20.878647
 16.962027 44.97207  22.01795  36.398148 54.828156 40.983948 33.5913
 38.96908  23.389112 36.79337  35.482582 58.833076 52.701454 51.03874
 20.557758 20.822382 40.946056 28.830296]


### Compute and print the model's RMSE score after parameter tuning

In [32]:
# Compute the RMSE score
rmse_final = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# Print RMSE score
print('XGBoost model RMSE score after parameter tuning: %.4f' % rmse_final) # 4.8205

XGBoost model RMSE score after parameter tuning: 4.6743


### Print the model's RMSE score with default parameters (ORIGINAL)

In [33]:
# Print initial RMSE Score
print('XGBoost model RMSE score with default parameters: %.4f' % rmse_initial) # 5.5492

XGBoost model RMSE score with default parameters: 5.5492


### Print the percentage of decrease in RMSE score after parameter tuning

In [34]:
# Compute percentage improvement
perc_improvement = ( (rmse_initial - rmse_final) / rmse_initial) * 100

# Print percentage improvement
print('Percentage of improvement after parameter tuning: %.2f%%' % perc_improvement)

Percentage of improvement after parameter tuning: 15.77%


### Train the final deployable model with tuned parameters

In [35]:
# Train the final model
final_model.fit(concrete_data.drop(["compressive_strength"], axis=1), 
               concrete_data["compressive_strength"])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0,
       importance_type='gain', learning_rate=0.085, max_delta_step=0,
       max_depth=5, metrics='rmse', min_child_weight=1, missing=None,
       n_estimators=195, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=999, reg_alpha=1.05, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.75, verbosity=1)

### Save the final model to deploy

In [36]:
# Persist the final model
filename = 'concrete_xgb_deloyable_model_ver_1_0.sav'
pickle.dump(final_model, open(filename, 'wb'))

---