# Hyper-parameter Tuning. Model: XGBoost

### Load libraries

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import xgboost as xgb

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Import warnings
import warnings
warnings.filterwarnings('ignore')

### Source data location and data dictionary

### Load data from github/ local

In [None]:
# Location of the concrete dataset (raw CSV hosted on GitHub).
# To read a local copy instead, point this at "data/concrete.csv".
DATA_URL = "https://raw.githubusercontent.com/socratesk/YHatSchoolOfAI/master/data/concrete.csv"

# Read the CSV into a DataFrame
concrete_data = pd.read_csv(DATA_URL)

# Show the (rows, columns) dimensions of the loaded data
print(concrete_data.shape)

# Preview the first 8 rows
concrete_data.head(n=8)

### Split the dataset into Train and Test sets

In [None]:
# Fraction of the rows held out as the test set
SPLIT_RATIO = 0.2

# Separate the predictor columns from the target column
features = concrete_data.drop(["compressive_strength"], axis=1)
target = concrete_data["compressive_strength"]

# Split into train and test sets; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=SPLIT_RATIO,
    random_state=230,
)

## Let us consider XGBoost algorithm to train the model and tune its hyper-parameters

### Create a XGBoost Model

In [None]:
# Baseline XGBoost regressor: only the random seed is set here, so every
# other hyper-parameter stays at the library default
model = XGBRegressor(random_state=999)

### Train the model with the Train dataset

In [None]:
# Fit the baseline XGBoost model on the train split only
# (the test split is kept aside for evaluation)
model.fit(X_train, y_train)

### Predict the target values using the trained XGBoost model with the Test dataset

In [None]:
# Predict the target for the held-out test split with the baseline model
y_pred = model.predict(X_test)

### Print first few predicted values

In [None]:
# Show the leading 25 predictions of the baseline model
print(y_pred[:25])

In [None]:
# Mean squared error of the baseline predictions on the test split
mse_initial = metrics.mean_squared_error(y_test, y_pred)

# RMSE is the square root of the MSE
rmse_initial = np.sqrt(mse_initial)

# Report the baseline RMSE
print("RMSE of XGBoost model: ", rmse_initial)

# <center>Parameter Tuning </center>

### Tune <code>n_estimators</code>

In [None]:
# Candidate values for n_estimators: 100, 105, ..., 195
param_test0 = {
    'n_estimators': list(range(100, 200, 5))
}

# Grid-search n_estimators while the other hyper-parameters stay fixed.
# NOTE: `iid` was removed from GridSearchCV in scikit-learn 0.24 (passing it
# raises TypeError); scores are now always averaged across folds, which is
# what iid=False requested. `silent` is no longer an XGBoost parameter
# (superseded by `verbosity`), so it is not passed either.
gridSearch = GridSearchCV(XGBRegressor(learning_rate=0.01,
                                       max_depth=5,
                                       min_child_weight=2,
                                       gamma=0,
                                       subsample=0.90,
                                       colsample_bytree=0.90,
                                       random_state=999),
                          param_grid=param_test0,
                          scoring='r2',
                          n_jobs=4,
                          return_train_score=True,
                          cv=5)

# Run the 5-fold cross-validated search on the train split
gridSearch.fit(X_train, y_train)

# Show the best parameter and its score.
# REMEMBER: best_score_ is the mean cross-validated R^2 computed on the
# train split only; the test split is NOT used here.
gridSearch.best_params_, gridSearch.best_score_   # gridSearch.cv_results_

In [None]:
# Extract the best n_estimators value (a parameter, not a score) from the grid search
estimatorvalue = gridSearch.best_params_['n_estimators']  # 195 (presumably from an earlier run; may differ)

### Tune <code>max_depth</code> and <code>min_child_weight</code>

In [None]:
# Candidate values: max_depth 10..14 and min_child_weight 1..5
param_test1 = {
    'max_depth': list(range(10, 15, 1)),
    'min_child_weight': list(range(1, 6, 1))
}

# Grid-search max_depth and min_child_weight, reusing the n_estimators value
# selected earlier; the remaining hyper-parameters stay fixed.
# NOTE: `iid` was removed from GridSearchCV in scikit-learn 0.24 (passing it
# raises TypeError); scores are now always averaged across folds, which is
# what iid=False requested. `silent` is no longer an XGBoost parameter
# (superseded by `verbosity`), so it is not passed either.
gridSearch = GridSearchCV(XGBRegressor(learning_rate=0.01,
                                       n_estimators=estimatorvalue,
                                       gamma=0,
                                       subsample=0.90,
                                       colsample_bytree=0.90,
                                       random_state=999),
                          param_grid=param_test1,
                          scoring='r2',
                          n_jobs=4,
                          return_train_score=True,
                          cv=5)

# Run the 5-fold cross-validated search on the train split
gridSearch.fit(X_train, y_train)

# Show the best parameters and their score.
# REMEMBER: best_score_ is the mean cross-validated R^2 computed on the
# train split only; the test split is NOT used here.
gridSearch.best_params_, gridSearch.best_score_   # gridSearch.cv_results_

In [None]:
# Extract the best max_depth and min_child_weight values (parameters, not scores)
# from the grid search; trailing numbers are presumably from an earlier run
maxdepthvalue = gridSearch.best_params_['max_depth']  # 13
minchildvalue = gridSearch.best_params_['min_child_weight']  # 5

### Take the values of <code>max_depth</code> and <code>min_child_weight</code> from previous step and tune <code>subsample</code> and <code>colsample_bytree</code>

In [None]:
# Candidate values for the row (subsample) and column (colsample_bytree)
# sampling ratios
param_test2 = {
    'subsample': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
    'colsample_bytree': [0.6, 0.65, 0.7, 0.75, 0.8]
}

# Grid-search subsample and colsample_bytree, reusing the n_estimators,
# max_depth and min_child_weight values selected earlier.
# NOTE: `iid` was removed from GridSearchCV in scikit-learn 0.24 (passing it
# raises TypeError); scores are now always averaged across folds, which is
# what iid=False requested.
gridSearch = GridSearchCV(XGBRegressor(learning_rate=0.01,
                                       n_estimators=estimatorvalue,
                                       max_depth=maxdepthvalue,
                                       min_child_weight=minchildvalue,
                                       gamma=0,
                                       random_state=999),
                          param_grid=param_test2,
                          scoring='r2',
                          n_jobs=4,
                          cv=5)

# Run the 5-fold cross-validated search on the train split
gridSearch.fit(X_train, y_train)

# Show the best parameters and their score.
# REMEMBER: best_score_ is the mean cross-validated R^2 computed on the
# train split only; the test split is NOT used here.
gridSearch.best_params_, gridSearch.best_score_

In [None]:
# Extract the best colsample_bytree and subsample values (parameters, not scores)
# from the grid search; trailing numbers are presumably from an earlier run
colsamplevalue = gridSearch.best_params_['colsample_bytree'] # 0.75
subsamplevalue = gridSearch.best_params_['subsample'] # 0.95

### Take the values of <code>max_depth</code>, <code>min_child_weight</code>, <code>subsample</code>, and <code>colsample_bytree</code> from previous steps and tune <code>learning_rate</code>

In [None]:
# Candidate values for learning_rate.
# The commented-out lists look like earlier, coarser search ranges that were
# narrowed down to the active one (kept for reference).
param_test3 = {
    #'learning_rate':[0.3, 0.2, 0.1, 0.01, 0.02, 0.03, 0.001, 0.002, 0.003]
    #'learning_rate':[0.08, 0.09, 0.1, 0.11, 0.12]
    'learning_rate': [0.085, 0.09, 0.095]
}

# Grid-search learning_rate, reusing the n_estimators, max_depth,
# min_child_weight, subsample and colsample_bytree values selected earlier.
# NOTE: `iid` was removed from GridSearchCV in scikit-learn 0.24 (passing it
# raises TypeError); scores are now always averaged across folds, which is
# what iid=False requested.
gridSearch = GridSearchCV(XGBRegressor(n_estimators=estimatorvalue,
                                       max_depth=maxdepthvalue,
                                       min_child_weight=minchildvalue,
                                       gamma=0,
                                       subsample=subsamplevalue,
                                       colsample_bytree=colsamplevalue,
                                       random_state=999),
                          param_grid=param_test3,
                          scoring='r2',
                          n_jobs=4,
                          cv=5)

# Run the 5-fold cross-validated search on the train split
gridSearch.fit(X_train, y_train)

# Show the best parameter and its score.
# REMEMBER: best_score_ is the mean cross-validated R^2 computed on the
# train split only; the test split is NOT used here.
gridSearch.best_params_, gridSearch.best_score_

In [None]:
# Extract the best learning_rate value (a parameter, not a score) from the grid search
learning_rate = gridSearch.best_params_['learning_rate'] # one of 0.085 / 0.09 / 0.095 (the searched grid)

### Take the values of <code>max_depth</code>, <code>min_child_weight</code>, <code>subsample</code>, <code>colsample_bytree</code>, and <code>learning_rate</code> from previous steps and tune <code>reg_alpha</code>

In [None]:
# Candidate values for reg_alpha
param_test4 = {
    'reg_alpha': [0.9, 0.95, 1, 1.05, 1.1]
}

# Grid-search reg_alpha, reusing every hyper-parameter value selected earlier.
# NOTE: `iid` was removed from GridSearchCV in scikit-learn 0.24 (passing it
# raises TypeError); scores are now always averaged across folds, which is
# what iid=False requested.
gridSearch = GridSearchCV(XGBRegressor(learning_rate=learning_rate,
                                       n_estimators=estimatorvalue,
                                       max_depth=maxdepthvalue,
                                       min_child_weight=minchildvalue,
                                       gamma=0,
                                       subsample=subsamplevalue,
                                       colsample_bytree=colsamplevalue,
                                       random_state=999),
                          param_grid=param_test4,
                          scoring='r2',
                          n_jobs=4,
                          cv=5)

# Run the 5-fold cross-validated search on the train split
gridSearch.fit(X_train, y_train)

# Show the best parameter and its score.
# REMEMBER: best_score_ is the mean cross-validated R^2 computed on the
# train split only; the test split is NOT used here.
gridSearch.best_params_, gridSearch.best_score_

In [None]:
# Extract the best reg_alpha value (a parameter, not a score) from the grid search
regalphavalue = gridSearch.best_params_['reg_alpha'] # 1.1 (presumably from an earlier run; may differ)

### Combine all the tuned parameters

In [None]:
# Build the final regressor from all the tuned hyper-parameter values.
# - `eval_metric` is the XGBoost parameter for the evaluation metric; the
#   previous `metrics` keyword is not a recognised XGBRegressor parameter.
# - `verbosity=0` suppresses XGBoost messages; it supersedes the former
#   `silent=True` flag.
finalModel = XGBRegressor(
                  learning_rate = learning_rate,
                  n_estimators = estimatorvalue,
                  max_depth = maxdepthvalue,
                  min_child_weight = minchildvalue,
                  gamma = 0,
                  subsample = subsamplevalue,
                  colsample_bytree = colsamplevalue,
                  reg_alpha = regalphavalue,
                  eval_metric = 'rmse',
                  random_state = 999,
                  verbosity = 0
                 )

### With combined parameters train the model

In [None]:
# Fit the tuned model on the train split
finalModel.fit(X_train, y_train)

# Score the held-out test split with the tuned model
y_pred = finalModel.predict(X_test)

# Show the leading 25 predictions
preview = y_pred[:25]
print(preview)

### Print the model RMSE score after parameter tuning

In [None]:
# Mean squared error of the tuned model's predictions on the test split
mse_xgb = metrics.mean_squared_error(y_test, y_pred)

# RMSE is the square root of the MSE
rmse_xgb = np.sqrt(mse_xgb)

# Report the tuned model's RMSE
print('XGBoost model RMSE score after parameter tuning: %.4f' % rmse_xgb)

### Print the initial RMSE score with default parameters

In [None]:
# Print the RMSE computed earlier for the baseline (default-parameter) model,
# formatted to 4 decimals for comparison with the tuned model's score
print('XGBoost model RMSE score with default parameters: %.4f' % rmse_initial)

### Print the percentage of decrease in RMSE score after parameter tuning

In [None]:
# Absolute reduction in RMSE achieved by tuning (positive means the tuned model is better)
rmse_drop = rmse_initial - rmse_xgb

# Express the reduction relative to the baseline RMSE, as a percentage
perc_improvement = (rmse_drop / rmse_initial) * 100

# Report the relative improvement
print('Percentage of improvement after parameter tuning: %.2f%%' % perc_improvement)

---