<b>Objective</b>: Observe the impact of changing hyperparameters on accuracy. We use a small dataset of dimension (7043, 31).

<strong style="color:Tomato;">Import Libraries </strong>

In [None]:
# Data Handling
from pandas import read_csv
import os
import pandas as pd
import numpy as np
import time

# Modeling
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Visualization
import matplotlib
from matplotlib import pyplot

## Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

<strong style="color:Tomato;">Load Data Set </strong>

In [None]:
# Path of the Telco customer-churn CSV under /kaggle/input.
TELCO_CSV_PATH = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the raw data into the frame every later cell starts from.
df_basedata_train_0 = read_csv(TELCO_CSV_PATH)

# Last expression of the cell: preview the first rows.
df_basedata_train_0.head()

<strong style="color:Tomato;">Data Preparation </strong>

In [None]:
# Print the column / dtype summary of the raw frame.
df_basedata_train_0.info()

# Parse TotalCharges as numbers; errors="coerce" turns values that cannot be
# parsed into NaN instead of raising.
df_basedata_train_0["TotalCharges"] = pd.to_numeric(
    df_basedata_train_0["TotalCharges"], errors="coerce"
)

# Report which columns still have the 'object' dtype after the conversion.
s = df_basedata_train_0.dtypes == 'object'
object_cols = s[s].index.tolist()
print("Categorical variables:")
print(object_cols)

# One-hot encode the listed columns; drop_first=True drops the first level of
# each column, so the Churn column yields the single indicator 'Churn_Yes'.
dummy_columns = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "Churn",
]
df_1 = pd.get_dummies(df_basedata_train_0, columns=dummy_columns, drop_first=True)

# The churn indicator becomes the modelling target.
df_1 = df_1.rename(columns={'Churn_Yes': 'target'})

# Remove the identifier column before modelling.
df_2 = df_1.drop(columns=['customerID'])
df_2.head()


<strong style="color:Tomato;">Split the Dataset in Predictor and Target </strong>

In [None]:
# Predictors: every column of the prepared frame except the target.
X = df_2.drop(columns="target")
# Target: the churn indicator column.
y = df_2["target"]

# Map the target values to integer class labels for the classifier.
target_encoder = LabelEncoder()
label_encoded_y = target_encoder.fit_transform(y)

# Last expression of the cell: show the distinct encoded labels.
np.unique(label_encoded_y)

<strong style="color:Tomato;">Define Hyper-Parameters </strong>

In [None]:
# Candidate values for each hyper-parameter; the experiment cells build their
# search grids from these module-level lists.

learning_rate = [0.01, 0.03, 0.05, 0.10, 0.15, 0.20, 0.25, 0.3]
max_depth = [3, 4, 5, 6, 8, 10, 15, 20]
# 1, 3, 5, 7
min_child_weight = list(range(1, 8, 2))
gamma = [0.0, 0.1, 0.4, 0.9, 1, 2, 5, 10, 15, 20]
colsample_bytree = [0.3, 0.4, 0.5, 0.7]
# 100, 200, ..., 1000
n_estimators = list(range(100, 1001, 100))

<h1 style="color:DodgerBlue;">Tune the Number of Decision Trees in XGBoost</h1>

Using scikit-learn we can perform a grid search of the n_estimators model parameter, evaluating a series of values from <b>100 to 1000</b> with a step size of 100 (100, 200, 300, 400, 500, 600, 700, 800, 900, 1000).
We perform this grid search on the Telco Customer Churn dataset, using 10-fold cross validation, requiring 100 models to be trained (10 configurations * 10 folds).

In [None]:
# Experiment 1: grid search over n_estimators only; every other setting is
# left at the XGBClassifier default.
classifier = XGBClassifier()

# Wall-clock timer covering the search set-up and the fit.
start_time = time.time()

# Single-parameter grid built from the shared n_estimators candidates.
param_grid = {"n_estimators": n_estimators}

# 10 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# Accuracy-scored exhaustive search using all available cores.
grid_search = GridSearchCV(
    classifier,
    param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=kfold,
    verbose=1,
)
grid_result = grid_search.fit(X, label_encoded_y)

# Report elapsed time in minutes.
end_time = time.time()
execution_time = end_time - start_time
print("Successfully executed in", round(execution_time/60,1), "mins")

# Best candidate, then mean / std of the test score for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for candidate_summary in zip(means, stds, params):
    print("%f (%f) with: %r" % candidate_summary)

# Mean accuracy as a function of the number of trees.
pyplot.plot(n_estimators, means)
pyplot.title("XGBoost n_estimators vs Accuracy")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Accuracy')
pyplot.show()

The best number of trees was <b>n_estimators=100</b>, resulting in an <b>accuracy of 78.67%</b>. Accuracy dropped as more trees were added and then stabilized, which means we gained no significant advantage from adding further trees. Successfully executed in <b>1.6 mins</b>.

<h1 style="color:DodgerBlue;">Tune the Size of Decision Trees in XGBoost </h1>



We can tune this hyperparameter of XGBoost using the grid search infrastructure in scikit-learn on the Telco Customer Churn dataset.
<ul>Below we evaluate eight values for<b> max_depth between 3 and 20</b> (3, 4, 5, 6, 8, 10, 15, 20).</ul>

Each of the 8 configurations is evaluated using 10-fold cross validation, resulting in <b>80 models</b> being constructed. The full code listing is provided below for completeness.


In [None]:
# Experiment 2: grid search over max_depth with n_estimators fixed at 100.
classifier = XGBClassifier(n_estimators=100)

# Wall-clock timer covering the search set-up and the fit.
start_time = time.time()

# Single-parameter grid built from the shared max_depth candidates.
param_grid = {"max_depth": max_depth}

# 10 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# Accuracy-scored exhaustive search using all available cores.
grid_search = GridSearchCV(
    classifier,
    param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=kfold,
    verbose=1,
)
grid_result = grid_search.fit(X, label_encoded_y)

# Report elapsed time in minutes.
end_time = time.time()
execution_time = end_time - start_time
print("Successfully executed in", round(execution_time/60,1), "mins")

# Best candidate, then mean / std of the test score for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for candidate_summary in zip(means, stds, params):
    print("%f (%f) with: %r" % candidate_summary)

# Mean accuracy as a function of tree depth.
pyplot.plot(max_depth, means)
pyplot.title("XGBoost Max Depth vs Accuracy")
pyplot.xlabel('Tree Size')
pyplot.ylabel('Accuracy')
pyplot.show()

The optimal configuration was <b>max_depth=3</b> resulting in <b>accuracy of 0.799947</b>. Successfully executed in 0.3 mins

Reviewing the plot of accuracy scores, we can see a drop from max_depth=3 to max_depth=10, followed by fairly even, slightly downward-trending performance for the rest of the values of max_depth.

Although the best score was observed for max_depth=3, it is interesting to note that there was <b>practically little difference between using max_depth=8 or max_depth=20.</b>

This suggests a point of diminishing returns in max_depth on a problem that you can tease out using grid search. <b>Using tree with Depth 3 would solve the purpose.</b>

<h1 style="color:DodgerBlue;">Tune The Number of Trees and Max Depth in XGBoost</h1>

There is a relationship between the number of trees in the model and the depth of each tree.

We would expect that deeper trees would result in fewer trees being required in the model, and the inverse where simpler trees (such as decision stumps) require many more trees to achieve similar results.

We can investigate this relationship by evaluating a grid of n_estimators and max_depth configuration values. To avoid the evaluation taking too long, we will limit the total number of configuration values evaluated. Parameters were chosen to tease out the relationship rather than optimize the model.

We will create a grid of 10 different n_estimators values (100-1000) and 8 different max_depth values (3-20) and each combination will be evaluated using 5-fold cross validation. <ul>A total of 10 X 8 X 5 or <b>400 models</b> will be trained and evaluated.</ul>

In [None]:
# Experiment 3
# Joint grid search over number of trees (n_estimators) and max_depth.

# Plotting imports are done first so a missing package fails immediately
# instead of after the whole grid search has been run.
import matplotlib.pyplot as plt
import seaborn as sns

# Create Model Object
classifier = xgboost.XGBClassifier()

# Initiate start time
start_time = time.time()

# Every combination of the shared max_depth and n_estimators candidates.
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

# 5 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
# Model creation with GridSearchCV
grid_search = GridSearchCV(classifier, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold, verbose=1)
# Fit the Model
grid_result = grid_search.fit(X, label_encoded_y)

# Check Execution time
end_time = time.time()
execution_time = (end_time - start_time)
print("Successfully executed in", round(execution_time/60,1), "mins")

# Summarize results: best candidate, then mean / std for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Arrange the flat score vector as one row per max_depth and one column per
# n_estimators.
# NOTE(review): this relies on the order in which GridSearchCV enumerates the
# candidates (n_estimators varying fastest within each max_depth) — confirm
# against the printed listing above if the grid keys are ever changed.
scores = np.array(means).reshape(len(max_depth), len(n_estimators))

# Line plot: one curve per depth.
for i, value in enumerate(max_depth):
    pyplot.plot(n_estimators, scores[i], label='depth: ' + str(value))

# Title, axis labels and legend are set once, after all curves are drawn,
# rather than being re-applied on every loop iteration.
pyplot.title("XGBoost n_estimators vs Accuracy")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Accuracy')
pyplot.legend()

# Heat map of the same scores (rows: max_depth, columns: n_estimators).
df_scores = pd.DataFrame(scores, columns=n_estimators, index=max_depth)

f, ax = plt.subplots(figsize=(10, 10))

# Draw explicitly on the axes created above.
sns.heatmap(df_scores, annot=True, cbar=False, linewidths=0.2, cmap="YlGnBu",
            xticklabels=n_estimators, yticklabels=max_depth, fmt='.5f', ax=ax)

plt.title('Comparative summary of Accuracy Trend', fontsize=15)  # title with fontsize 15
plt.xlabel('Trees-Numbers', fontsize=15)  # x-axis label with fontsize 15
plt.ylabel('Trees-Size', fontsize=15)  # y-axis label with fontsize 15

plt.show()

Here are the accuracy trends for the various tree sizes across the number of trees.

We can see that the best result was achieved with a <b>n_estimators=100 and max_depth=4</b>. We must look into other scores as well. <b> Depth 20 </b> is a big tree and creating <b> 1000 trees </b> is also time consuming. It is important to see other trends as well.
<ul>
  <li>Accuracy reduces as Tree size increases</li>
  <li>Accuracy reduces as number of trees increase </li>
  <li>Accuracy reduces when both increase together</li>
</ul>

<b>Conclusion:</b> Max_Depth of 4-5 and #Trees till 400 will give us accuracy more than what we have achieved from max_depth 20 and 1000 Trees. this would save resources and time.
<ul><b>Both MaxDepth and N_estimators should be considered for better accuracy </b></ul>

<b>Future consideration:</b> Try multiple models with changing Learning Rate, reg_lambda, Gamma and reg_alpha.

<h1 style="color:DodgerBlue;">Tune The Learning Rate in XGBoost</h1>

In [None]:
# Experiment 4: grid search over learning_rate with max_depth=4 and
# n_estimators=100 held fixed.
classifier = XGBClassifier(max_depth=4, n_estimators=100)

# Wall-clock timer covering the search set-up and the fit.
start_time = time.time()

# Single-parameter grid built from the shared learning_rate candidates.
param_grid = {"learning_rate": learning_rate}

# 10 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# Accuracy-scored exhaustive search using all available cores.
grid_search = GridSearchCV(
    classifier,
    param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=kfold,
    verbose=1,
)
grid_result = grid_search.fit(X, label_encoded_y)

# Report elapsed time in minutes.
end_time = time.time()
execution_time = end_time - start_time
print("Successfully executed in", round(execution_time/60,1), "mins")

# Best candidate, then mean / std of the test score for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for candidate_summary in zip(means, stds, params):
    print("%f (%f) with: %r" % candidate_summary)

# Mean accuracy as a function of the learning rate.
pyplot.plot(learning_rate, means)
pyplot.title("XGBoost learning_rate vs Accuracy")
pyplot.xlabel('learning_rate')
pyplot.ylabel('Accuracy')
pyplot.show()

Successfully executed in 0.2 mins
<ul>
    Accuracy peaks at a learning rate of <b>0.05</b> and continuously declines after that.
</ul>

<h1 style="color:DodgerBlue;">Tune The Gamma in XGBoost</h1>

In [None]:
# Experiment 5: grid search over gamma with max_depth=4, n_estimators=100
# and learning_rate=0.05 held fixed.
classifier = XGBClassifier(max_depth=4, n_estimators=100, learning_rate=0.05)

# Wall-clock timer covering the search set-up and the fit.
start_time = time.time()

# Single-parameter grid built from the shared gamma candidates.
param_grid = {"gamma": gamma}

# 10 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# Accuracy-scored exhaustive search using all available cores.
grid_search = GridSearchCV(
    classifier,
    param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=kfold,
    verbose=1,
)
grid_result = grid_search.fit(X, label_encoded_y)

# Report elapsed time in minutes.
end_time = time.time()
execution_time = end_time - start_time
print("Successfully executed in", round(execution_time/60,1), "mins")

# Best candidate, then mean / std of the test score for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for candidate_summary in zip(means, stds, params):
    print("%f (%f) with: %r" % candidate_summary)

# Mean accuracy as a function of gamma.
pyplot.plot(gamma, means)
pyplot.title("XGBoost Gamma vs Accuracy")
pyplot.xlabel('Gamma')
pyplot.ylabel('Accuracy')
pyplot.show()

Successfully executed in <b>0.2 mins</b>
<ul>
Accuracy fluctuates for Gamma below 1, <b>improves above 1 up to 10</b>, and then declines.
</ul>

<h1 style="color:DodgerBlue;">Tune The Learning Rate & Gamma in XGBoost</h1>

In [None]:
# Experiment 6
# Joint grid search over learning_rate and gamma, with max_depth=4 and
# n_estimators=100 held fixed.

# Plotting imports are done first so a missing package fails immediately
# instead of after the whole grid search has been run.
import matplotlib.pyplot as plt
import seaborn as sns

# Create Model Object
classifier = xgboost.XGBClassifier(max_depth=4, n_estimators=100)

# Initiate start time
start_time = time.time()

# Every combination of the shared learning_rate and gamma candidates.
param_grid = dict(learning_rate=learning_rate, gamma=gamma)

# 5 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
# Model creation with GridSearchCV
grid_search = GridSearchCV(classifier, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold, verbose=1)
# Fit the Model
grid_result = grid_search.fit(X, label_encoded_y)

# Check Execution time
end_time = time.time()
execution_time = (end_time - start_time)
print("Successfully executed in", round(execution_time/60,1), "mins")

# Summarize results: best candidate, then mean / std for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Arrange the flat score vector as one row per gamma and one column per
# learning_rate.
# NOTE(review): this relies on the order in which GridSearchCV enumerates the
# candidates (learning_rate varying fastest within each gamma) — confirm
# against the printed listing above if the grid keys are ever changed.
scores = np.array(means).reshape(len(gamma), len(learning_rate))

# Line plot: one curve per gamma value.
for i, value in enumerate(gamma):
    pyplot.plot(learning_rate, scores[i], label='gamma: ' + str(value))

# Title, axis labels and legend are set once, after all curves are drawn,
# rather than being re-applied on every loop iteration.
pyplot.title("XGBoost learning_rate vs Accuracy")
pyplot.xlabel('learning_rate')
pyplot.ylabel('Accuracy')
pyplot.legend()

# Heat map of the same scores (rows: gamma, columns: learning_rate).
df_scores = pd.DataFrame(scores, columns=learning_rate, index=gamma)

f, ax = plt.subplots(figsize=(10, 10))

# Draw explicitly on the axes created above.
sns.heatmap(df_scores, annot=True, cbar=False, linewidths=0.2, cmap="YlGnBu",
            xticklabels=learning_rate, yticklabels=gamma, fmt='.5f', ax=ax)

plt.title('Comparative summary of Accuracy Trend', fontsize=15)  # title with fontsize 15
plt.xlabel('learning_rate', fontsize=15)  # x-axis label with fontsize 15
plt.ylabel('gamma', fontsize=15)  # y-axis label with fontsize 15

plt.show()

Successfully executed in <b>0.7 mins</b>
<ul>
Accuracy improves as the learning rate increases from 0.05 to 0.15. <b>Accuracy peaks at learning rates 0.05 to 0.1 with gamma 5 and 10</b>, followed by a slow decline.
</ul>

<h1 style="color:DodgerBlue;">Tune with best parameters found till now</h1>

<ul>
<li>gamma=10, </li>
<li>learning_rate=0.05,  </li>
<li>max_depth=4, </li>
<li>n_estimators=600 (the value used in the code cell below),  </li>
<li>verbosity=1 </li>
</ul>

In [None]:
# Experiment 7: cross-validate a single hand-picked configuration.
from sklearn.model_selection import cross_val_score

classifier = XGBClassifier(
    gamma=10,
    learning_rate=0.05,
    max_depth=4,
    n_estimators=600,
    verbosity=1,
)

# 10 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# One score per fold; the cell's last expression shows their mean.
score = cross_val_score(classifier, X, label_encoded_y, cv=kfold)
score.mean()

<h1 style="color:DodgerBlue;">Find Best Parameters in XGBoost with Random Search</h1>

Find Best Parameters with Random Search.

In [None]:
# Experiment 8: randomized search over all shared hyper-parameter lists.

# Create Model Object
classifier = xgboost.XGBClassifier()

# Initiate start time
start_time = time.time()

# Search space: every candidate list defined in the hyper-parameter cell.
params = dict(
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    gamma=gamma,
    colsample_bytree=colsample_bytree,
    n_estimators=n_estimators,
)

# Sample 5 candidate configurations and score each with 5-fold CV.
# random_state is fixed so the same candidates are drawn on every run;
# without it the reported best estimator changes from run to run.
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=1,
    random_state=7,
)
random_result = random_search.fit(X, label_encoded_y)

# Check Execution time
end_time = time.time()
execution_time = (end_time - start_time)
print("Successfully executed in", round(execution_time/60,1), "mins")

# Last expression of the cell: show the refitted best estimator.
random_result.best_estimator_

In [None]:
# Cross-validate one fully specified configuration.
# NOTE(review): the argument list looks like a pasted best_estimator_ repr
# from an earlier random-search run — confirm it still matches the result
# you intend to evaluate.
classifier=xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=4,
              min_child_weight=5,  monotone_constraints='()',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=1)

from sklearn.model_selection import cross_val_score
# 5 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
# Use the integer-encoded target, as every other experiment in this file
# does, instead of the raw dummy column `y`.
score=cross_val_score(classifier,X,label_encoded_y,cv=kfold)
# Last expression of the cell: mean score across the folds.
score.mean()

<b>Conclusion:</b> With the manually tuned parameters we achieved accuracy close to that of the best estimator found by Random Search CV.
<ul><b>MaxDepth, N_estimators, Learning Rate and Gamma are all key parameters for better accuracy </b></ul>


<h1 style="color:DodgerBlue;">Find Best Parameters in XGBoost with Optuna</h1>

In [None]:
"""
Optuna example that optimizes a classifier configuration for Telco Churn dataset
using XGBoost.
In this example, we optimize the validation accuracy of churn detection
using XGBoost. We optimize both the choice of booster model and their hyper
parameters.
"""

import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb

import optuna


# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).

def objective(trial):
    """Train one XGBoost model for an Optuna trial and return its accuracy.

    The trial suggests the booster type and its regularisation terms, plus
    tree-specific (and, for "dart", dropout-specific) parameters. The model
    is trained on 75% of the module-level data ``X`` / ``label_encoded_y``
    and scored on the remaining 25%.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        Accuracy of the rounded predictions on the validation split.
    """
    # Use the integer-encoded target, as the other experiments in this file do.
    (data, target) = (X, label_encoded_y)

    # A fixed, stratified split gives every trial the same validation rows,
    # so differences between trial scores come from the hyper-parameters and
    # not from a different random holdout on each call.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.25, random_state=7, stratify=target
    )
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    # Parameters shared by every booster type.
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    # Tree parameters are only sampled for the two tree-based boosters.
    if param["booster"] in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    # Extra parameters sampled only for the "dart" booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # No number of boosting rounds is passed, so the library default applies.
    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    # Round the predicted scores to the nearest integer to obtain labels.
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
    return accuracy


if __name__ == "__main__":
    # Search for the configuration with the highest objective() value,
    # with n_trials=100 and timeout=600 as the stopping limits.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    finished_trials = len(study.trials)
    print("Number of finished trials: ", finished_trials)

    # Report the best trial: its objective value and sampled parameters.
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for name_and_value in trial.params.items():
        print("    {}: {}".format(*name_and_value))
