In [7]:
#optuna
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import timeit
import pickle
import sys
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, \
                            precision_recall_curve, roc_curve, accuracy_score
from sklearn.exceptions import NotFittedError
import xgboost as xgb
from optuna import create_study, logging
from optuna.pruners import MedianPruner
from optuna.integration import XGBoostPruningCallback

In [8]:
# load training data

Xtr_loadpath = 'Xtr.csv'
Xts_loadpath = 'Xts.csv'
ytr_loadpath = 'ytr.csv'
#Xtr = np.loadtxt(Xtr_loadpath, delimiter=",", usecols = (2,3,5,6,7))
#Xts = np.loadtxt(Xts_loadpath, delimiter=",", usecols = (2,3,5,6,7))
Xtr = np.loadtxt(Xtr_loadpath, delimiter=",")
Xts = np.loadtxt(Xts_loadpath, delimiter=",")
ytr = np.loadtxt(ytr_loadpath, delimiter=",")

In [9]:
# standardize the training data
Xtr_standardized = Xtr # revise this line as needed
Xts_standardized = Xts # revise this line as needed
ytr_standardized = ytr # revise this line as needed

# save the standardized training data
Xtr_savepath = 'Xtr_xgboost.csv'
Xts_savepath = 'Xts_xgboost.csv'
ytr_savepath = 'ytr_xgboost.csv'
yts_hat_savepath = 'yts_hat_xgboost.csv'

np.savetxt(Xtr_savepath, Xtr_standardized, delimiter=",")
np.savetxt(Xts_savepath, Xts_standardized, delimiter=",")
np.savetxt(ytr_savepath, ytr_standardized, delimiter=",")

In [None]:
def objective(trial, X, y, group, score, params=dict()):
    dtrain = xgb.DMatrix(X, label=y)
    class_weight = (y.shape[0] - np.sum(y)) / np.sum(y)
    
    ## Initial Learning Parameters
    params['learning_rate'] = 0.1
    params['num_boost_round'] = 1000

    if group == '1':
        params['max_depth'] = trial.suggest_int('max_depth', 2, 10)
        params['min_child_weight'] = trial.suggest_loguniform('min_child_weight',
                                                              1e-10, 1e10)
    
    if group == '2':
        params['subsample'] = trial.suggest_uniform('subsample', 0, 1)
        params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0, 1)
    
    if group == '3':
        params['learning_rate'] = trial.suggest_uniform('learning_rate', 0, 0.1)
        params['num_boost_round'] = trial.suggest_int('num_boost_round', 100, 1000)

    pruning_callback = XGBoostPruningCallback(trial, "test-" + score.__name__)
    cv_scores = xgb.cv(params, dtrain, nfold=5,
                       stratified=True,
                       feval=score,
                       early_stopping_rounds=10,
                       callbacks=[pruning_callback],
                       seed=0)

    return cv_scores['test-' + score.__name__ + '-mean'].values[-1]


def execute_optimization(study_name, group, score, trials,
                         params=dict(), direction='maximize'):
    logging.set_verbosity(logging.ERROR)
    
    ## We use pruner to skip trials that are NOT fruitful
    pruner = MedianPruner(n_warmup_steps=5)
    
    study = create_study(direction=direction,
                         study_name=study_name,
                         storage='sqlite:///optuna.db',
                         load_if_exists=True,
                         pruner=pruner)

    study.optimize(lambda trial: objective(trial, x_train, y_train,
                                           group, score, params),
                   n_trials=trials,
                   n_jobs=-1)
    
    
    print("STUDY NAME: ", study_name)
    print('------------------------------------------------')
    print("EVALUATION METRIC: ", score.__name__)
    print('------------------------------------------------')
    print("BEST CV SCORE", study.best_value)
    print('------------------------------------------------')
    print(f"OPTIMAL GROUP - {group} PARAMS: ", study.best_params)
    print('------------------------------------------------')
    print("BEST TRIAL", study.best_trial)
    print('------------------------------------------------')
    
    
    return study.best_params

In [None]:
score_func = metrics.f1_score
def score_function(y_pred, dtrain):
    y_pred = (y_pred > 0.5).astype(int)
    y_true = (dtrain.get_label() > 0.5).astype(int)
    return score_func.__name__, score_func(y_true, y_pred)

score_function.__name__ = score_func.__name__

In [None]:
def stepwise_optimization(trials=10):
    final_params = dict()
    for g in ['1', '2', '3']:
        print(f"=========================== Optimizing Group - {g} ============================")
        update_params = execute_optimization('xgboost', g, score_function, trials,
                                             params=final_params, direction='maximize')
        final_params.update(update_params)
        print(f"PARAMS after optimizing GROUP - {g}: ", final_params)
        print()
        print()

    print("=========================== FINAL OPTIMAL PARAMETERS ============================")
    print(final_params)
    
    return final_params

In [None]:
params = stepwise_optimization()

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,ytrain,ytest = train_test_split(Xtr_standardized,ytr_standardized,test_size = 0.2,random_state= 42)


In [11]:
xtrain.shape


(8000, 8)

In [251]:
from sklearn.preprocessing import MinMaxScaler
# fit scaler on your training data
norm = MinMaxScaler().fit(X_train)
# transform your training data
X_train_norm = norm.transform(X_train)
# transform testing database
X_test_norm = norm.transform(X_test)

Unnamed: 0,4.809999999999999692e-02,1.344749999999999943e+02,-2.328261189763562911e+01,8.628743749848529987e+00,3.684999999999999942e-01,1.667416484629477935e+01,-3.083019540169018580e+00,7.836462158890424234e+01
0,0.0939,67.2404,-34.186612,23.601344,0.6114,24.044165,-3.44682,6.277822
1,0.145,173.233,-42.646412,-21.425756,0.128,17.904165,-3.20612,3.391522
2,0.3157,66.845,-96.218312,97.042544,0.1004,27.973365,-3.65052,6.303522
3,0.1889,43.7205,-18.457012,9.089744,0.1966,12.813565,-3.04932,62.272022
4,0.1277,170.952,-28.211612,25.315844,0.0892,19.756565,-3.11322,34.997722


In [270]:

model=xgboost.XGBClassifier(n_estimators=100)
model.fit(Xtr_standardized, ytr_standardized)
model.feature_importances_



array([0.26209012, 0.11002099, 0.15140432, 0.15012771, 0.32635695],
      dtype=float32)

In [5]:
n_estimators = [50,100, 200,500, 1100, 1500]
max_depth = [2, 3, 5, 7,8,12]
booster = ['gbtree', 'gblinear','dart']
base_score = [0.25, 0.5, 0.75, 1]
learning_rate = [0.05, 0.1, 0.15, 0.20,0.25,0.30]
gamma = [ 0.0, 0.1, 0.2 , 0.3, 0.4 ]
min_child_weight = [1, 2, 3, 4,5]
colsample_bytree = [ 0.3, 0.4, 0.5 , 0.7 ]
colsample_bylevel = np.arange(0.5, 1.0, 0.1)
# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate' : learning_rate,
    'min_child_weight' : min_child_weight,
    'booster' : booster,
    'base_score' : base_score,
    'gamma': gamma,
    'colsample_bytree': colsample_bytree,
    'colsample_bylevel':colsample_bylevel
    }


In [8]:
import xgboost
from sklearn.model_selection import RandomizedSearchCV
classifier = xgboost.XGBClassifier()
random_cv = RandomizedSearchCV(estimator=classifier, param_distributions=hyperparameter_grid, cv=5,n_iter=50, 
                               scoring = 'roc_auc',n_jobs = 4,
                               verbose = 5, return_train_score = True, random_state=42)
random_cv.fit(Xtr_standardized, ytr_standardized)
random_cv.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


65 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sowbaranika/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sowbaranika/opt/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/Users/sowbaranika/opt/anaconda3/lib/python3.9/site-packages/xgboost/sklearn.py", line 1516, in fit
    self._Booster = train(
  File "/Users/sowbaranika/opt/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwa

Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" } are not used.

[CV 2/5] END base_score=0.25, booster=gblinear, colsample_bylevel=0.5, colsample_bytree=0.5, gamma=0.2, learning_rate=0.3, max_depth=7, min_child_weight=2, n_estimators=200;, score=(train=0.822, test=0.828) total time=   0.1s
Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" } are not used.

[CV 5/5] END base_score=0.25, booster=gblinear, colsample_bylevel=0.5, colsample_bytree=0.5, gamma=0.2, learning_rate=0.3, max_depth=7, min_child_weight=2, n_estimators=200;, score=(train=0.826, test=0.814) total time=   0.1s
[CV 4/5] END base_score=0.75, booster=gbtree, colsample_bylevel=0.6, colsample_bytree=0.3, gamma=0.1, learning_rate=0.15, max_depth=7, min_child_weight=5, n_estimators=100;, score=(train=0.942, test=0.883) total time=   0.3s
Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" }

Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" } are not used.

[CV 1/5] END base_score=0.25, booster=gblinear, colsample_bylevel=0.5, colsample_bytree=0.5, gamma=0.2, learning_rate=0.3, max_depth=7, min_child_weight=2, n_estimators=200;, score=(train=0.821, test=0.833) total time=   0.1s
[CV 1/5] END base_score=0.75, booster=gbtree, colsample_bylevel=0.6, colsample_bytree=0.3, gamma=0.1, learning_rate=0.15, max_depth=7, min_child_weight=5, n_estimators=100;, score=(train=0.943, test=0.877) total time=   0.3s
Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" } are not used.

[CV 2/5] END base_score=0.25, booster=gblinear, colsample_bylevel=0.6, colsample_bytree=0.3, gamma=0.2, learning_rate=0.05, max_depth=12, min_child_weight=4, n_estimators=1100;, score=(train=0.822, test=0.828) total time=   0.7s
[CV 1/5] END base_score=0.25, booster=dart, colsample_bylevel=0.6, colsample_bytree=0.7, gamma

Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" } are not used.

[CV 3/5] END base_score=0.25, booster=gblinear, colsample_bylevel=0.5, colsample_bytree=0.5, gamma=0.2, learning_rate=0.3, max_depth=7, min_child_weight=2, n_estimators=200;, score=(train=0.823, test=0.820) total time=   0.1s
[CV 2/5] END base_score=0.75, booster=gbtree, colsample_bylevel=0.6, colsample_bytree=0.3, gamma=0.1, learning_rate=0.15, max_depth=7, min_child_weight=5, n_estimators=100;, score=(train=0.942, test=0.880) total time=   0.3s
Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" } are not used.

[CV 1/5] END base_score=0.25, booster=gblinear, colsample_bylevel=0.6, colsample_bytree=0.3, gamma=0.2, learning_rate=0.05, max_depth=12, min_child_weight=4, n_estimators=1100;, score=(train=0.821, test=0.833) total time=   0.6s
Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight

Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" } are not used.

[CV 4/5] END base_score=0.25, booster=gblinear, colsample_bylevel=0.5, colsample_bytree=0.5, gamma=0.2, learning_rate=0.3, max_depth=7, min_child_weight=2, n_estimators=200;, score=(train=0.825, test=0.821) total time=   0.1s
[CV 3/5] END base_score=0.75, booster=gbtree, colsample_bylevel=0.6, colsample_bytree=0.3, gamma=0.1, learning_rate=0.15, max_depth=7, min_child_weight=5, n_estimators=100;, score=(train=0.943, test=0.876) total time=   0.3s
[CV 5/5] END base_score=0.75, booster=gbtree, colsample_bylevel=0.6, colsample_bytree=0.3, gamma=0.1, learning_rate=0.15, max_depth=7, min_child_weight=5, n_estimators=100;, score=(train=0.944, test=0.865) total time=   0.4s
Parameters: { "colsample_bylevel", "colsample_bytree", "gamma", "max_depth", "min_child_weight" } are not used.

[CV 4/5] END base_score=0.25, booster=gblinear, colsample_bylevel=0.6, colsample_bytree=0.3, gamma

In [1]:
random_cv.best_params_

NameError: name 'random_cv' is not defined

In [273]:
model=xgboost.XGBClassifier(random_cv.best_estimator_)

model.fit(Xtr_standardized, ytr_standardized)

XGBClassifier(base_score=0.75, booster='dart', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.2, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=5, max_leaves=0, min_child_weight=4, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [274]:
# save the model: you must use the .json format for xgboost models!
model_savepath = 'model.json'
model.save_model(model_savepath)

In [275]:
# generate kaggle submission file using the validation script
!python {"validation.py " + model_savepath + " --Xts_path " + Xts_savepath + " --Xtr_path " + Xtr_savepath + " --yts_hat_path " + yts_hat_savepath }

training auc =  0.9433362596673114
test label confidences saved in yts_hat_xgboost.csv
