In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [2]:
wine = pd.read_csv("winequality-red.csv", sep=";")

In [3]:
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [4]:
x = wine.drop(columns='quality')
x

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [5]:
y = wine[['quality']]
y

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5
...,...
1594,5
1595,6
1596,6
1597,5


In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.20, random_state=30)

In [24]:
def objective_classification(trials):
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.20, random_state=30)
    params = {
        'objective':'binary:logistics',
        'booster':trials.suggest_categorical('booster', ['dart', 'gbtree', 'gblinear']),
        'lambda':trials.suggest_float('lambda', 1e-4, 1),
        'alpha':trials.suggest_float('alpha', 1e-4, 1),
        #'subsamples':trials.suggest_categorical('subsamples', [0.1,0.2,0.3,0.5]),
        #'colsamples_bytree':trials.suggest_categorical('colsamples_bytree', [0.1,0.2,0.3,0.5,1,10,20,50,100])
        
    }
    
    if params['booster'] in ['gbtree','dart']:
        params['gamma']:trials.suggest_float('gamma', 1e-3, 4)
        params['eta']:trials.suggest_float('eta', 0.001, 5)
    
    xgboost_classifier = xgb.XGBRFClassifier(**params)
    xgboost_classifier.fit(x_train, y_train, eval_set = [(x_test, y_test)])
    pred = xgboost_classifier.predict(x_test)
    accuracy_score = xgboost_classifier.score(x_test, y_test)
    
    return accuracy_score

In [25]:
import optuna

In [26]:
xgb_classification_optuna = optuna.create_study(direction='minimize')
xgb_classification_optuna.optimize(objective_classification, n_trials=10)

[32m[I 2021-10-31 17:27:17,828][0m A new study created in memory with name: no-name-03e7e002-a75e-4321-918f-747430a96f7c[0m


Parameters: { "colsample_bynode", "num_parallel_tree", "subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.30995


  return f(*args, **kwargs)
[32m[I 2021-10-31 17:27:17,943][0m Trial 0 finished with value: 0.41875 and parameters: {'booster': 'gblinear', 'lambda': 0.9060269199584562, 'alpha': 0.5118472415583296}. Best is trial 0 with value: 0.41875.[0m
  return f(*args, **kwargs)


[0]	validation_0-mlogloss:1.00696


[32m[I 2021-10-31 17:27:18,995][0m Trial 1 finished with value: 0.671875 and parameters: {'booster': 'gbtree', 'lambda': 0.31150801844518117, 'alpha': 0.08744478725332437}. Best is trial 0 with value: 0.41875.[0m
  return f(*args, **kwargs)


[0]	validation_0-mlogloss:1.01230


[32m[I 2021-10-31 17:27:20,347][0m Trial 2 finished with value: 0.66875 and parameters: {'booster': 'gbtree', 'lambda': 0.2134819268783558, 'alpha': 0.22116255144598582}. Best is trial 0 with value: 0.41875.[0m


Parameters: { "colsample_bynode", "num_parallel_tree", "subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.30741


  return f(*args, **kwargs)
[32m[I 2021-10-31 17:27:20,447][0m Trial 3 finished with value: 0.465625 and parameters: {'booster': 'gblinear', 'lambda': 0.25511237520079194, 'alpha': 0.24838562122180868}. Best is trial 0 with value: 0.41875.[0m
  return f(*args, **kwargs)


[0]	validation_0-mlogloss:1.00681


[32m[I 2021-10-31 17:27:21,973][0m Trial 4 finished with value: 0.671875 and parameters: {'booster': 'gbtree', 'lambda': 0.40383399914660406, 'alpha': 0.07529252381840516}. Best is trial 0 with value: 0.41875.[0m
  return f(*args, **kwargs)


[0]	validation_0-mlogloss:1.01556


[32m[I 2021-10-31 17:27:23,289][0m Trial 5 finished with value: 0.66875 and parameters: {'booster': 'gbtree', 'lambda': 0.8684824961964708, 'alpha': 0.41173645090887895}. Best is trial 0 with value: 0.41875.[0m
  return f(*args, **kwargs)


[0]	validation_0-mlogloss:1.01436


[32m[I 2021-10-31 17:27:24,711][0m Trial 6 finished with value: 0.6625 and parameters: {'booster': 'dart', 'lambda': 0.7160150959488782, 'alpha': 0.37026212394100166}. Best is trial 0 with value: 0.41875.[0m
  return f(*args, **kwargs)


[0]	validation_0-mlogloss:1.03132


[32m[I 2021-10-31 17:27:25,596][0m Trial 7 finished with value: 0.6625 and parameters: {'booster': 'gbtree', 'lambda': 0.5891746562571412, 'alpha': 0.9732627912150671}. Best is trial 0 with value: 0.41875.[0m


Parameters: { "colsample_bynode", "num_parallel_tree", "subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.31063


  return f(*args, **kwargs)
[32m[I 2021-10-31 17:27:25,650][0m Trial 8 finished with value: 0.41875 and parameters: {'booster': 'gblinear', 'lambda': 0.8999586067270069, 'alpha': 0.5569029218926419}. Best is trial 0 with value: 0.41875.[0m
  return f(*args, **kwargs)


[0]	validation_0-mlogloss:1.01908


[32m[I 2021-10-31 17:27:26,757][0m Trial 9 finished with value: 0.675 and parameters: {'booster': 'dart', 'lambda': 0.8625328695827252, 'alpha': 0.5717584539516185}. Best is trial 0 with value: 0.41875.[0m


In [27]:
xgb_classification_optuna.best_params

{'booster': 'gblinear',
 'lambda': 0.9060269199584562,
 'alpha': 0.5118472415583296}

In [28]:
xgb_classification_optuna.best_trial

FrozenTrial(number=0, values=[0.41875], datetime_start=datetime.datetime(2021, 10, 31, 17, 27, 17, 831897), datetime_complete=datetime.datetime(2021, 10, 31, 17, 27, 17, 942423), params={'booster': 'gblinear', 'lambda': 0.9060269199584562, 'alpha': 0.5118472415583296}, distributions={'booster': CategoricalDistribution(choices=('dart', 'gbtree', 'gblinear')), 'lambda': UniformDistribution(high=1.0, low=0.0001), 'alpha': UniformDistribution(high=1.0, low=0.0001)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None)

In [29]:
# Training model

param = xgb_classification_optuna.best_params

In [30]:
param

{'booster': 'gblinear',
 'lambda': 0.9060269199584562,
 'alpha': 0.5118472415583296}

In [34]:
xgb_final_classifier = xgb.XGBClassifier(**param)

In [35]:
xgb_final_classifier

XGBClassifier(alpha=0.5118472415583296, base_score=None, booster='gblinear',
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, enable_categorical=False, gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              lambda=0.9060269199584562, learning_rate=None,
              max_delta_step=None, max_depth=None, min_child_weight=None,
              missing=nan, monotone_constraints=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, predictor=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [37]:
xgb_final_classifier.fit(x_train, y_train)



  return f(*args, **kwargs)


XGBClassifier(alpha=0.5118472415583296, base_score=0.5, booster='gblinear',
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, enable_categorical=False, gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              lambda=0.9060269199584562, learning_rate=0.5, max_delta_step=None,
              max_depth=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=4,
              num_parallel_tree=None, objective='multi:softprob',
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=1, ...)

In [38]:
xgb_final_classifier.score(x_test, y_test)

0.490625