# XGBoost with Optuna tuning
* Docs: https://github.com/optuna/optuna

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import random
import optuna
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [None]:
# Read the water-quality CSV into a DataFrame and display it as the cell output.
data0 = pd.read_csv("../input/water-quality/waterQuality1.csv")
data0

In [None]:
# Print the per-column summary (dtype and non-null count) to spot columns
# that were not parsed as numbers.
data0.info()

In [None]:
# Show how often each distinct value of the label column occurs.
data0['is_safe'].value_counts()

In [None]:
# Convert the string labels to integers; the '#NUM!' marker is treated as class 0.
# Series.map turns any value that is not a key of the dict into NaN.
data0['is_safe']=data0['is_safe'].map({'0':0,'1':1,'#NUM!':0})

In [None]:
# Clean the 'ammonia' column: the '#NUM!' marker becomes 0 and every other
# value is parsed as a float. Done column-wise so it does not depend on the
# frame having a default 0..n-1 index (the former row loop used data0.loc[i]).
data0['ammonia'] = data0['ammonia'].replace('#NUM!', '0').astype(float)

In [None]:
# Build the row labels 0..n-1 in random order; the train/test split below
# slices this list.
# NOTE(review): no seed is set in the visible cells, so the split changes
# from run to run -- confirm whether that is intended.
n = len(data0)
N = list(range(n))
random.shuffle(N)

In [None]:
# Separate the label column from the feature columns.
dataY = data0['is_safe']
dataX = data0.drop(columns='is_safe')

# The first three quarters of the shuffled row labels form the training
# set; the remaining rows are held out for the final evaluation.
split_at = (n // 4) * 3
train_rows, held_out_rows = N[:split_at], N[split_at:]

trainX, trainY = dataX.loc[train_rows], dataY.loc[train_rows]
testX, testY = dataX.loc[held_out_rows], dataY.loc[held_out_rows]

In [None]:
# Names of the feature columns, kept as a plain list for later column selection.
columns = dataX.columns.tolist()
print(columns)

In [None]:
# Aliases for the training features and labels; objective() binds them as
# its default arguments.
data, target = trainX, trainY

In [None]:
def objective(trial,data=data,target=target):
    """Score one sampled XGBoost configuration for the Optuna study.

    Args:
        trial: Optuna trial used to sample every hyperparameter.
        data: Feature table; defaults to the notebook-level ``data``.
        target: Labels aligned with ``data``; defaults to the notebook-level
            ``target``.

    Returns:
        RMSE between the hold-out labels and the predicted class labels.
    """
    # Fixed random_state: every trial is scored on the same 80/20 split.
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    # Every entry -- including the single-choice ones -- goes through `trial`
    # so that it is recorded in the trial's params.
    # NOTE(review): both candidate objectives are regression objectives fed
    # to a classifier; confirm that 'binary:logistic' is not the intended one.
    param = {
        'objective': trial.suggest_categorical('objective',['reg:logistic','reg:tweedie']),
        'tree_method': trial.suggest_categorical('tree_method',['hist']),  # 'gpu_hist','hist'
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda',1e-3,10.0,log=True),
        'alpha': trial.suggest_float('alpha',1e-3,10.0,log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018,0.02]),
        'n_estimators': trial.suggest_categorical('n_estimators', [1000,2000,4000,8000]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [24,48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1,300),
        'use_label_encoder': trial.suggest_categorical('use_label_encoder',[False])
    }
    model = xgb.XGBClassifier(**param)
    # The hold-out split doubles as the early-stopping evaluation set.
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    preds = model.predict(test_x)
    # Take the square root explicitly: the `squared=False` keyword is not
    # accepted by recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))

    return rmse

#### Objective candidates for XGBoost
* Objective candidate: survival:aft
* Objective candidate: binary:hinge
* Objective candidate: multi:softmax
* Objective candidate: multi:softprob
* Objective candidate: rank:pairwise
* Objective candidate: rank:ndcg
* Objective candidate: rank:map
* Objective candidate: reg:squarederror
* Objective candidate: reg:squaredlogerror
* Objective candidate: reg:logistic
* Objective candidate: reg:pseudohubererror
* Objective candidate: binary:logistic
* Objective candidate: binary:logitraw
* Objective candidate: reg:linear (deprecated alias of reg:squarederror)
* Objective candidate: count:poisson
* Objective candidate: survival:cox
* Objective candidate: reg:gamma
* Objective candidate: reg:tweedie

In [None]:
# Run the hyperparameter search: 16 trials, lower objective value is better.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=16)

# Report how many trials ran and which parameter set scored best.
finished_trials = len(study.trials)
best_params = study.best_trial.params
print('Number of finished trials:', finished_trials)
print('Best trial:', best_params)

In [None]:
# Show all trials of the study as a DataFrame.
study.trials_dataframe()

In [None]:
# Plot the objective value of every trial in the order the trials were run.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Interactive parallel-coordinate chart of the hyperparameters and the resulting scores.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Slice plot: objective value plotted against each hyperparameter separately.
optuna.visualization.plot_slice(study)

In [None]:
# Interactive contour chart of the interaction between colsample_bytree and max_depth.
optuna.visualization.plot_contour(study, params=['colsample_bytree','max_depth'])

In [None]:
# Visualize how much each hyperparameter contributed to the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Visualize the empirical distribution function of the objective values in the study.
optuna.visualization.plot_edf(study)

In [None]:
# Keep the best trial's parameter dict; it is unpacked into XGBClassifier
# for the final K-fold models.
Best_trial=study.best_trial.params
print(Best_trial)

In [None]:
# Short aliases for the training and held-out feature tables used by the
# K-fold cell.
train, test = trainX, testX

In [None]:
# Train one model per fold with the best parameters and average the
# held-out predictions of all folds into `preds`.
preds = np.zeros((testX.shape[0]))
kf = KFold(n_splits=5,random_state=48,shuffle=True)
# Column selection does not change between folds, so do it once.
train_features = train[columns]
test_features = test[columns]
for trn_idx, test_idx in kf.split(train_features,target):
    # `test_idx` indexes the validation part of this fold, not the held-out set.
    X_tr,X_val=train_features.iloc[trn_idx],train_features.iloc[test_idx]
    y_tr,y_val=target.iloc[trn_idx],target.iloc[test_idx]
    model = xgb.XGBClassifier(**Best_trial)
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
    # Each fold adds its predicted class labels weighted by 1/n_splits, so
    # `preds` ends up as the fraction of folds voting for class 1.
    preds+=model.predict(test_features)/kf.n_splits   ###### predict_proba
    # Take the square root explicitly: the `squared=False` keyword is not
    # accepted by recent scikit-learn releases.
    rmse=float(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))
    print(rmse)

In [None]:
# Quick check of the averaged predictions: array shape and the first value.
print(preds.shape)
print(preds[0])

In [None]:
# Put the true held-out labels and the thresholded ensemble output side by
# side: an averaged prediction below 0.5 becomes class 0, anything else class 1.
subm = pd.DataFrame(testY)
predicted_labels = np.where(preds < 0.5, 0, 1).astype(int)
subm['predicted'] = predicted_labels
subm

In [None]:
# Plain lists of the true and predicted labels for the accuracy computation.
ANS, PRED = list(testY), list(subm['predicted'])

In [None]:
# Share of held-out rows whose predicted label equals the true label.
accuracy=accuracy_score(ANS,PRED)
print(accuracy)