# XGBoost with Optuna tuning
* Optuna documentation and source: https://github.com/optuna/optuna

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import random
import optuna
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

## Read data

In [None]:
# Load the competition's training and test tables from the Kaggle input
# directory.
train, test = (
    pd.read_csv("../input/tabular-playground-series-aug-2021/train.csv"),
    pd.read_csv("../input/tabular-playground-series-aug-2021/test.csv"),
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Collect the distinct target values in ascending order.
Name0 = train['loss'].unique()
Name = sorted(Name0)
print(Name)

# Build lookup tables between each target value and its position in the
# sorted list: value -> index and index -> value.
N = list(range(len(Name)))
normal_mapping = {label: position for position, label in enumerate(Name)}
reverse_mapping = dict(enumerate(Name))

## Target setting

In [None]:
# Split the training table into features and label: 'loss' is the target,
# and both 'loss' and 'id' are removed from the feature table.
data = train.drop(columns=['loss','id'])
target = train['loss']

In [None]:
# Histogram of the training target on a wide single-axes figure.
fig, ax = plt.subplots(figsize=(16,4))
sns.histplot(
    data=target,
    bins=43,
    color='C1',
    label='Train',
    ax=ax,
)
ax.legend()
ax.grid()

In [None]:
# Names of the feature columns, reused later to select the same columns
# from the train and test tables.
columns = data.columns.tolist()
print(columns)

## Optuna tuning

In [None]:
def objective(trial,data=data,target=target):
    """Optuna objective: fit one XGBoost classifier and return its hold-out RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature table. Defaults to the module-level ``data`` as bound
            when this function is defined.
        target: Labels aligned with ``data``. Defaults to the module-level
            ``target`` as bound when this function is defined.

    Returns:
        Root mean squared error between the predicted class labels and the
        true labels on a 20% hold-out split (lower is better).
    """
    # The split seed is fixed, so every trial is scored on the same hold-out.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    # suggest_float replaces the deprecated suggest_uniform; the parameter
    # names and ranges are unchanged, so trial.params keeps the same keys.
    # The single-choice categoricals are kept as-is: they put the constant
    # settings into trial.params next to the tuned ones.
    param = {
        'lambda': trial.suggest_float('lambda', 0.001, 0.1),
        'alpha': trial.suggest_float('alpha', 0.1, 0.2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 0.8),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.08),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 4000),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'random_state': trial.suggest_int('random_state', 400, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 10, 100),
        'objective': trial.suggest_categorical('objective', ['reg:logistic']),
        'tree_method': trial.suggest_categorical('tree_method', ['gpu_hist']),  # 'gpu_hist','hist'
        'use_label_encoder': trial.suggest_categorical('use_label_encoder', [False]),
    }
    model = xgb.XGBClassifier(**param)
    # The hold-out split doubles as the early-stopping evaluation set.
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    preds = model.predict(test_x)
    # sqrt(MSE) gives the same RMSE as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword, which newer scikit-learn
    # releases no longer accept.
    rmse = np.sqrt(mean_squared_error(test_y, preds))

    return rmse

In [None]:
# Create a study that minimises the value returned by `objective`
# (the hold-out RMSE) and run 8 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=8)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Tabulate the study's trials as a DataFrame.
study.trials_dataframe()

In [None]:
# Plot the objective value of every trial in the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate plot of the sampled hyperparameters and the
# resulting objective values.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Slice plot of each hyperparameter against the objective value.
optuna.visualization.plot_slice(study)

In [None]:
# Contour plot of the objective value over the three listed
# hyperparameters, to inspect how they interact.
optuna.visualization.plot_contour(study, params=['alpha','lambda','colsample_bytree'])

In [None]:
# Plot the estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Plot the empirical distribution function (EDF) of the study's objective values.
optuna.visualization.plot_edf(study)

### Best result of tuning

In [None]:
# Keep the best trial's hyperparameters; they are passed to XGBClassifier
# when the final cross-validated models are fitted.
Best_trial=study.best_trial.params
print(Best_trial)

In [None]:
# Load the sample submission; its row count sizes the prediction array
# and the frame itself is reused for the final submission.
sample = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")
print(sample.shape)

## Predict

In [None]:
# Average the class probabilities predicted for the test set over 5
# cross-validation folds: one row per submission row, one column per
# distinct target value.
preds = np.zeros((sample.shape[0],len(Name)))
kf = KFold(n_splits=5,random_state=48,shuffle=True)
# Select the feature columns once; repeating the selection on every fold
# is redundant work on the full tables.
train_features = train[columns]
test_features = test[columns]
for trn_idx, test_idx in kf.split(train_features,target):
    X_tr,X_val=train_features.iloc[trn_idx],train_features.iloc[test_idx]
    y_tr,y_val=target.iloc[trn_idx],target.iloc[test_idx]
    model = xgb.XGBClassifier(**Best_trial)
    # The fold's validation part doubles as the early-stopping evaluation set.
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
    # Each fold contributes 1/n_splits of the final averaged probabilities.
    preds+=model.predict_proba(test_features)/kf.n_splits
    # sqrt(MSE) gives the same RMSE as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword, which newer scikit-learn
    # releases no longer accept.
    rmse=np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    print(rmse)

In [None]:
# Sanity check: array shape and the averaged probabilities for the first test row.
print(preds.shape)
print(preds[0])

In [None]:
# Turn the averaged class probabilities into one predicted class per row
# and write the submission file.
subm = sample  # alias, not a copy: the 'loss' column is added to `sample` as well
# Row-wise argmax in one vectorised call instead of a Python loop over rows;
# ties resolve to the first maximum, as with a per-row np.argmax.
PRED = np.argmax(preds, axis=1).tolist()
subm['loss'] = PRED
subm.to_csv('submission.csv',index=False)
subm

In [None]:
# Distribution of the predicted classes in the submission.
subm['loss'].value_counts()