## XGBoost hyperparameter tuning with Optuna

In [None]:
import pandas as pd
import optuna

In [None]:
# Load the competition training table and preview its first rows.
train = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/train.csv"
)
train.head()

See my EDA [here](https://www.kaggle.com/truongdang1311/tabular-sep-2021-automl)

In [None]:
# Target: the `claim` column as a NumPy array.
y = train['claim'].values
# Features: every column except `id` and the target `claim`.
X = train.drop(columns=['id', 'claim'])

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

class AddFeature(BaseEstimator, TransformerMixin):
    """Append row-wise summary statistics to a DataFrame of features.

    ``fit`` buckets the input columns by the magnitude of their mean
    (> 1e10, > 1e5, > 1e2, everything else).  ``transform`` returns a copy of
    the input with seven extra columns: the row-wise ``std``, ``mean`` and
    ``missing counts`` over the input columns, and ``mean1``..``mean4``, the
    row-wise mean of each magnitude bucket.

    The input must be a pandas DataFrame (``X.columns`` is used).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Group the columns of ``X`` into four buckets by column mean.

        Args:
            X: DataFrame of features.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        self.greater1e10 = []
        self.greater1e5 = []
        self.greater1e2 = []
        self.lessthan1e2 = []
        for col in X.columns:
            # Compute the mean once per column instead of once per comparison.
            col_mean = X[col].mean()
            if col_mean > 1e10:
                self.greater1e10.append(col)
            elif col_mean > 1e5:
                self.greater1e5.append(col)
            elif col_mean > 1e2:
                self.greater1e2.append(col)
            else:
                # Also reached when the mean is NaN (all comparisons False).
                self.lessthan1e2.append(col)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the row-wise summary columns appended.

        Args:
            X: DataFrame containing the columns seen in ``fit``.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            A new DataFrame; ``X`` itself is not modified.
        """
        df = X.copy()
        # Every statistic is computed from the untouched input ``X`` so that a
        # derived column (e.g. 'std') never leaks into a later one (e.g.
        # 'mean' or 'missing counts').
        df['std'] = X.std(axis=1)
        df['mean'] = X.mean(axis=1)
        df['missing counts'] = X.isna().sum(axis=1)
        df['mean1'] = X[self.greater1e10].mean(axis=1)
        df['mean2'] = X[self.greater1e5].mean(axis=1)
        df['mean3'] = X[self.greater1e2].mean(axis=1)
        df['mean4'] = X[self.lessthan1e2].mean(axis=1)
        return df

In [None]:
# Preprocessing: add the row-wise summary features, then robust-scale.
# A mean-imputation step is left disabled here:
#   SimpleImputer(strategy='mean')
pipeline = make_pipeline(AddFeature(), RobustScaler())

# Fit the pipeline on the training features and rebind X to its output.
X = pipeline.fit_transform(X)
X

#### Try with booster: gbtree

In [None]:
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
def objective_xgb(trial):
    """Optuna objective: hold-out ROC AUC of a GPU ``gbtree`` XGBoost model.

    Reads the module-level ``X`` and ``y``, holds out 20% of the rows with a
    fixed seed, samples hyperparameters from ``trial``, trains with early
    stopping on the hold-out set and scores the same hold-out set.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        ROC AUC of the positive-class probabilities on the hold-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=2021
    )
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True),
        'n_estimators': 6000,
        'use_label_encoder': False,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'gpu_id': 0,
        # Lower bound is 1, not 0: max_depth=0 means "unlimited", and a trial
        # that also draws max_leaves=0 leaves the tree fully unconstrained,
        # which the hist-based updaters reject and which would abort the study.
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'eta': trial.suggest_float('eta', 1e-5, 0.2, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical(
            'grow_policy', ['depthwise', 'lossguide']
        ),
        'subsample': trial.suggest_float('subsample', 1e-8, 1),
        'max_bin': trial.suggest_int('max_bin', 256, 2048, step=32),
        'max_leaves': trial.suggest_int('max_leaves', 0, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 32),
    }
    eval_set = [(X_test, y_test)]
    xgb = XGBClassifier().set_params(**params)
    xgb.fit(
        X_train,
        y_train,
        early_stopping_rounds=250,
        eval_set=eval_set,
        verbose=500,
    )
    # Column 1 of predict_proba holds the positive-class probability; scoring
    # it directly replaces the one-hot / two-column macro-average detour.
    y_pred = xgb.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_pred)

In [None]:
# Run 60 Optuna trials, maximising the value returned by objective_xgb.
print('gbtree: ')
study_xgb = optuna.create_study(
    direction='maximize',
    study_name='xgboosting classifier using optuna',
)
study_xgb.optimize(objective_xgb, n_trials=60)

In [None]:
# Show the hyperparameters sampled by the best trial of the study.
study_xgb.best_trial.params

In [None]:
# Load the test table with its first column as the index, then push it
# through the already-fitted preprocessing pipeline.
test = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/test.csv",
    index_col=0,
)
test = pipeline.transform(test)
test

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# NOTE: despite its name, `lgb` is an XGBoost classifier. It combines the
# fixed GPU/gbtree settings with the best hyperparameters found by the study.
lgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    booster='gbtree',
    n_estimators=10000,
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    gpu_id=0,
    **study_xgb.best_trial.params)

# 10-fold CV: refit on each training fold, report the validation AUC and
# collect that fold's positive-class probabilities for the test set.
test_preds = []
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for i, (train_index, valid_index) in enumerate(kf.split(X)):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    eval_set = [(X_valid, y_valid)]
    lgb.fit(
        X_train,
        y_train,
        eval_set=eval_set,
        eval_metric='auc',
        early_stopping_rounds=250,
        verbose=1000,
    )
    # Score the positive-class column directly instead of one-hot encoding
    # the labels and macro-averaging over both probability columns.
    y_pred = lgb.predict_proba(X_valid)[:, 1]
    valid_score = roc_auc_score(y_valid, y_pred)
    print('Split {} : {}'.format(i, valid_score))
    test_pred = lgb.predict_proba(test)[:, 1]
    test_preds.append(test_pred)

In [None]:
import numpy as np
# Start from the sample submission (first column as index), fill `claim`
# with the element-wise mean of the per-fold test predictions, and save it.
submiss = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/sample_solution.csv",
    index_col=0,
)
submiss["claim"] = np.array(test_preds).mean(axis=0)
submiss.to_csv("./xgb_gbtree.csv")