In [None]:
import warnings

import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Silence every warning for the rest of the session. NOTE: this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Directory holding the competition CSV files.
path = '../input/tabular-playground-series-mar-2021/'

# Read the three CSVs in one pass: training rows, test rows, and the
# submission template that is filled in at the end of the notebook.
train , test , sub = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv' , 'test.csv' , 'sample_submission.csv')
)

In [None]:
# Partition the training columns by substring of the column name:
# names containing 'cat' go to `cat`, names containing 'cont' go to `cont`.
cat = list(filter(lambda name: 'cat' in name , train.columns))
cont = list(filter(lambda name: 'cont' in name , train.columns))

# Model input columns: the 'cat' columns first, then the 'cont' columns.
all_features = [*cat , *cont]

# Label Encoder

In [None]:
# Label-encode the categorical columns on train and test stacked together, so
# a category receives the same integer code in both splits.

# Capture the split point BEFORE `train` is rebound below; the original code
# re-read `train.shape[0]` after the reassignment and only worked because the
# slice happened to keep the same length.
n_train = train.shape[0]

all_df = pd.concat([train , test] , ignore_index = True)

for col in cat:
    # A fresh encoder per column keeps each column's mapping independent.
    le = LabelEncoder()
    all_df[col] = le.fit_transform(all_df[col])

# `.copy()` makes `train` an independent frame rather than a slice of `all_df`,
# so later column assignments do not trigger chained-assignment problems.
train = all_df.iloc[:n_train].copy()
# Rows after the split point are the test rows; restart their index at 0.
test = all_df.iloc[n_train:].reset_index(drop = True)

In [None]:
# Label vector and feature matrix shared by the tuning and cross-validation cells.
target = train.loc[: , 'target']
data = train.loc[: , all_features]

# OPTUNA

In [None]:
def objective(trial , data = data , target = target):
    """Optuna objective: fit one XGBoost classifier and return its hold-out AUC.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyper-parameters listed in ``params`` below.
    data , target :
        Feature frame and label vector. The defaults are bound to the
        notebook-level ``data`` / ``target`` objects that exist when this
        cell is executed.

    Returns
    -------
    float
        ROC AUC of the fitted model on the hold-out split.
    """
    # Fixed seed: every trial is scored on the same hold-out rows.
    train_x , test_x , train_y , test_y = train_test_split(
        data , target , test_size = 0.028059109276941666 , random_state = 42)

    params = {
        'eval_metric' : 'auc',
        'booster' : 'gbtree',
        'tree_method' : 'gpu_hist' ,
        'use_label_encoder' : False ,
        'lambda' : trial.suggest_loguniform('lambda' , 1e-5 , 1.0),
        'alpha' : trial.suggest_loguniform('alpha' , 1e-5 , 1.0),
        # XGBoost documents both sampling ratios as valid on (0, 1]; the
        # previous lower bound of 0 let the sampler propose out-of-range or
        # degenerate values, so the range now starts at a small positive ratio.
        'colsample_bytree' : trial.suggest_uniform('colsample_bytree' , 0.1 , 1.0),
        'subsample' : trial.suggest_uniform('subsample' , 0.1 , 1.0),
        # A learning rate of 0 cannot learn anything; keep it strictly positive.
        'learning_rate' : trial.suggest_uniform('learning_rate' , 1e-3 , 0.02),
        'n_estimators' : trial.suggest_int('n_estimators' , 1 , 9999),
        'max_depth' : trial.suggest_int('max_depth' , 1 , 20),
        'random_state' : trial.suggest_categorical('random_state' , [0,42,2021]),
        'min_child_weight' : trial.suggest_int('min_child_weight' , 1 , 300),
        'gamma' : trial.suggest_loguniform('gamma' , 1e-5 , 1.0)
    }

    model = xgb.XGBClassifier(**params)
    # NOTE(review): the same hold-out split drives early stopping and the
    # returned score, so the reported AUC is likely optimistic.
    model.fit(train_x , train_y , eval_set = [(test_x , test_y)] ,
              early_stopping_rounds = 222 , verbose = False)

    # Probability of the positive class for each hold-out row.
    preds = model.predict_proba(test_x)[: , 1]
    return roc_auc_score(test_y , preds)

In [None]:
# Create a maximising study and run 60 trials of `objective`.
study = optuna.create_study(study_name = 'xgbclassifier' , direction = 'maximize')
study.optimize(objective , n_trials = 60)

# Report how many trials ran, the best trial's parameters and its value.
for label , value in (
    ('number of the finished trials:' , len(study.trials)),
    ('the parametors of best trial:' , study.best_trial.params),
    ('best value:' , study.best_value),
):
    print(label , value)

In [None]:
# Build the optimisation-history figure for the study and display it as the cell output.
history_fig = optuna.visualization.plot_optimization_history(study)
history_fig

In [None]:
# Plot hyper-parameter importances for the study. The Optuna function is
# `plot_param_importances` (plural); the singular name does not exist and
# raised AttributeError.
optuna.visualization.plot_param_importances(study)

In [None]:
# Hyper-parameters hard-coded for the final model (recorded best value: 0.8990941303451653).
params = {
    'lambda': 3.342625262710592e-05,
    'alpha': 0.0005910445093857934,
    'colsample_bytree': 0.42295113660344236,
    'subsample': 0.8092952867076734,
    'learning_rate': 0.014533634130298151,
    'n_estimators': 5966,
    'max_depth': 12,
    'random_state': 2021,
    'min_child_weight': 24,
    'gamma': 0.017646631838015223,
}

In [None]:
# Add the non-tuned settings (the same fixed values used inside `objective`).
params.update({
    'eval_metric' : 'auc',
    'booster' : 'gbtree',
    'tree_method' : 'gpu_hist',
    'use_label_encoder' : False,
})

In [None]:
# Show the complete parameter dictionary that the cross-validation models receive.
params

In [None]:
# 20-fold cross-validation with the fixed `params`:
#   * `preds`           - test-set probabilities averaged over the fold models
#   * `oof_predictions` - for each training row, the probability from the one
#                         model that did NOT see that row during fitting
#   * `roc`             - validation AUC of each fold
kf = KFold(n_splits = 20 , random_state = 42 , shuffle = True)
preds = np.zeros(test.shape[0])
oof_predictions = np.zeros(len(data))
roc = []

# The test feature matrix is the same for every fold; select it once.
test_features = test[all_features]

for fold , (trn_idx , val_idx) in enumerate(kf.split(data , target) , start = 1):
    train_x , train_y = data.iloc[trn_idx] , target.iloc[trn_idx]
    val_x , val_y = data.iloc[val_idx] , target.iloc[val_idx]

    model = xgb.XGBClassifier(**params)
    model.fit(train_x , train_y , eval_set = [(val_x , val_y)] ,
              early_stopping_rounds = 100 , verbose = False)

    # Positive-class probability for this fold's held-out rows, computed once
    # and reused for both the OOF vector and the fold AUC.
    val_pred = model.predict_proba(val_x)[: , 1]

    # Write only the held-out rows. Previously every fold model predicted on
    # the whole training set and the results were averaged, so each row was
    # mostly scored by models that had trained on it (leakage).
    oof_predictions[val_idx] = val_pred

    # Each fold contributes an equal share to the averaged test prediction.
    preds += model.predict_proba(test_features)[: , 1] / kf.n_splits

    roc.append(roc_auc_score(val_y , val_pred))
    print(fold , roc[-1])

In [None]:
# Put the test-set predictions into the submission template and write it out
# without the index column.
sub['target'] = preds
submission_file = 'xgbsubmission.csv'
sub.to_csv(submission_file , index = False)

In [None]:
# Pair each training id with its entry in `oof_predictions` and save the
# result without the index column.
oof_columns = {'id' : train['id'] , 'target' : oof_predictions}
output = pd.DataFrame(oof_columns)
output.to_csv('xgboof_predictions.csv' , index = False)