In [None]:
import pandas as pd
import numpy as np
import random
import os
from xgboost import XGBClassifier
from scipy.stats import rankdata
from pathlib import Path
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pickle

In [None]:
# Fold count and candidate random seeds.
# NOTE(review): n_folds is not referenced by the cells visible here (the final
# CV loop hard-codes its own split count) — confirm which value is intended.
n_folds = 10
seed_list = list(range(2000, 2022))  # 22 consecutive seeds: 2000..2021

In [None]:
def set_seed(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator (default 42).

    Returns:
        None.
    """
    # Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
    # only reaches processes launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs with the first entry of seed_list.
set_seed(seed_list[0])

In [None]:
# Directory the competition CSVs (train, test, sample submission) are read from.
INPUT_PATH = Path("../input/tabular-playground-series-mar-2021")

# Directories the base models' saved .npy prediction files are loaded from
# (training-set predictions and test-set predictions respectively).
TRAIN_PATH = Path("../input/tabularplaygroundseriesmar2021/preprocessed-data/train")
TEST_PATH = Path("../input/tabularplaygroundseriesmar2021/preprocessed-data/test")

In [None]:
# Read the three competition tables from INPUT_PATH, in order:
# training data, test data, and the sample submission.
train_df, test_df, sub_df = (
    pd.read_csv(INPUT_PATH / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Column name -> .npy file holding that base model's training-set predictions.
# Insertion order determines the column order of the stacked training frame.
train_oof_dict = {
    # 'trans_1'..'trans_7': file names differ only in the params index (0..6).
    **{
        f'trans_{idx + 1}': f'train_rgr_epoch2000_probas8_params{idx}_batch512.npy'
        for idx in range(7)
    },
    'lightgbm1': 'train_lgb.npy',
    'lightgbm2': 'train_oof_lgbm_0.npy',
    'lightgbm3': 'train_oof_lgbm_1.npy',
    'xgboost': 'train_xgb.npy',
    'catboost': 'train_cbt.npy',
    'logistic_regression1': 'train_lr.npy',
    'logistic_regression2': 'train_oof_lr_0.npy',
    'random_forest': 'train_rf.npy',
    'tabnet1': 'train_tabnet_0.npy',
    'tabnet2': 'train_tabnet_1.npy',
    'histgradient1': 'train_oof_hgb_0.npy',
    'histgradient2': 'train_oof_hgb_1.npy',
    'keras1': 'train_keras_0.npy',
    'keras2': 'train_keras_1.npy',
}

# Column name -> .npy file holding that base model's test-set predictions.
# Insertion order determines the column order of the stacked test frame.
test_pred_dict = {
    # 'trans_1'..'trans_7': file names differ only in the params index (0..6).
    **{
        f'trans_{idx + 1}': f'test_rgr_epoch2000_probas8_params{idx}_batch512.npy'
        for idx in range(7)
    },
    'lightgbm1': 'test_lgb.npy',
    'lightgbm2': 'test_preds_lgbm_0.npy',
    'lightgbm3': 'test_preds_lgbm_1.npy',
    'xgboost': 'test_xgb.npy',
    'catboost': 'test_cbt.npy',
    'logistic_regression1': 'test_lr.npy',
    'logistic_regression2': 'test_preds_lr_0.npy',
    'random_forest': 'test_rf.npy',
    'tabnet1': 'test_tabnet_0.npy',
    'tabnet2': 'test_tabnet_1.npy',
    'histgradient1': 'test_preds_hgb_0.npy',
    'histgradient2': 'test_preds_hgb_1.npy',
    'keras1': 'test_keras_0.npy',
    'keras2': 'test_keras_1.npy',
}


## Stacking and creating a new DataFrame

In [None]:
# Build one column per base model. Every prediction vector is loaded first and
# the columns are joined with a single concat per frame: growing a DataFrame
# with pd.concat inside a loop re-copies all existing columns on each pass.
oof_df = pd.concat(
    [
        pd.Series(np.load(TRAIN_PATH / file_name), name=model_name)
        for model_name, file_name in train_oof_dict.items()
    ],
    axis=1,
)
preds_df = pd.concat(
    [
        pd.Series(np.load(TEST_PATH / file_name), name=model_name)
        for model_name, file_name in test_pred_dict.items()
    ],
    axis=1,
)

# The test frame is later indexed with the training frame's column names, so
# every training column needs a test-set counterpart.
missing_in_test = set(oof_df.columns) - set(preds_df.columns)
assert not missing_in_test, f"no test predictions for: {sorted(missing_in_test)}"

In [None]:
# Display (rows, columns) of the stacked training-prediction frame.
oof_df.shape

In [None]:
def normalize(dataset):
    """Min-max scale ``dataset`` into [0, 1] using its own minimum and maximum.

    For a DataFrame the minimum and maximum are taken per column; for a
    Series or NumPy array a single overall minimum and maximum are used.

    A zero range (a constant column, or a constant Series/array) is replaced
    by 1 so that such values scale to 0 instead of becoming NaN through 0/0.

    Args:
        dataset: Numeric DataFrame, Series or NumPy array.

    Returns:
        An object of the same kind and shape as ``dataset`` holding the
        scaled values.
    """
    lowest = dataset.min()
    span = dataset.max() - lowest
    if np.ndim(span) == 0:
        # Scalar range (Series / array input).
        if span == 0:
            span = 1
    else:
        # Per-column range (DataFrame input): patch only the zero entries.
        span = span.where(span != 0, 1)
    dataNorm = (dataset - lowest) / span
    return dataNorm


In [None]:
# Scale each frame with its own per-column minimum and maximum.
# NOTE(review): the training and test columns are therefore scaled with
# different constants — confirm this is intended rather than reusing the
# training frame's min/max for the test frame.
oof_df = normalize(oof_df)
preds_df = normalize(preds_df)

# **Hyperparameter search with Optuna**

In [None]:
# Meta-model inputs: `data` is another name for the stacked training-prediction
# frame (no copy is made); `target` is the 'target' column of the training CSV.
data = oof_df
target = train_df['target']

def objective(trial, data=data, target=target):
    """Optuna objective: fit one XGBClassifier and return its hold-out ROC AUC.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature frame. The default is the module-level ``data`` as it
            was bound when this function was defined.
        target: Labels aligned row-for-row with ``data`` (same default rule).

    Returns:
        ROC AUC of the fitted model on the held-out split.
    """
    # Fixed random_state: every trial is scored on the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.0356789, random_state=42
    )

    # suggest_float replaces the deprecated suggest_loguniform/suggest_uniform;
    # names, ranges and distributions are unchanged (log=True == log-uniform).
    # NOTE(review): the lower bound 0 for colsample_bytree, subsample and
    # learning_rate allows near-degenerate values — consider a small positive floor.
    params = {
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'use_label_encoder': False,
        'lambda': trial.suggest_float('lambda', 1e-5, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-5, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1.0),
        'subsample': trial.suggest_float('subsample', 0, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0, 0.02),
        'n_estimators': trial.suggest_int('n_estimators', 1, 9999),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'random_state': trial.suggest_categorical('random_state', [0, 42, 2021]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 1e-5, 1.0, log=True),
    }

    model = XGBClassifier(**params)
    # NOTE(review): the same hold-out set drives early stopping and produces
    # the returned score, so the reported AUC is optimistically biased.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        early_stopping_rounds=222,
        verbose=False,
    )
    preds = model.predict_proba(test_x)[:, 1]
    auc = roc_auc_score(test_y, preds)
    return auc


In [None]:
# Pass an explicitly seeded sampler so the sequence of proposed trials is
# repeatable; without it the sampler is seeded from system entropy and is not
# affected by set_seed(). TPESampler is also what create_study uses when no
# sampler is given, so the search algorithm itself is unchanged.
# (Model training on the GPU may still introduce run-to-run differences.)
study = optuna.create_study(
    direction='maximize',
    study_name='xgbclassifier',
    sampler=optuna.samplers.TPESampler(seed=seed_list[0]),
)
study.optimize(objective, n_trials=60)
print('number of the finished trials:' , len(study.trials))
print('the parametors of best trial:' , study.best_trial.params)
print('best value:' , study.best_value)

In [None]:
# Hard-coded hyperparameters for the final model; the keys are the names
# sampled inside `objective`.
# NOTE(review): presumably copied from an earlier Optuna run — they are not
# read from the `study` object, so re-running the search does not update them.
params = {
    'lambda': 0.00021936500359658444,
    'alpha': 0.022578559219294244,
    'colsample_bytree': 0.3922738071998231,
    'subsample': 0.12739514555784553,
    'learning_rate': 0.01986398851633632,
    'n_estimators': 9489,
    'max_depth': 15,
    'random_state': 2021,
    'min_child_weight': 63,
    'gamma': 0.008018001812029647,
}

In [None]:
# Add the fixed (non-tuned) settings, matching the constants used in `objective`.
params.update({
    'eval_metric': 'auc',
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',
    'use_label_encoder': False,
})

In [None]:
# `test` is another name for the stacked test-prediction frame; `all_features`
# is the array of column names of the stacked training frame, used below to
# select columns when predicting.
test = preds_df
all_features =oof_df.columns.values

# XGBoost for final prediction.

In [None]:
# Stratified K-fold training of the meta-model.
#   preds           : test-set probability, averaged over all fold models.
#   oof_predictions : for each training row, the probability from the single
#                     fold model that did NOT train on that row.
#   roc             : per-fold validation ROC AUC.
# NOTE(review): the split count is hard-coded to 20 although n_folds = 10 is
# defined near the top of the notebook — confirm which one is intended.
# NOTE(review): each validation fold is also the early-stopping set, so the
# per-fold AUCs are optimistically biased.
preds = np.zeros(test.shape[0])
oof_predictions = np.zeros(len(data))
skf = StratifiedKFold(n_splits=20, random_state=42, shuffle=True)
roc = []
for fold, (trn_idx, val_idx) in enumerate(skf.split(data, target), start=1):
    train_x, train_y = data.iloc[trn_idx], target.iloc[trn_idx]
    val_x, val_y = data.iloc[val_idx], target.iloc[val_idx]

    model = XGBClassifier(**params)
    model.fit(
        train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        early_stopping_rounds=100,
        verbose=False,
    )

    # Each fold model contributes 1/n_splits of the final test prediction.
    preds += model.predict_proba(test[all_features])[:, 1] / skf.n_splits

    # Store predictions only for the held-out rows. Averaging every fold
    # model over all training rows would mix in in-sample predictions and
    # the result would no longer be out-of-fold.
    val_pred = model.predict_proba(val_x)[:, 1]
    oof_predictions[val_idx] = val_pred

    roc.append(roc_auc_score(val_y, val_pred))
    print(fold, roc[-1])

In [None]:
# Assemble the submission frame: 'id' taken from the test CSV, followed by the
# averaged test-set probability in 'target'.
df = pd.DataFrame({'target': preds})
df.insert(0, 'id', test_df['id'])  # positional form of insert(loc, column, value)

In [None]:
# Write the submission frame to sub.csv without the row index.
df.to_csv('sub.csv', index=False)