In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
import warnings
# NOTE(review): no category is given, so from this point on every warning is
# suppressed - including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import gc

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Signed-integer columns are narrowed to the first of int8/int16/int32/int64
    whose limits contain the column's min and max. Float columns are narrowed
    to float16 or float32 on the same basis and otherwise stored as float64.
    Columns of any other dtype are left untouched.

    The choice is made from the value *range* only, so a float column that is
    narrowed may lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are replaced in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # No fallback: an integer column that matches nothing (e.g. an
            # empty one, whose min/max are NaN) keeps its current dtype.
            candidates, limits, target = (np.int8, np.int16, np.int32, np.int64), np.iinfo, None
        else:
            # NaN or infinite extremes fail every comparison and fall back to float64.
            candidates, limits, target = (np.float16, np.float32), np.finfo, np.float64
        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive comparison: a value equal to the dtype's own min/max
            # is representable, so it must not force the next wider type.
            if bounds.min <= c_min and c_max <= bounds.max:
                target = candidate
                break
        if target is not None:
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df


In [None]:
# Read each competition CSV and shrink its numeric dtypes straight away.
train = reduce_memory_usage(pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv'))
test = reduce_memory_usage(pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv'))

In [None]:
# Features are every column except the row id (and, for train, the label).
x_data = train.drop(columns=['id', 'target'])
y_data = train['target']
x_test = test.drop(columns='id')

In [None]:
# The raw frames are no longer needed once features and target are split off.
del train
del test
gc.collect()

### data preprocessing

In [None]:
# f22 and f43 sit inside the f0..f241 range but are grouped with the integer columns.
_int_exceptions = ['f22', 'f43']
float_columns = [name for name in map('f{}'.format, range(242)) if name not in _int_exceptions]
int_columns = list(map('f{}'.format, range(242, 285))) + _int_exceptions
cols = float_columns + int_columns


In [None]:
# Fit the scaler on the training floats only, then reuse that fit for the test floats.
scaler = RobustScaler()
for frame, rescale in ((x_data, scaler.fit_transform), (x_test, scaler.transform)):
    frame[float_columns] = rescale(frame[float_columns])

In [None]:
# Report the feature-matrix shapes, one per line.
print('x_data shape {}'.format(x_data.shape), 'x_test shape {}'.format(x_test.shape), sep='\n')

In [None]:
from sklearn.model_selection import train_test_split

# x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.3, random_state=50)


### SVM baseline (commented out)

In [None]:
# from sklearn.svm import SVC

# model = SVC()
# model.fit(x_train, y_train)
# roc_auc_score(y_val, model.predict_proba(x_val))


### optuna

In [None]:
def objective(trial):
    """Optuna objective: train one XGBClassifier and return its hold-out ROC AUC.

    Each call splits ``x_data`` / ``y_data`` 90/10 with a fixed
    ``random_state``, so every trial is scored on the same hold-out rows.
    The float columns of both parts are replaced by ``sin(2 * pi * x)``
    before fitting.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the hold-out part.
    """
    gc.collect()
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=50)
    # `suggest_float` (with `step` for the discrete ranges) replaces the
    # deprecated `suggest_discrete_uniform` / `suggest_uniform`; the parameter
    # names and ranges are unchanged.
    param_grid = {'objective': 'binary:logistic',
              'use_label_encoder': False,
              'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
              'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
              'subsample': trial.suggest_float('subsample', 0.3, 1.0, step=0.1),
              'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
              'max_depth': trial.suggest_int('max_depth', 2, 20),
              'booster': 'gbtree',
              'gamma': trial.suggest_float('gamma', 1.0, 10.0),
              'reg_alpha': trial.suggest_int('reg_alpha', 50, 100),
              'reg_lambda': trial.suggest_int('reg_lambda', 50, 100),
              'random_state': 42,
                 }

#     gamma = trial.suggest_discrete_uniform('gamma_', 0.3, 3, 0.1)
#     print('gamma', gamma)
    # The transform is written into the split frames, not into x_data itself.
    x_train[float_columns] = np.sin(x_train[float_columns] * np.pi * 2)
    x_val[float_columns] = np.sin(x_val[float_columns] * np.pi * 2)

    xgb_model = XGBClassifier(**param_grid, tree_method='gpu_hist', predictor='gpu_predictor',
                            eval_metric=['logloss'])

    xgb_model.fit(x_train, y_train, verbose=False)
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = xgb_model.predict_proba(x_val)[:, 1]

    return roc_auc_score(y_val, y_pred)

In [None]:
import optuna
from optuna.samplers import TPESampler

train_time = 1 * 60 * 60 # search budget in seconds (h * m * s)
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# The number of trials that fit in the budget still depends on wall-clock
# time, so two runs can stop after a different number of trials.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42), study_name='XGBClassifier')
study.optimize(objective, timeout=train_time)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))

In [None]:
# `trial.params` contains only the values Optuna sampled. The fixed settings
# that `objective` also passed to XGBClassifier are added back here so the
# final models are trained with the same configuration that was scored
# during the search.
params = trial.params

# gamma = params.pop('gamma_')
# gamma = 2.6
# Work on a copy so the best trial's own parameter dict is not modified.
xgb_params = dict(params)
# xgb_params = {}
xgb_params['objective'] = 'binary:logistic'
xgb_params['use_label_encoder'] = False
xgb_params['booster'] = 'gbtree'
xgb_params['random_state'] = 42
xgb_params['tree_method'] = 'gpu_hist'
xgb_params['predictor'] = 'gpu_predictor'

# xgb_params = {
#     'objective': 'binary:logistic',
#     'use_label_encoder': False,
#     'n_estimators': 4095,
#     'learning_rate':0.05,
#     'subsample': 0.6,
#     'colsample_bytree': 0.2,
#     'max_depth': 5,
#     'booster': 'gbtree',
#     'gamma': 9.227759584552311,
#     'reg_alpha': 63,
#     'reg_lambda': 56,
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
#     'n_jobs': 4
# }

In [None]:
# xgb_params

### KFold

In [None]:
# gamma

In [None]:
from sklearn.model_selection import KFold

n_split = 10
kfold = KFold(n_split)

# Out-of-fold predictions for every training row, and the running mean of the
# per-fold test predictions.
val_pred = np.zeros(y_data.shape)
y_test = np.zeros((x_test.shape[0],))

# Apply the same sin(2*pi*x) feature transform that `objective` uses, but on
# copies. Overwriting x_data / x_test in place would transform them a second
# time whenever this cell is re-run, and would feed already-transformed data
# to `objective` if the Optuna cell is re-run afterwards. The cost is one
# extra copy of each frame held in memory.
x_data_sin = x_data.copy()
x_test_sin = x_test.copy()
x_data_sin[float_columns] = np.sin(x_data[float_columns] * np.pi * 2)
x_test_sin[float_columns] = np.sin(x_test[float_columns] * np.pi * 2)

for i, (train_index, val_index) in enumerate(kfold.split(x_data_sin)):
    # train model
    print("fold {} training".format(i))
    gc.collect()

    model = XGBClassifier(**xgb_params, eval_metric=['logloss'])
    model.fit(x_data_sin.iloc[train_index], y_data.iloc[train_index])

    # predict val and test
    val_pred[val_index] = model.predict_proba(x_data_sin.iloc[val_index])[:, 1]
    val_score = roc_auc_score(y_data.iloc[val_index], val_pred[val_index])
    print("fold {} validation auc score {}".format(i, val_score))

    # Each fold contributes an equal share to the averaged test prediction.
    y_test += model.predict_proba(x_test_sin)[:, 1] / n_split

# evaluate validation score over all out-of-fold predictions
print("val auc score :", roc_auc_score(y_data, val_pred))

### submission

In [None]:
# Fill the sample submission with the fold-averaged test predictions and save it.
sub_mission = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
# Bracket assignment always writes a column; attribute-style assignment
# (`sub_mission.target = ...`) only does so when the column already exists.
sub_mission['target'] = y_test
sub_mission.to_csv('submission.csv', index=False)

In [None]:
import seaborn as sns
# Histogram of the averaged test-set predictions.
sns.histplot(data=y_test)