## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings


#sklearn model
import optuna
from optuna.samplers import TPESampler
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')

## Read Data

In [None]:
# reduce memory
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    limits contain the column's min and max (bounds inclusive). Float columns
    are cast to float16, then float32, falling back to float64. Columns whose
    dtype is not one of the listed numeric dtypes are left untouched.

    Note: the float16/float32 choice is made on range only, so the downcast is
    lossy in precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit the narrower floats (or min/max is NaN/inf).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero-sized starting footprint cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
# Read each competition CSV and shrink its numeric dtypes straight away.
train = reduce_memory_usage(
    pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
)
test = reduce_memory_usage(
    pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Names of the predictor columns: every training column except the row id and the label.
FEATURES = [col for col in train.columns if col not in ['id', 'target']]

In [None]:
# Separate the label from the predictors; the 'id' column is excluded from both feature frames.
y_train = train['target']
x_train = train.drop(columns=['id', 'target'])

x_test = test.drop(columns='id')

In [None]:
# Optional debug subset: uncomment the lines below to work on the first
# 10,000 rows and the first 10 feature columns only.
# x_train = x_train.iloc[:10000, :10]
# y_train = y_train.iloc[:10000]
# x_test = x_test.iloc[:10000, :10]

# FEATURES = FEATURES[:10]

# The raw frames are not needed once x_train / y_train / x_test exist;
# drop the references and collect to release their memory.
del train, test
gc.collect()

## EDA

### data shape

In [None]:
# Dimensions of each split.
shape_lines = [
    ('x_train shape: ', x_train.shape),
    ('y_train shape: ', y_train.shape),
    ('x_test shape:', x_test.shape),
]
for entry in shape_lines:
    print(*entry)

print('\r')

# Total number of missing cells in each split.
null_lines = [
    ('x_train data null count: ', x_train.isnull().sum().sum()),
    ('y_train data null count: ', y_train.isnull().sum().sum()),
    ('x_test data null count: ', x_test.isnull().sum().sum()),
]
for entry in null_lines:
    print(*entry)

In [None]:
# Column dtypes and memory footprint of the training features.
x_train.info()

In [None]:
# Column dtypes and memory footprint of the test features.
x_test.info()

In [None]:
# Summary statistics of the training features, transposed so each feature is a row.
x_train.describe().T

In [None]:
# Summary statistics of the test features, transposed so each feature is a row.
x_test.describe().T

### data cleaning

In [None]:
#nothing to do

### data visualization

In [None]:
# Classify each feature by its number of distinct values across train and test
# combined: more than two -> continuous, two or fewer -> categorical (binary).
# The distinct counts are computed once for all columns instead of twice per column.
n_unique = pd.concat([x_train[FEATURES], x_test[FEATURES]], axis=0).nunique()
con_feature = [col for col in FEATURES if n_unique[col] > 2]
cat_feature = [col for col in FEATURES if n_unique[col] <= 2]

# The concatenated frame was only a temporary; collect to release it promptly.
gc.collect()

print('con feature len: ', len(con_feature))
print('cat feature len: ', len(cat_feature))
plt.pie([len(con_feature), len(cat_feature)], labels=['Continuous', 'Categorical'], autopct='%1.1f%%');

In [None]:
# target visualization
# target visualization
# value_counts() orders classes by frequency, not by label value, so the wedge
# labels are taken from its index instead of being hard-coded in a fixed order.
target_counts = y_train.value_counts()
target_names = {0: 'Zero', 1: 'One'}
plt.pie(
    target_counts,
    labels=[target_names.get(cls, str(cls)) for cls in target_counts.index],
    autopct='%1.1f%%',
)
plt.axis('equal');

In [None]:
#feature visualization
#feature visualization
# One panel per feature, overlaying the train and test densities.
ncols = 3
n_features = x_train.shape[1]
nrows = -(-n_features // ncols)  # ceiling division
# squeeze=False keeps `axes` two-dimensional even when the grid has a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 5 * nrows), squeeze=False)

for index, ax in enumerate(axes.flat):
    if index >= n_features:
        # Hide the unused panels in the last row.
        ax.set_visible(False)
        continue
    sns.kdeplot(x_train.iloc[:, index], ax=ax, label='train')
    sns.kdeplot(x_test.iloc[:, index], ax=ax, label='test')
    ax.legend()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16 , 16))
# Correlations are estimated on a random subset of rows; the sample size is
# capped at the frame length so smaller frames do not raise.
corr = x_train.sample(min(10000, len(x_train)), random_state=2021).corr()

# Mask the upper triangle (including the diagonal) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias no longer exists in current NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr, ax=ax, square=True, center=0, linewidth=1, vmax=0.1, vmin=-0.1,
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        cbar_kws={"shrink": .85}, mask=mask ) 

ax.set_title('Correlation heatmap: Numerical features', fontsize=24, y= 1.05);


### feature engineering

In [None]:
def _append_row_stats(frame, columns):
    """Add the per-row mean, std, min and max of ``columns`` to ``frame`` in place."""
    for stat in ("mean", "std", "min", "max"):
        # Re-select `columns` each time so the new stat columns are never aggregated.
        frame[stat] = getattr(frame[columns], stat)(axis=1)


# Identical row-wise aggregates for both splits, computed over the original features only.
_append_row_stats(x_train, FEATURES)
_append_row_stats(x_test, FEATURES)

# NOTE(review): the reason for removing 'f101' is not visible in this notebook — confirm.
x_train.drop(columns='f101', inplace=True)
x_test.drop(columns='f101', inplace=True)

In [None]:
# Standardise the features: the scaler is fitted on the training matrix only
# and the same fitted scaler is applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Keep the labels as a plain NumPy array so they can be indexed by position.
y_train = y_train.values
gc.collect()

## Train Model

### XGBoost hyperparameter tuning (Optuna)

In [None]:
def objective(trial):
    """Optuna objective: fit one XGBoost classifier and score it on a hold-out split.

    Hyperparameters are drawn from ``trial``. The module-level ``x_train`` /
    ``y_train`` arrays are split 70/30 with a fixed ``random_state`` so every
    trial is evaluated on the same validation rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation split (the study maximises this value).
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` and
    # `suggest_discrete_uniform`; `step=` reproduces the discrete grids.
    param_grid = {
        'objective': 'binary:logistic',
        'use_label_encoder': False,
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'booster': 'gbtree',
        'gamma': trial.suggest_float('gamma', 1.0, 10.0),
        'reg_alpha': trial.suggest_int('reg_alpha', 50, 100),
        'reg_lambda': trial.suggest_int('reg_lambda', 50, 100),
        'random_state': 42,
    }

    x_train_, x_val, y_train_, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=50)
    xgb_model = XGBClassifier(**param_grid, tree_method='gpu_hist', predictor='gpu_predictor',
                            eval_metric=['logloss'])

    xgb_model.fit(x_train_, y_train_, verbose=False)
    y_pred = xgb_model.predict_proba(x_val)[:, 1]
    return roc_auc_score(y_val, y_pred)

In [None]:
# Search budget in seconds: 1 * 30 * 60 = 1800 s, i.e. 30 minutes.
train_time = 1 * 30 * 60
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# The number of trials completed still depends on how fast each one runs,
# because the study is bounded by wall-clock time.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42), study_name='XGBClassifier')
study.optimize(objective, timeout=train_time)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))

In [None]:
# Work on a copy so the additions below cannot leak back into the study's trial record.
xgb_params = dict(trial.params)
xgb_params['tree_method'] = 'gpu_hist'
xgb_params['predictor'] = 'gpu_predictor'
# Plain boolean: a trailing comma here would store the truthy tuple (False,).
xgb_params['use_label_encoder'] = False
# Carry over the fixed (non-tuned) settings used inside objective() so the
# final models are trained under the same configuration that was tuned.
xgb_params['objective'] = 'binary:logistic'
xgb_params['booster'] = 'gbtree'
xgb_params['random_state'] = 42

In [None]:
from sklearn.model_selection import KFold

n_split = 10
kfold = KFold(n_split)

# Out-of-fold probabilities for every training row, and the running
# fold-averaged probability for every test row.
val_pred = np.zeros(y_train.shape)
y_test = np.zeros((x_test.shape[0],))

for fold, (fit_rows, holdout_rows) in enumerate(kfold.split(x_train)):
    # Fit a fresh model on this fold's training rows.
    print("fold {} training".format(fold))
    model = XGBClassifier(**xgb_params, eval_metric=['logloss'])
    model.fit(x_train[fit_rows], y_train[fit_rows])

    # Score the held-out rows and store their positive-class probabilities.
    val_pred[holdout_rows] = model.predict_proba(x_train[holdout_rows])[:, 1]
    fold_auc = roc_auc_score(y_train[holdout_rows], val_pred[holdout_rows])
    print("fold {} validation auc score {}".format(fold, fold_auc))

    # Each fold contributes an equal share to the averaged test prediction.
    y_test += model.predict_proba(x_test)[:, 1] / n_split
    

## Validation Score

In [None]:
# evaluate validation score    
# ROC AUC of the out-of-fold predictions over the whole training set.
print("val auc score :", roc_auc_score(y_train, val_pred))

## Submission

In [None]:
# Fill the sample submission's target column with the fold-averaged test
# probabilities and write the file.
sub_mission = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
# Item assignment always writes a column; attribute assignment would only set
# a Python attribute if the column were ever absent from the template.
sub_mission['target'] = y_test
sub_mission.to_csv('submission.csv', index=False)

In [None]:
# Density of the averaged test-set probabilities.
sns.kdeplot(y_test)