In [None]:
import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import xgboost as xgb
import optuna
from optuna.visualization.matplotlib import (
    plot_optimization_history,
    plot_param_importances,
)

import matplotlib.pyplot as plt
import graphviz

warnings.filterwarnings('ignore')

In [None]:
# Show every column and two decimals when frames are rendered in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Single place to repoint the notebook at another copy of the input files.
DATA_DIR = '../input/tabular-playground-series-may-2022'

# All three files are read with their `id` column as the index.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id')
sub_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv', index_col='id')
print(train_df.shape)
train_df.head()

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Signed-integer columns are cast to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds are inclusive, so a
    column spanning exactly -128..127 becomes int8). Float columns are cast to
    float16 if their range fits, otherwise float32, otherwise float64.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE; the same object is returned.
    verbose : bool, default True
        When true, print the resulting memory footprint and the reduction.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after downcasting.

    Notes
    -----
    The float choice is made on value *range* only. float16 keeps roughly
    three significant decimal digits, so downcasting can round values.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this includes NaN or infinite
                # min/max, for which every comparison above is False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

# Downcast the three loaded frames; each call prints its own memory report.
train_df, test_df, sub_df = (
    reduce_memory_usage(frame, verbose=True)
    for frame in (train_df, test_df, sub_df)
)

In [None]:
# Separate the label column from the features, leaving `train_df` untouched.
target = train_df['target'].copy()
train = train_df.drop(columns=['target'])

In [None]:
# Stack the train features on top of the test rows so both are transformed
# together.
total_df = pd.concat(objs=[train, test_df], axis=0)
print(total_df.shape)
total_df.head()

In [None]:
%%time

tmp_df = total_df.copy()
# Expose each of the first ten characters of `f_27` as its own column
# (f_27_1 .. f_27_10). The vectorised `.str[i]` accessor selects by character
# position for every row at once, so it does not depend on the `id` index
# happening to equal 0..N-1 and avoids a Python-level loop over all rows.
# NOTE(review): assumes every f_27 value has at least ten characters; shorter
# strings yield NaN in the affected columns.
for i in range(10):
    tmp_df[f'f_27_{i + 1}'] = total_df['f_27'].str[i]

tmp_df.head()

In [None]:
# Fit the encoder on the full upper-case alphabet so each letter maps to its
# alphabetical rank (A -> 0, ..., Z -> 25) whether or not it occurs in the data.
labels = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
encoder = LabelEncoder().fit(labels)
for char_col in [f'f_27_{pos}' for pos in range(1, 11)]:
    tmp_df[char_col] = encoder.transform(tmp_df[char_col])
tmp_df.head()

In [None]:
# Bucket the columns by their current dtype: int8/int64 columns are the ones
# that get one-hot encoded below; float16 columns are listed separately.
int_cols = [
    name for name, dtype in tmp_df.dtypes.items()
    if dtype == 'int8' or dtype == 'int64'
]
float_cols = [name for name, dtype in tmp_df.dtypes.items() if dtype == 'float16']

In [None]:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
# old keyword; try the current spelling first and fall back for old installs.
try:
    oh_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(sparse=False)

# Dense indicator matrix for the integer columns, carried on tmp_df's index so
# it can be concatenated column-wise with the remaining features.
OH_cols = pd.DataFrame(
    oh_encoder.fit_transform(tmp_df[int_cols]),
    index=tmp_df.index,
)

In [None]:
# Keep the columns that were not one-hot encoded and append the indicator
# columns next to them.
tmp_df_float = tmp_df.drop(columns=int_cols)

total = pd.concat([tmp_df_float, OH_cols], axis='columns')

In [None]:
def feature_engineering(df):
    """Add hand-crafted features to ``df`` in place and return it.

    New columns:
      * ``unique_characters`` - number of distinct characters in ``f_27``.
      * ``i_02_21``, ``i_05_22``, ``i_00_01_26`` - three-way flags (+1 / 0 / -1)
        telling whether the sum of the named ``f_*`` columns lies above an
        upper threshold, below a lower threshold, or in between.
    """
    def three_way_flag(values, upper, lower):
        # +1 where values > upper, -1 where values < lower, 0 otherwise.
        return (values > upper).astype(int) - (values < lower).astype(int)

    df["unique_characters"] = df["f_27"].apply(lambda text: len(set(text)))
    df['i_02_21'] = three_way_flag(df['f_21'] + df['f_02'], 5.2, -5.3)
    df['i_05_22'] = three_way_flag(df['f_22'] + df['f_05'], 5.1, -5.4)
    df['i_00_01_26'] = three_way_flag(df['f_00'] + df['f_01'] + df['f_26'], 5.0, -5.0)
    return df
total = feature_engineering(total)
# Preview only the first rows, as the other cells do, rather than rendering
# the whole frame.
total.head()

In [None]:
# The first len(train_df) rows of `total` are the training rows; everything
# after them is the test set.
X, test = total.iloc[:len(train_df)], total.iloc[len(train_df):]
X.shape, test.shape

In [None]:
# Drop the raw string column. `drop` returns new frames instead of mutating
# slices of `total` in place, and `errors='ignore'` makes the cell safe to
# re-run after the column is already gone.
X = X.drop(columns=['f_27'], errors='ignore')
test = test.drop(columns=['f_27'], errors='ignore')

In [None]:
# Downcast the engineered train and test matrices before handing them to XGBoost.
X, test = (reduce_memory_usage(frame, verbose=True) for frame in (X, test))

In [None]:
# Fix the seed so the hold-out split, and every score computed on it, is
# reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(X, target, random_state=42)
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_eval = xgb.DMatrix(X_valid, label=y_valid)

In [None]:
def objective(trial, xgb_train, xgb_eval):
    """Train one XGBoost booster for an Optuna trial and return its score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the booster hyper-parameters.
    xgb_train : xgboost.DMatrix
        Data the booster is trained on.
    xgb_eval : xgboost.DMatrix
        Data monitored during training; early stopping is driven by it.

    Returns
    -------
    float
        ``best_score`` of the trained booster, i.e. the best value of the
        configured ``eval_metric`` ('auc') observed on ``xgb_eval``.
    """
    # Define the parameter space.
    # The single-choice categoricals (tree_method, objective, eval_metric) are
    # sampled rather than hard-coded so that they are stored with the trial's
    # parameters alongside the tuned values.
    params = {
        "tree_method": trial.suggest_categorical("tree_method", ['hist']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
        "eta": trial.suggest_float("eta", 1e-4, 0.3, log=True),
        "gamma": trial.suggest_float("gamma", 1e-4, 1000, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-4, 1000, log=True),
        "lambda": trial.suggest_float("lambda", 0.0001, 100, log=True),
        "alpha": trial.suggest_float("alpha", 0.0001, 100, log=True),
        "colsample_bytree": trial.suggest_float(
            "colsample_bytree", 0.9, 1.0, step=0.05
        ),
        "colsample_bylevel": trial.suggest_float(
            "colsample_bylevel", 0.8, 1.0, step=0.05
        ),
        "colsample_bynode": trial.suggest_float(
            "colsample_bynode", 0.7, 1.0, step=0.05
        ),
        "subsample": trial.suggest_float(
            "subsample", 0.5, 1.0, step=0.05
        ),
        'eval_metric': trial.suggest_categorical('eval_metric', ['auc',]),
    }

    # Train the XGBoost model. Everything after the training matrix is passed
    # by keyword: recent XGBoost releases no longer accept `evals` positionally.
    num_round = 2000
    evallist = [(xgb_eval, 'eval')]
    model = xgb.train(
        params,
        xgb_train,
        num_boost_round=num_round,
        evals=evallist,
        early_stopping_rounds=10,
        verbose_eval=500,
    )

    return model.best_score

In [None]:
%%time

# Seed the (default) TPE sampler so the sequence of sampled hyper-parameters
# is reproducible from run to run.
study = optuna.create_study(
    direction='maximize',
    study_name='Xgboost',
    sampler=optuna.samplers.TPESampler(seed=42),
)
func = lambda trial: objective(trial, xgb_train, xgb_eval)
study.optimize(func, n_trials=100)

In [None]:
# Plot how the objective value evolved over the trials.
plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each tuned hyper-parameter.
plot_param_importances(study)

In [None]:
%%time

# Retrain with the best trial's parameters, a much larger round budget and a
# more patient early-stopping window.
best_params = study.best_params

evallist = [(xgb_eval, 'eval')]
# `num_boost_round` and `evals` are passed by keyword: recent XGBoost releases
# no longer accept `evals` positionally.
best_model = xgb.train(
    best_params,
    xgb_train,
    num_boost_round=20000,
    evals=evallist,
    early_stopping_rounds=100,
    verbose_eval=100,
)

In [None]:
%%time

xgb_test = xgb.DMatrix(test)

# `xgb.train` hands back the booster from the LAST boosting round, which with
# early stopping is past the best one. Restrict prediction to the trees up to
# and including the best iteration when that information is available.
best_iteration = getattr(best_model, 'best_iteration', None)
if best_iteration is None:
    predictions = best_model.predict(xgb_test)
else:
    predictions = best_model.predict(
        xgb_test, iteration_range=(0, best_iteration + 1)
    )

sub_df['target'] = predictions
sub_df.to_csv('submission.csv')