In [None]:
import warnings

import lightgbm as lgbm
import numpy as np
import optuna
import pandas as pd
from optuna.visualization.matplotlib import plot_optimization_history, plot_param_importances
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [None]:
# Display settings: never truncate columns, show floats with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)


def _read_indexed(csv_path):
    """Read one CSV file and use its ``id`` column as the row index."""
    return pd.read_csv(csv_path, index_col='id')


train_df = _read_indexed('../input/tabular-playground-series-may-2022/train.csv')
test_df = _read_indexed('../input/tabular-playground-series-may-2022/test.csv')
sub_df = _read_indexed('../input/tabular-playground-series-may-2022/sample_submission.csv')
print(train_df.shape)
train_df.head()

In [None]:
# Keep `train_df` intact: take the label column out as its own Series and
# build a separate feature frame without it.
target = train_df['target'].copy()
train = train_df.drop(columns='target')

In [None]:
# Stack the training features on top of the test rows so that the feature
# engineering below is applied to both in one pass.
frames = [train, test_df]
total_df = pd.concat(frames)
print(total_df.shape)
total_df.head()

In [None]:
%%time

# Expand the first ten characters of the `f_27` string into one column per
# position (f_27_1 ... f_27_10).
#
# The vectorised `.str[i]` accessor replaces a per-row Python loop that read
# each value with `total_df['f_27'][j]`. That lookup is label-based, so it was
# only correct while the `id` index was exactly 0..len-1, and it performed
# 10 * len(total_df) scalar lookups.
tmp_df = total_df.copy()
f_27_text = total_df['f_27']
for i in range(10):
    # `.to_numpy()` keeps the assignment positional, as the original list was.
    tmp_df[f'f_27_{i + 1}'] = f_27_text.str[i].to_numpy()

In [None]:
# Preview the frame, which now carries the f_27_1 ... f_27_10 character columns.
tmp_df.head()

In [None]:
# Encode each per-position character column as an integer code. The encoder
# is fitted on the full upper-case alphabet rather than on the data, so the
# letter -> code mapping is the same for every column.
labels = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
encoder = LabelEncoder().fit(labels)
for position in range(1, 11):
    char_column = f'f_27_{position}'
    tmp_df[char_column] = encoder.transform(tmp_df[char_column])
tmp_df.head()

In [None]:
# Drop the raw `f_27` string once, then cut the stacked frame back into the
# training part (first len(train_df) rows) and the test part (the rest).
n_train_rows = train_df.shape[0]
features = tmp_df.drop(columns='f_27')
X = features.iloc[:n_train_rows, :]
test = features.iloc[n_train_rows:, :]
X.shape, test.shape

In [None]:
# Fixed seed for the hold-out split. Without it every Restart & Run All draws
# a different validation set, so tuning scores and the final model are not
# comparable between runs.
SPLIT_SEED = 42

X_train, X_valid, y_train, y_valid = train_test_split(X, target, random_state=SPLIT_SEED)

# LightGBM Datasets: the validation set is built with the training Dataset as
# its `reference`.
lgbm_train = lgbm.Dataset(X_train, label=y_train)
lgbm_eval = lgbm.Dataset(X_valid, label=y_valid, reference=lgbm_train)

In [None]:
def objective(trial, lgbm_train, lgbm_eval):
    """Optuna objective: fit one LightGBM booster and return its validation AUC.

    Args:
        trial: Optuna trial from which every entry of the parameter dict is drawn.
        lgbm_train: LightGBM Dataset the booster is trained on.
        lgbm_eval: LightGBM Dataset passed as the only validation set; it
            drives early stopping and supplies the returned score.

    Returns:
        ``best_score['valid_0']['auc']`` of the trained booster.

    Note:
        The categorical feature list is taken from the module-level
        ``X_train`` frame (its ``int64`` columns), not from the arguments.
    """

    def pinned(name, value):
        # Single-choice categorical: nothing is searched, but the value is
        # still drawn through the trial like the tuned parameters.
        return trial.suggest_categorical(name, [value])

    # Parameter space; entries are drawn from the trial in the order written.
    search_space = {
        "device_type": pinned("device_type", "gpu"),
        "boosting_type": pinned("boosting_type", "gbdt"),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 200, step=10),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0001, 100, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0001, 100, log=True),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 0.95, step=0.05),
        "bagging_freq": pinned("bagging_freq", 1),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 0.95, step=0.05),
        "task": pinned("task", "prediction"),
        "objective": pinned("objective", "binary"),
        "metric": pinned("metric", "auc"),
        "verbosity": pinned("verbosity", -1),
    }

    # Every int64 column of the training frame is treated as categorical.
    int_columns = [name for name, dtype in X_train.dtypes.items() if dtype == 'int64']
    stop_and_log = [
        lgbm.early_stopping(stopping_rounds=10),
        lgbm.log_evaluation(500),
    ]

    # Up to 2000 boosting rounds, cut short by the early-stopping callback.
    booster = lgbm.train(
        search_space,
        lgbm_train,
        num_boost_round=2000,
        valid_sets=[lgbm_eval],
        categorical_feature=int_columns,
        callbacks=stop_and_log,
    )

    return booster.best_score['valid_0']['auc']

In [None]:
%%time

# Seed the sampler explicitly. TPE is the sampler Optuna uses by default for
# single-objective studies, so only the seeding changes: the sequence of
# suggested parameters is repeatable as long as the objective values are.
STUDY_SEED = 42
sampler = optuna.samplers.TPESampler(seed=STUDY_SEED)
study = optuna.create_study(direction='maximize', study_name='LGBM', sampler=sampler)


def func(trial):
    """Adapter for Optuna: call `objective` with the shared train/validation Datasets."""
    return objective(trial, lgbm_train, lgbm_eval)


# Maximise the validation AUC returned by `objective` over 100 trials.
study.optimize(func, n_trials=100)

In [None]:
# Parameter dict of the study's best trial; it is passed unchanged to the
# final `lgbm.train` call further down.
best_param = study.best_params

In [None]:
# Optimization history of the study (objective value = validation AUC).
plot_optimization_history(study)

In [None]:
# Parameter importances computed from the finished study.
plot_param_importances(study)

In [None]:
%%time

'''
best_param = {
     'boosting_type': 'gbdt',
     'learning_rate': 0.10376903658865379,
     'num_leaves': 110,
     'max_depth': 8,
     'lambda_l1': 0.002218313729985511,
     'lambda_l2': 1.7566012024323334,
     'bagging_fraction': 0.8,
     'bagging_freq': 1,
     'feature_fraction': 0.95,
     'task': 'prediction',
     'objective': 'binary',
     'metric': 'auc',
     'verbosity': -1
             }
'''

# Final fit with the tuned parameters. Same data and categorical columns as
# in the search, but with a much larger round budget (20000) and a more
# patient early-stopping window (100 rounds).
int_columns = [name for name, dtype in X_train.dtypes.items() if dtype == 'int64']
final_callbacks = [
    lgbm.early_stopping(stopping_rounds=100),
    lgbm.log_evaluation(200),
]

best_model = lgbm.train(
    best_param,
    lgbm_train,
    num_boost_round=20000,
    valid_sets=[lgbm_eval],
    categorical_feature=int_columns,
    callbacks=final_callbacks,
)

In [None]:
%%time

# Score the test rows with the final booster and write them out using the
# sample-submission frame (values are assigned by position, not by id).
test_predictions = best_model.predict(test)
sub_df['target'] = test_predictions
sub_df.to_csv('submission.csv')