# Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import optuna
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from lightgbm import LGBMClassifier
import xgboost as xgb
from xgboost import XGBClassifier
import catboost
from catboost import CatBoostClassifier
from optuna.integration import LightGBMPruningCallback, XGBoostPruningCallback
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load data: read the competition train/test CSVs.
# The test frame drops its 'id' column right away; train keeps 'id' for now.
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-sep-2021/train.csv'
)
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-sep-2021/test.csv'
).drop(columns='id')
train.head()

# Missing Values and Feature Engineering

In [None]:
# Preprocessing pipeline: fill missing values with the column mean, then
# standardise every column.
pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ]
)

In [None]:
# Set the target aside (it is re-attached after preprocessing) and remove the
# 'id' column; both removals modify `train` in place.
temp = train.pop('claim')
del train['id']

In [None]:
#Feature Engineering
# Row-wise summary features, added identically to train and test.
# Every statistic is computed from the original feature columns only: taking
# them from the growing frame would fold the earlier derived columns into the
# later ones (e.g. 'mean' would average in 'min', 'max' and the NaN count).
# Excluding the derived names also makes this cell safe to re-run.
derived_cols = ['min', 'max', 'sum', 'mean', 'std']
for frame in (train, test):
    # Column selection returns a copy, so it is unaffected by the assignments below.
    base = frame[[col for col in frame.columns if col not in derived_cols]]
    frame['min'] = base.min(axis=1)
    frame['max'] = base.max(axis=1)
    # NOTE: despite its name, 'sum' holds the number of missing values per row.
    frame['sum'] = base.isna().sum(axis=1)
    frame['mean'] = base.mean(axis=1)
    frame['std'] = base.std(axis=1)

In [None]:
#Impute and scale both datasets (original + new features), then re-attach the target
# The imputer/scaler are fitted on the training data only; the test set is
# transformed with those same statistics so both frames share one scale
# (re-fitting on test would transform it with different means and variances).
train = pd.DataFrame(columns = train.columns, data=pipeline.fit_transform(train))
test = pd.DataFrame(columns = test.columns, data=pipeline.transform(test))
train['claim'] = temp
train.head()

In [None]:
# Separate the target from the features. `pop` removes 'claim' from `train`
# in place, so `train_x` is the very same frame, now holding features only.
train_y = train.pop('claim')
train_x = train
train_x.head()

# Optuna

In [None]:
def Optuna(argument):
    """Tune one gradient-boosting classifier with Optuna and predict on ``test``.

    Runs an Optuna study that maximises the mean cross-validated ROC AUC on the
    module-level ``train_x`` / ``train_y``, prints the best trial, refits a
    classifier built from the best trial's sampled parameters on the full
    training data and returns its predictions for the module-level ``test``.

    Parameters
    ----------
    argument : str
        Model to tune: ``'LGBM'``, ``'XGB'`` or ``'CatBoost'``.

    Returns
    -------
    The result of ``predict_proba(test)`` on the refitted model (the notebook
    reads column 1 of it as the claim probability).

    Raises
    ------
    ValueError
        If ``argument`` is not one of the supported model names.
    """
    N_TRIALS = 20
    N_SPLITS = 5
    # Fixed fold seed so every trial is scored on the same splits and trial
    # values are comparable with each other.
    CV_SEED = 42

    def CrossValAUC(fit_fold):
        """Return the mean validation ROC AUC over the stratified CV folds.

        ``fit_fold(X_train, y_train, X_valid, y_valid)`` must return a fitted
        classifier that exposes ``predict_proba``.
        """
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)
        cv_scores = np.empty(N_SPLITS)
        for idx, (train_idx, valid_idx) in enumerate(cv.split(train_x, train_y)):
            X_train, X_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx]
            # The fold indices are positions, so index the target positionally too.
            y_train, y_valid = train_y.iloc[train_idx], train_y.iloc[valid_idx]
            model = fit_fold(X_train, y_train, X_valid, y_valid)
            preds = model.predict_proba(X_valid)
            cv_scores[idx] = roc_auc_score(y_valid, preds[:, 1])
        return np.mean(cv_scores)

    #Credit here for Objective Function: https://www.kaggle.com/bextuychiev/lgbm-optuna-hyperparameter-tuning-w-understanding
    def LGBMObjective(trial):
        """Optuna objective: mean CV AUC of a LightGBM classifier."""
        param = {
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 8, 4096),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95),
            "device_type": 'gpu',
            "n_estimators": 10000,
            "bagging_freq": 1,
            "metric": 'auc',
            "objective": 'binary'
        }

        def fit_fold(X_train, y_train, X_valid, y_valid):
            model = LGBMClassifier(**param)
            # "auc" is LightGBM's metric name (and the one the pruning
            # callback watches); "roc_auc_score" is not a LightGBM metric.
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="auc", early_stopping_rounds=100,
                callbacks=[LightGBMPruningCallback(trial, "auc")], verbose = False
            )
            return model

        return CrossValAUC(fit_fold)

    #Credit here for objective function: https://www.kaggle.com/mohammadkashifunique/xgboost-hyperparametertuning-optuna
    def XGBObjective(trial):
        """Optuna objective: mean CV AUC of an XGBoost classifier."""
        param = {
            'max_depth': trial.suggest_int('max_depth', 6, 10),
            'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400),
            'eta': trial.suggest_float('eta', 0.007, 0.013),
            'subsample': trial.suggest_float('subsample', 0.2, 0.9, step=0.1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
            'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e4, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True),
            'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
            'predictor': "gpu_predictor",
            'eval_metric' : 'auc',
            'objective' : 'binary:logistic',
            'tree_method': 'gpu_hist',
        }

        def fit_fold(X_train, y_train, X_valid, y_valid):
            model = XGBClassifier(**param)
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100,
                      callbacks=[XGBoostPruningCallback(trial, 'validation_0-auc')], verbose = 0)
            return model

        return CrossValAUC(fit_fold)

    #Credit here for objective function: https://www.kaggle.com/mlanhenke/tps-09-optuna-study-catboostclassifier
    def CatBoostObjective(trial):
        """Optuna objective: mean CV AUC of a CatBoost classifier."""
        param = {
            'iterations':trial.suggest_int("iterations", 1000, 20000),
            'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
            'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
            'learning_rate' : trial.suggest_float('learning_rate',0.02,1),
            'reg_lambda': trial.suggest_float('reg_lambda',1e-5,100),
            'random_strength': trial.suggest_float('random_strength',10,50),
            'depth': trial.suggest_int('depth',1,15),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
            'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
            'verbose': False,
            'task_type' : 'GPU',
            'devices' : '0',
            'eval_metric':'AUC',
            'od_type': 'IncToDec',
            'od_pval': 1e-7,
            # 'od_wait' used to appear twice in this dict; the later fixed
            # value always won, so the overridden (dead) suggestion was removed.
            'od_wait' : 100,
        }
        # Bootstrap-specific options are only sampled for the matching type.
        if param['bootstrap_type'] == 'Bayesian':
            param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
        elif param['bootstrap_type'] == 'Bernoulli':
            param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

        def fit_fold(X_train, y_train, X_valid, y_valid):
            model = CatBoostClassifier(**param)
            #Notice: Optuna Callback not supported on CatBoost
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
            return model

        return CrossValAUC(fit_fold)

    # Model name -> (objective to optimise, estimator class used for the refit).
    MODELS = {
        'LGBM': (LGBMObjective, LGBMClassifier),
        'XGB': (XGBObjective, XGBClassifier),
        'CatBoost': (CatBoostObjective, CatBoostClassifier),
    }
    if argument not in MODELS:
        # Fail early with a clear message instead of handing a placeholder
        # string to Optuna or calling .fit on it.
        raise ValueError(
            "Invalid Selection: {!r} (expected one of {})".format(argument, sorted(MODELS))
        )
    objective, model_class = MODELS[argument]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=N_TRIALS)
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
    # NOTE(review): the refit receives only the sampled parameters from
    # trial.params; the fixed settings used during the search (GPU device,
    # n_estimators for LightGBM, eval metric, ...) are not carried over.
    # Confirm this is intended.
    model = model_class(**trial.params)
    model.fit(train_x, train_y)
    predictions_optuna = model.predict_proba(test)
    return predictions_optuna

In [None]:
# Choose which model to tune ('LGBM', 'XGB' or 'CatBoost') and run the study;
# the result holds the tuned model's predicted probabilities for the test set.
argument = 'XGB'
predictions_optuna = Optuna(argument=argument)

# Submission

In [None]:
# Start from the competition's sample submission and overwrite its 'claim'
# column with column 1 of the predicted probabilities.
sample_solution = pd.read_csv(
    '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv'
)
claim_probability = predictions_optuna[:, 1]
sample_solution['claim'] = claim_probability
sample_solution.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_solution.to_csv(
    'submission.csv',
    index=False,
)