In [1]:
import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

import optuna

In [2]:
df = pd.read_csv('../input/september-folds/train_folds.csv')
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
sample_solution = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

useful_features = [c for c in df.columns if c not in ("id", "claim", "KFold")]
numerical_cols = [col for col in useful_features if col.startswith("f")]
df_test = df_test[useful_features]

In [3]:
def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log = True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)
    
    xtrain =  df[df.KFold != fold].reset_index(drop = True)
    xvalid = df[df.KFold == fold].reset_index(drop = True)
    xtest = df_test.copy()

    ytrain = xtrain.claim
    yvalid = xvalid.claim
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    model = lgb.LGBMClassifier(
        objective = 'binary',
        random_state = 42,
        n_estimators = 7000,
        learning_rate = learning_rate,
        reg_lambda = reg_lambda,
        reg_alpha = reg_alpha,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        max_depth = max_depth,
    )
    
    model.fit(
        xtrain, 
        ytrain,
        eval_set = [(xvalid, yvalid)],
        eval_metric = 'auc',
        early_stopping_rounds = 300,
        verbose = False,
    )
    
    valid_preds = model.predict_proba(xvalid)[:, -1]
    auc = roc_auc_score(yvalid, valid_preds)
    print(fold, auc)
    return auc

In [4]:
study = optuna.create_study(direction = "maximize")
study.optimize(run, n_trials = 5)

[32m[I 2021-09-10 11:13:48,049][0m A new study created in memory with name: no-name-fffc4cc5-c555-4258-900a-2cb58dd6f99e[0m
[32m[I 2021-09-10 11:18:57,280][0m Trial 0 finished with value: 0.8015591877234297 and parameters: {'learning_rate': 0.030934323080696102, 'reg_lambda': 0.574865474998729, 'reg_alpha': 4.74318539421638e-06, 'subsample': 0.30429690832467116, 'colsample_bytree': 0.45005565684548254, 'max_depth': 3}. Best is trial 0 with value: 0.8015591877234297.[0m


0 0.8015591877234297


[32m[I 2021-09-10 11:21:08,064][0m Trial 1 finished with value: 0.8012812082760264 and parameters: {'learning_rate': 0.11889277467068529, 'reg_lambda': 72.41996373119524, 'reg_alpha': 0.24638657065332062, 'subsample': 0.7794082424481142, 'colsample_bytree': 0.695989459844451, 'max_depth': 4}. Best is trial 0 with value: 0.8015591877234297.[0m


0 0.8012812082760264


[32m[I 2021-09-10 11:34:57,967][0m Trial 2 finished with value: 0.8016746024641792 and parameters: {'learning_rate': 0.015332061170550073, 'reg_lambda': 1.103077586535056, 'reg_alpha': 32.904819081261586, 'subsample': 0.6863733858469923, 'colsample_bytree': 0.8915690246247586, 'max_depth': 5}. Best is trial 2 with value: 0.8016746024641792.[0m


0 0.8016746024641792


[32m[I 2021-09-10 11:36:19,448][0m Trial 3 finished with value: 0.8011219644146832 and parameters: {'learning_rate': 0.16310374825068039, 'reg_lambda': 0.005253029732602893, 'reg_alpha': 0.31191168610956116, 'subsample': 0.10467920378910557, 'colsample_bytree': 0.5620809962039652, 'max_depth': 7}. Best is trial 2 with value: 0.8016746024641792.[0m


0 0.8011219644146832


[32m[I 2021-09-10 11:39:02,450][0m Trial 4 finished with value: 0.8012962509249845 and parameters: {'learning_rate': 0.09200337822082338, 'reg_lambda': 1.6795461594456134, 'reg_alpha': 2.157156627633875e-06, 'subsample': 0.3326134746985864, 'colsample_bytree': 0.8077683311522, 'max_depth': 5}. Best is trial 2 with value: 0.8016746024641792.[0m


0 0.8012962509249845


In [5]:
study.best_params

{'learning_rate': 0.015332061170550073,
 'reg_lambda': 1.103077586535056,
 'reg_alpha': 32.904819081261586,
 'subsample': 0.6863733858469923,
 'colsample_bytree': 0.8915690246247586,
 'max_depth': 5}