In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor
import optuna

In [2]:
# Read the pre-folded training set, the competition test set and the
# submission template.
df = pd.read_csv('../input/september-folds/train_folds.csv')
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
sample_solution = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

# Every training column except the identifier, the target and the fold label
# is treated as a model input.
useful_features = [
    name
    for name in df.columns
    if name not in {"id", "claim", "KFold"}
]
# Subset of the model inputs whose column name begins with "f".
numerical_cols = [name for name in useful_features if name.startswith("f")]

# Keep only the model-input columns of the test frame, in training column order.
df_test = df_test[useful_features]

# Collects one array of test-set predictions per trained fold model.
final_predictions = []

In [3]:
def run(trial):
    """Optuna objective: fit XGBoost on one train/validation split and score it.

    Samples one set of hyperparameters from ``trial``, trains an
    ``XGBRegressor`` on every row of the module-level ``df`` whose ``KFold``
    is not 0, and evaluates it on the rows whose ``KFold`` is 0.

    Args:
        trial: Optuna trial object; its ``suggest_*`` methods supply the
            hyperparameter values for this evaluation.

    Returns:
        The ROC AUC of the model's predictions on the validation fold
        (higher is better).
    """
    # Only fold 0 is used as the validation split during the search.
    fold = 0

    # Scale-like parameters are sampled on a log scale. `suggest_float(...,
    # log=True)` is used for all of them; it replaces the deprecated
    # `suggest_loguniform` and samples from the same distribution.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log = True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log = True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log = True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df[df.KFold != fold].reset_index(drop = True)
    xvalid = df[df.KFold == fold].reset_index(drop = True)

    # Pull the target out before the frames are narrowed to feature columns.
    ytrain = xtrain.claim
    yvalid = xvalid.claim

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBRegressor(
        random_state = 42,
        tree_method = "gpu_hist",
        # NOTE(review): the cross-validation cell later in this notebook uses
        # gpu_id = 0; confirm that device 1 is intended here.
        gpu_id = 1,
        predictor = "gpu_predictor",
        # Upper bound on boosting rounds; early stopping below decides the
        # actual number.
        n_estimators = 7000,
        learning_rate = learning_rate,
        reg_lambda = reg_lambda,
        reg_alpha = reg_alpha,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        max_depth = max_depth,
    )
    # Stop once the validation metric has not improved for 300 rounds.
    model.fit(xtrain, ytrain, early_stopping_rounds = 300, eval_set = [(xvalid, yvalid)], verbose = False)
    preds_valid = model.predict(xvalid)
    roc = roc_auc_score(yvalid, preds_valid)
    return roc

In [4]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# after a kernel restart. TPESampler is named explicitly so that only the seed
# changes, not the search algorithm.
study = optuna.create_study(
    direction = "maximize",
    sampler = optuna.samplers.TPESampler(seed = 42),
)
# Each trial trains one model via `run`; the study keeps the highest ROC AUC.
study.optimize(run, n_trials = 5)

[32m[I 2021-09-04 17:57:59,985][0m A new study created in memory with name: no-name-ad2e045c-da89-4e8f-9c1d-964d1093ef11[0m




[32m[I 2021-09-04 18:00:09,918][0m Trial 0 finished with value: 0.7980824475683297 and parameters: {'learning_rate': 0.04260605966751691, 'reg_lambda': 0.012100146976901016, 'reg_alpha': 6.952812689021962e-06, 'subsample': 0.5034591718148855, 'colsample_bytree': 0.7665198972844353, 'max_depth': 7}. Best is trial 0 with value: 0.7980824475683297.[0m




[32m[I 2021-09-04 18:02:40,398][0m Trial 1 finished with value: 0.8021406524582633 and parameters: {'learning_rate': 0.02055173106562401, 'reg_lambda': 2.1090860046009154e-08, 'reg_alpha': 6.512677632385896e-08, 'subsample': 0.6008673924829881, 'colsample_bytree': 0.5189846067991032, 'max_depth': 5}. Best is trial 1 with value: 0.8021406524582633.[0m




[32m[I 2021-09-04 18:03:49,217][0m Trial 2 finished with value: 0.7994892720492748 and parameters: {'learning_rate': 0.019269084521484254, 'reg_lambda': 0.009174716631164594, 'reg_alpha': 0.000186942282547991, 'subsample': 0.9341119550484657, 'colsample_bytree': 0.2593232743287781, 'max_depth': 1}. Best is trial 1 with value: 0.8021406524582633.[0m




[32m[I 2021-09-04 18:05:32,832][0m Trial 3 finished with value: 0.8016166968042915 and parameters: {'learning_rate': 0.015482844591328693, 'reg_lambda': 1.770163475657427e-05, 'reg_alpha': 0.00016361159106420054, 'subsample': 0.8189437984090763, 'colsample_bytree': 0.1908331580062795, 'max_depth': 3}. Best is trial 1 with value: 0.8021406524582633.[0m




[32m[I 2021-09-04 18:06:45,432][0m Trial 4 finished with value: 0.7942729041031333 and parameters: {'learning_rate': 0.2212929642070948, 'reg_lambda': 0.034878979265272954, 'reg_alpha': 3.7542706385172053, 'subsample': 0.626889675316616, 'colsample_bytree': 0.11602850972790407, 'max_depth': 3}. Best is trial 1 with value: 0.8021406524582633.[0m


In [5]:
# Display the hyperparameters of the best-scoring trial in the study.
study.best_params

{'learning_rate': 0.02055173106562401,
 'reg_lambda': 2.1090860046009154e-08,
 'reg_alpha': 6.512677632385896e-08,
 'subsample': 0.6008673924829881,
 'colsample_bytree': 0.5189846067991032,
 'max_depth': 5}

In [6]:
# Start from empty accumulators so that re-running this cell does not append a
# second set of fold predictions on top of those from an earlier run.
scores = []
final_predictions = []

# The hyperparameters are identical for every fold, so build them once.
# NOTE(review): these values are not the `study.best_params` shown earlier in
# this notebook; presumably they come from a separate search — confirm.
params = {
    'learning_rate': 0.020059359127029743,
    'reg_lambda': 0.03503327716632611,
    'reg_alpha': 5.402866067493725e-07,
    'subsample': 0.818581067621836,
    'colsample_bytree': 0.5749793197385817,
    'max_depth': 4,
}

# The test features are the same for every fold; one copy is enough.
xtest = df_test.copy()

for fold in range(5):
    # Rows labelled `fold` are held out for validation; the rest are training data.
    xtrain = df[df.KFold != fold].reset_index(drop = True)
    xvalid = df[df.KFold == fold].reset_index(drop = True)

    # Pull the target out before the frames are narrowed to feature columns.
    ytrain = xtrain.claim
    yvalid = xvalid.claim

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBRegressor(
        random_state = fold,  # a different seed per fold
        tree_method = 'gpu_hist',
        gpu_id = 0,
        predictor = "gpu_predictor",
        n_estimators = 5000,
        **params
    )

    # No early stopping here: all 5000 boosting rounds are trained.
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # Out-of-fold ROC AUC for this fold.
    roc = roc_auc_score(yvalid, preds_valid)
    print(fold, roc)
    scores.append(roc)

# Mean and spread of the out-of-fold scores across the five folds.
print(np.mean(scores), np.std(scores))

0 0.8019486824357859
1 0.8037325213032117
2 0.8022124915050781
3 0.8039685843831872
4 0.803533339835929
0.8030791238926384 0.0008310631559718997


In [7]:
# Sanity check: stacking the per-fold test predictions column-wise should give
# one row per test sample and one column per fold.
np.column_stack(final_predictions).shape

(493474, 5)

In [8]:
# Blend the fold models: one column per fold, then the row-wise mean.
predictions_submit = np.column_stack(final_predictions).mean(axis = 1)

In [9]:
# Bracket assignment always targets the "claim" column. Attribute assignment
# only does so when the column already exists and would otherwise silently set
# a plain Python attribute, leaving the written file unchanged.
sample_solution["claim"] = predictions_submit
sample_solution.to_csv("submission.csv", index = False)