In [None]:
import numpy as np
import pandas as pd
import random
import time
import gc
import os

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb
import optuna
import warnings
# Marker printed once every import above has completed without raising.
print('done!')

## Data

In [None]:

# Locations of the competition files (training set, test set, sample submission).
data = {
    "train" : "../input/tabular-playground-series-sep-2021/train.csv",
    "test"  : "../input/tabular-playground-series-sep-2021/test.csv",
    "sample": "../input/tabular-playground-series-sep-2021/sample_solution.csv"
}

train, test, sample = (pd.read_csv(data[key]) for key in ("train", "test", "sample"))

# Six prediction files for the training rows; they are joined onto `train`
# by "id" in a later cell.
train_pred_paths = (
    "../input/tabular-sep-21/train_pred_1.csv",
    "../input/tabular-sep-21/train_pred_2.csv",
    "../input/tabular-sep-21/train_pred_3.csv",
    "../input/tabular-sep-21/train_pred_4.csv",
    "../input/tabular-sep-21/train_pred_5.csv",
    "../input/tabular-sep-21/train_pred_6.csv",
)
df1, df2, df3, df4, df5, df6 = map(pd.read_csv, train_pred_paths)

# Matching prediction files for the test rows.
test_pred_paths = (
    "../input/tabular-sep-21/test_pred_1.csv",
    "../input/tabular-sep-21/test_pred_2.csv",
    "../input/tabular-sep-21/test_pred_3.csv",
    "../input/tabular-sep-21/test_pred_4.csv",
    "../input/tabular-sep-21/test_pred_5.csv",
    "../input/tabular-sep-21/test_pred_6.csv",
)
df_t1, df_t2, df_t3, df_t4, df_t5, df_t6 = map(pd.read_csv, test_pred_paths)



## Feature engineering

In [None]:
TARGET   = 'claim'
# Model inputs start as every column except the row identifier and the label.
features = [name for name in train.columns if name not in ('id', TARGET)]
target   = train[TARGET].copy()

# Row-wise summary statistics, each computed over the original feature columns
# and stored as a new column on both frames.
row_aggregates = {
    "min":       lambda block: block.min(axis=1),
    "max":       lambda block: block.max(axis=1),
    "std":       lambda block: block.std(axis=1),
    "n_missing": lambda block: block.isna().sum(axis=1),
}
for stat_name, aggregate in row_aggregates.items():
    train[stat_name] = aggregate(train[features])
    test[stat_name]  = aggregate(test[features])

# NOTE(review): "min" and "max" are added as columns but are not appended to
# `features`, so they never reach the model - confirm this is intentional.
features.extend(['std', 'n_missing'])

# Snapshot of the missing-value count, taken before the columns are scaled;
# the cross-validation split stratifies on it.
n_missing = train["n_missing"].copy()



In [None]:
# Impute and scale with statistics learned from the training set only, so that
# train and test rows go through exactly the same transformation. (Filling the
# test set with its own column means would make every test prediction depend on
# which other rows happen to be in the test file, and would be inconsistent
# with the scaler below, which is fitted on train alone.)
train_means = train[features].mean()  # computed once, before any filling

train[features] = train[features].fillna(train_means)
test[features]  = test[features].fillna(train_means)

# RobustScaler is the scaler in use; StandardScaler and MinMaxScaler are
# imported as drop-in alternatives.
scaler = RobustScaler()

train[features] = scaler.fit_transform(train[features])
test[features]  = scaler.transform(test[features])

# --- Run configuration -------------------------------------------------------
SEED = 2021                 # shared random seed for the whole run
N_SPLITS = 5                # number of cross-validation folds
N_ESTIMATORS = 500          # boosting rounds requested from the model
EARLY_STOPING_ROUND = 200   # patience passed to early stopping during fit
VERBOSE = 1000              # logging interval passed to fit

N_BINS = 20                 # not referenced by the cells visible here


def seed_everything(seed = 42):
    """Seed the global random number generators used by this notebook.

    Seeds NumPy's legacy global generator and the standard-library `random`
    module, and exports the value as the PYTHONHASHSEED environment variable.

    Parameters
    ----------
    seed : int, default 42
        Value applied to every generator.
    """
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)


seed_everything(SEED)

In [None]:
# Attach the columns of the six train_pred_* frames to a copy of the training
# frame, one left join on "id" per frame (rows of `train` are always kept).
df = train.copy()
for pred_frame in (df1, df2, df3, df4, df5, df6):
    df = df.merge(pred_frame, on="id", how="left")

# Same procedure for the test frame and the six test_pred_* frames.
df_test = test.copy()
for pred_frame in (df_t1, df_t2, df_t3, df_t4, df_t5, df_t6):
    df_test = df_test.merge(pred_frame, on="id", how="left")

# Every merged-in column whose name starts with "pred" becomes an extra feature.
new_features = [name for name in df.columns if name.startswith('pred')]



## Tuning

In [None]:
# Buffers filled by the cross-validation loop: one out-of-fold prediction per
# training row and the fold-averaged prediction per test row.
lgb_oof  = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])


# Keyword arguments for xgb.XGBClassifier (the `lgb_` prefix is historical).
lgb_params = {
    # The original literal listed 'objective' twice ('binary', then
    # 'reg:squarederror'); a dict keeps only the last duplicate, so the
    # classifier was silently trained with a squared-error regression loss and
    # predict_proba() was not a bounded probability. A single, explicit binary
    # classification objective is used instead.
    'objective': 'binary:logistic',
    'n_estimators': N_ESTIMATORS,
    'random_state': SEED,
    'colsample_bytree': 0.9936205081531161,
    'learning_rate': 0.11148211208623116,
    'max_depth': 4,
    'min_child_weight': 6,
    'reg_alpha': 2.7099507331966978e-06,
    'reg_lambda': 0.00020521474176137927,
    'subsample': 0.9600566076878911,

    # NOTE(review): these two are LightGBM-style names; XGBoost presumably
    # ignores them (possibly with a warning) - confirm and drop if so.
    'num_leaves': 1024,
    'min_child_samples': 25,

    # GPU training / inference settings.
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
    'tree_method': 'gpu_hist',
}

## Fit

In [None]:
# Add the merged-in prediction columns to the model inputs. Only names that are
# not already present are appended, so re-running this cell does not duplicate
# columns in `features`.
features = features + [col for col in new_features if col not in features]

# Out-of-fold predictions are written by positional index into `df`, so the
# merged frame must have exactly one row per entry of the buffer.
assert df.shape[0] == lgb_oof.shape[0], "merged training frame and OOF buffer differ in length"

# The test average is accumulated with `+=` below; reset it so a re-run of this
# cell starts from zero instead of stacking on the previous run.
lgb_pred.fill(0.0)

# Folds are stratified on the per-row missing-value count, not on the label.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

valid_predictions = {}   # id -> out-of-fold prediction
test_predictions  = []   # one test prediction vector per fold

# The test matrix is the same for every fold, so build it once.
xtest = df_test[features]

for fold, (trn_idx, val_idx) in enumerate(skf.split(X = df, y = n_missing)):
    print(f"===== fold {fold} =====")

    xtrain, ytrain = df.iloc[trn_idx], df[TARGET].iloc[trn_idx]
    xvalid, yvalid = df.iloc[val_idx], df[TARGET].iloc[val_idx]

    # Capture the ids before the frames are narrowed to the feature columns.
    valid_ids = xvalid.id.values.tolist()

    xtrain = xtrain[features]
    xvalid = xvalid[features]

    start = time.time()

    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() matches
    # the older XGBoost API that the gpu_* parameters also imply; newer releases
    # expect them on the constructor - confirm against the pinned version.
    model = xgb.XGBClassifier(**lgb_params)
    model.fit(
        xtrain, ytrain,
        eval_set=[(xvalid, yvalid)],
        eval_metric='auc',
        early_stopping_rounds = EARLY_STOPING_ROUND,
        verbose = VERBOSE
    )

    # Last column of predict_proba is the score for the positive class.
    val_pred = model.predict_proba(xvalid)[:, -1]
    test_pred= model.predict_proba(xtest)[:, -1]

    lgb_oof[val_idx] = val_pred
    lgb_pred += test_pred / N_SPLITS

    test_predictions.append(test_pred)
    valid_predictions.update(dict(zip(valid_ids, val_pred)))

    elapsed = time.time() - start
    auc = roc_auc_score(yvalid, lgb_oof[val_idx])

    print(f"fold {fold} - lgb auc: {auc: .6f}, elapsed time: {elapsed:.2f}sec\n")

print(f"oof lgb roc = {roc_auc_score(train[TARGET], lgb_oof)}")



In [None]:
# Build the submission from the sample file: the prediction column is the
# unweighted mean of the per-fold test predictions.
output = sample.copy()
# Bracket assignment always writes a DataFrame column. Attribute assignment
# (`output.claim = ...`) only does so when the column already exists; otherwise
# pandas sets a plain attribute and the CSV is written without predictions.
output["claim"] = np.mean(np.column_stack(test_predictions), axis=1)
output.to_csv("submission.csv", index=False)