In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)


# LGBM

In [None]:
import copy
import random
import time
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve, plot_roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Suppress all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')
# Lift pandas' display limits so frames are shown with every column and every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*" and
# later releases dropped the old names, so pick whichever spelling this
# installation provides instead of hard-coding one (falls back to "default").
_WHITEGRID_STYLE = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "default",
)
plt.style.use(_WHITEGRID_STYLE)
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlesize=14, titlepad=10,)

def ht(df, n=2):
    """Show the first ``n`` rows, the last ``n`` rows and the shape of ``df``.

    Each of the three pieces is rendered as its own notebook output, in that order.
    """
    display(df.head(n), df.tail(n), df.shape)
    
# Name of the label column: read from `train` to build `y` and written into the submission frame.
target = 'claim'

In [None]:
# Single seed shared by the RNGs below, the fold splitter and the model.
SEED = 2021


def seed_everything(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    The seed is also exported as the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): an interpreter reads that variable at startup, so this
    presumably only influences processes launched afterwards - confirm.
    """
    hash_seed = str(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)


seed_everything(SEED)

In [None]:
# Read the competition's training and test tables.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

# Model inputs: every training column except the row id and the label.
features = list(train.columns)
for non_feature in ('id', target):
    features.remove(non_feature)

# Independent copy of the labels, unaffected by later edits to `train`.
y = train[target].copy()

## Preprocessing

In [None]:
# This cell rewrites `train`, `test` and `features` in place. Running it twice would
# count missing values on already-imputed data and scale already-scaled columns,
# so stop early and ask for a reload instead of silently corrupting the features.
if 'n_missing' in features or 'std' in features:
    raise RuntimeError(
        "Preprocessing has already been applied; re-run the data loading cell first."
    )

# Row-wise summary features, computed before imputation while the NaNs are still present.
train['n_missing'] = train[features].isna().sum(axis=1)
test['n_missing'] = test[features].isna().sum(axis=1)

train['std'] = train[features].std(axis=1)
test['std'] = test[features].std(axis=1)

# Kept separately (and unscaled) because the fold splitter stratifies on it.
n_missing = train['n_missing'].copy()

# Impute both frames with the TRAINING column means, so the test set is transformed
# with statistics learned from train only - the same convention the scaler follows.
train_feature_means = train[features].mean()
train[features] = train[features].fillna(train_feature_means)
test[features] = test[features].fillna(train_feature_means)

features += ['n_missing', 'std']

# Fit the scaler on train and apply the identical transform to test.
scaler = RobustScaler()
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

train.shape, test.shape

Thanks to [BIZEN](https://www.kaggle.com/hiro5299834) for the **lgbm_params**, taken from this [notebook](https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm).

In [None]:
# Controls for the training loop: early-stopping patience and how often (in
# boosting rounds) evaluation results are logged.
EARLY_STOPPING_ROUNDS, VERBOSE = 200, 1000

# Keyword arguments unpacked into LGBMClassifier(...) for every fold.
lgbm_params = dict(
    objective='binary',
    n_estimators=20000,  # upper bound on rounds; training relies on early stopping
    random_state=SEED,
    learning_rate=5e-3,
    subsample=0.6,
    subsample_freq=1,
    colsample_bytree=0.4,
    reg_alpha=10.0,
    reg_lambda=1e-1,
    min_child_weight=256,
    min_child_samples=20,
    importance_type='gain',
    # device_type='gpu',  # left disabled
)

In [None]:
# Out-of-fold predictions for every training row, and the test prediction
# averaged over all folds.
model_oof = np.zeros(train.shape[0])
model_pred = np.zeros(test.shape[0])

N_SPLITS = 7
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)


def _early_stopping_fit_kwargs():
    """Build the ``fit()`` keyword arguments for early stopping and periodic logging.

    LightGBM 4.0 removed the ``early_stopping_rounds`` and ``verbose`` arguments of
    ``LGBMClassifier.fit`` in favour of callbacks. Fresh callbacks are returned when
    the installed version provides them; otherwise the legacy keyword arguments
    are returned so older installations keep working.
    """
    if hasattr(lgb, 'log_evaluation'):
        return {
            'callbacks': [
                lgb.early_stopping(EARLY_STOPPING_ROUNDS),
                lgb.log_evaluation(VERBOSE),
            ]
        }
    return {'early_stopping_rounds': EARLY_STOPPING_ROUNDS, 'verbose': VERBOSE}


# Loop-invariant feature matrices, selected once instead of once per fold.
X_all = train[features]
X_test = test[features]

# Folds are stratified on the per-row missing-value count rather than on the label.
# NOTE(review): presumably intentional - confirm.
for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=n_missing)):
    print(f"===== fold {fold} =====")
    X_train, y_train = X_all.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X_all.iloc[val_idx], y.iloc[val_idx]

    start = time.time()
    model = LGBMClassifier(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        **_early_stopping_fit_kwargs(),
    )

    print('predicting val')
    # Positive-class probability for the held-out rows of this fold.
    model_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    print('predicting ...')
    # Each fold contributes an equal share to the final test prediction.
    model_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, model_oof[val_idx])
    print(f"fold {fold} - model auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# AUC over the complete set of out-of-fold predictions.
print(f"oof roc = {roc_auc_score(y, model_oof)}")

In [None]:
# Load the sample submission and set its target column to the fold-averaged
# test predictions (assigned by position).
sample_solution = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv'
).assign(**{target: model_pred})

# Preview the frame, then write it out without the index column.
ht(sample_solution)
sample_solution.to_csv('SUBMISSION.csv', index=False)
print()
print('==================== R E A D Y ====================')