In [None]:
import numpy as np
import pandas as pd

import lightgbm as lgbm
from sklearn.metrics import roc_auc_score

In [None]:
# Load the competition tables. Train and test use their `id` column as the index;
# the sample submission is read with a default index.
data = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/train.csv',
    index_col='id',
)
X_test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    index_col='id',
)
submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

# Extra feature: number of missing cells in each row.
# NOTE(review): for `data` the count runs over every column present at this point,
# including `claim` -- harmless only if the target has no missing values; confirm.
for frame in (data, X_test):
    frame['n_missing'] = frame.isna().sum(axis=1)

# Separate the target column from the feature matrix.
y = data['claim']
X = data.drop(columns='claim')

In [None]:
# LightGBM training parameters for the binary `claim` target, evaluated with AUC.
# NOTE(review): the long-decimal regularisation / bagging values look like the
# output of an automated hyper-parameter search -- confirm their provenance.
params = dict(
    objective='binary',
    learning_rate=0.01,
    random_state=42,
    metric='auc',
    verbose=-1,
    feature_pre_filter=False,
    lambda_l1=7.156494352115886,
    lambda_l2=1.0579686997546254e-08,
    num_leaves=14,
    feature_fraction=0.8,
    bagging_fraction=0.8549429585285225,
    bagging_freq=1,
    min_child_samples=20,
)

In [None]:
# Number of cross-validation folds; later cells read this value as well.
n_splits = 5

# Wrap the full training frame once; lgbm.cv builds the per-fold subsets from it.
train_set = lgbm.Dataset(X, label=y)

# Cross-validated training with early stopping on AUC. `return_cvbooster=True`
# keeps the per-fold boosters in the result under the 'cvbooster' key, which the
# following cells use for out-of-fold and test predictions.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments of
# the older lgbm.cv API; newer LightGBM releases expect callbacks instead --
# confirm against the pinned LightGBM version.
model = lgbm.cv(
    params,
    train_set,
    num_boost_round=20000,
    nfold=n_splits,
    metrics='auc',
    early_stopping_rounds=150,
    verbose_eval=False,
    return_cvbooster=True,
)

In [None]:
# Build out-of-fold predictions: every training row is scored by the booster of
# the fold in which that row was held out for validation.
oof = np.zeros(len(y), dtype='float64')
for booster in model['cvbooster'].boosters:
    # Rows of the training Dataset that make up this fold's validation subset.
    val_idx = booster.valid_sets[0].used_indices
    # `val_idx` is used positionally to write into `oof` below, so the rows must
    # be read from X positionally too. The previous label-based `.loc` lookup was
    # only correct when the `id` index happened to be exactly 0..n-1.
    X_val = X.iloc[val_idx, :]
    oof[val_idx] = booster.predict(X_val)

In [None]:
# Persist the out-of-fold predictions, keyed by the training index, so they can
# be reused outside this notebook.
lgbm_oof = pd.DataFrame({'lgbm_oof': oof}, index=data.index)
lgbm_oof.to_csv('lgbm_oof.csv')

# ROC AUC of the out-of-fold predictions against the true target.
val_score = roc_auc_score(y_true=y, y_score=oof)
print(f'Validation score: {val_score}')

In [None]:
# Score the test set with every fold's booster. The result is a list holding one
# prediction array per fold, which the next cell averages.
cv_booster = model['cvbooster']
predictions = [fold_booster.predict(X_test) for fold_booster in cv_booster.boosters]

In [None]:
# Blend the folds: element-wise mean across the per-fold prediction arrays
# (axis 0 runs over the folds).
prediction = np.asarray(predictions).mean(axis=0)

In [None]:
# Put the blended predictions into the `claim` column and write the submission
# file without the DataFrame index.
# NOTE(review): values are assigned by position, so this assumes the sample
# submission lists rows in the same order as X_test -- confirm.
submission = submission.assign(claim=prediction)
submission.to_csv('lgbm_cvbooster_sub.csv', index=False)