In [None]:
import pandas  as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
# Read the competition's train and test CSVs into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-sep-2021/train.csv",
        "../input/tabular-playground-series-sep-2021/test.csv",
    )
)

In [None]:
# Add a "missing" column to each frame: the number of null cells in that row.
for frame in (train, test):
    frame["missing"] = frame.isna().sum(axis=1)

In [None]:
# Name of the label column, and every other column except the row id as a feature.
target = "claim"
non_features = {"id", target}
predictors = [column for column in train.columns if column not in non_features]

# Two 5-fold splitters (plain and stratified) sharing the same shuffle seed.
cv_options = dict(n_splits=5, shuffle=True, random_state=666)
kf = KFold(**cv_options)
skf = StratifiedKFold(**cv_options)

In [None]:
def _impute_by_missing_group(frame, group_means, fallback_means):
    """Fill NaNs in ``frame`` with the mean of the row's own "missing" group.

    Parameters
    ----------
    frame : DataFrame with a "missing" column and every column of ``group_means``.
    group_means : DataFrame indexed by "missing" value, one column per feature.
    fallback_means : Series of per-feature means, used where the group table has
        no usable value (group not present in ``group_means``, or NaN there).

    Returns
    -------
    DataFrame with the columns of ``group_means`` and the index of ``frame``.
    """
    columns = list(group_means.columns)
    # Look the means up by each row's own "missing" value; a left join keeps
    # frame's index, so the subsequent fillna aligns row-for-row.
    per_row_means = frame[["missing"]].join(group_means, on="missing")[columns]
    return frame[columns].fillna(per_row_means).fillna(fallback_means)


# "missing" itself never contains NaN, so it is only the grouping key here.
fill_cols = [column for column in predictors if column != "missing"]

# Imputation statistics come from train only and are computed before any
# filling, then reused for both frames.
group_means = train.groupby("missing")[fill_cols].mean()
global_means = train[fill_cols].mean()

# The previous test fill passed train's per-row transform() result to fillna,
# which aligns on the row index: test row i received the mean of whatever
# group *train* row i belonged to. Mapping through each frame's own "missing"
# value fixes that.
train[fill_cols] = _impute_by_missing_group(train, group_means, global_means)
test[fill_cols] = _impute_by_missing_group(test, group_means, global_means)

In [None]:
# Standardise the feature columns: statistics are learned from train only and
# the same fitted scaler is then applied to both train and test.
scaler = StandardScaler().fit(train[predictors])

train[predictors] = scaler.transform(train[predictors])
test[predictors] = scaler.transform(test[predictors])

In [None]:
# Feature matrix and label vector for training.
X, y = train[predictors], train[target]
# Rebind `test` to its feature columns only (drops every non-predictor column).
test = test[predictors]

In [None]:
# Extra keyword arguments unpacked into the LGBMClassifier constructor.
params = dict(
    max_depth=15,
    colsample_bytree=0.6104262197364133,
    min_child_weight=471,
    subsample=0.9133343196782143,
    reg_alpha=1.6609002211483963,
    reg_lambda=0.5107124760580259,
)

In [None]:
# K-fold out-of-fold (OOF) training of a LightGBM classifier.
#
# Produces:
#   oof_lgb         - for every training row, the positive-class probability
#                     from the fold model that did not train on that row.
#   predictions_lgb - SUM over folds of the positive-class probabilities for
#                     the test rows (divide by the number of folds to average).
#   model_lgb       - the model fitted on the last fold.

# Materialise the arrays once instead of calling `.values` several times per fold.
features = X.values
labels = y.values
# Predict on a plain array as well, matching how the models are trained.
test_features = test.values

oof_lgb = np.zeros(len(X))
predictions_lgb = np.zeros(len(test))

# The fit() keywords `early_stopping_rounds=` / `verbose=` were removed in
# LightGBM 4.0; callbacks are the supported replacement. `log_evaluation`
# appeared in 3.3, older releases call it `print_evaluation`.
log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

for fold, (train_ix, valid_ix) in enumerate(kf.split(features), start=1):

    print("\033[1m" + "Out of fold predictions generating for fold \033[94m {} \033[0m \n".format(fold))

    train_X, train_y = features[train_ix], labels[train_ix]
    valid_X, valid_y = features[valid_ix], labels[valid_ix]

    model_lgb = lgb.LGBMClassifier(
        random_state=666,
        n_jobs=-1,
        n_estimators=40000,
        learning_rate=0.005,
        subsample_freq=1,
        **params
    )

    # The held-out fold doubles as the early-stopping evaluation set:
    # stop after 300 rounds without AUC improvement, log every 1000 rounds.
    model_lgb.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(300), log_evaluation(1000)],
    )

    # Each row index appears in exactly one validation fold, so plain
    # assignment is enough for the OOF vector.
    oof_lgb[valid_ix] = model_lgb.predict_proba(valid_X)[:, 1]
    predictions_lgb += model_lgb.predict_proba(test_features)[:, 1]

    print("\033[1mAUC for fold \033[91m{} \t\t\t \033[92m {} \033[0m \n".format(fold, round(roc_auc_score(valid_y, oof_lgb[valid_ix]), 5)))

print("\033[1mAUC for Training Set: \t\t \033[92m {} \033[0m \n".format(round(roc_auc_score(y, oof_lgb), 5)))

In [None]:
# Build the submission file from the sample template.
submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# `predictions_lgb` holds the sum of the per-fold test probabilities, so divide
# by the splitter's fold count (instead of a hard-coded 5) to get the average.
n_folds = kf.get_n_splits()
submission[target] = predictions_lgb / n_folds

submission.to_csv("submission.csv", index=False)
# Show a preview rather than the whole frame.
submission.head()

In [None]:
# Persist the out-of-fold and test prediction arrays as .npy files.
# NOTE: predictions_lgb is stored as accumulated over the folds (a sum, not an
# average), exactly as it stands after the training loop.
for npy_path, array in (
    ("oof_lgb.npy", oof_lgb),
    ("predictions_lgb.npy", predictions_lgb),
):
    np.save(npy_path, array)