# Imports

In [None]:
import warnings

import datatable as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, train_test_split

%matplotlib inline
warnings.filterwarnings('ignore')

# Load dataset

In [None]:
%%time
def _fread_to_pandas(csv_path):
    """Read a CSV file with datatable's `fread` and return it as a pandas DataFrame."""
    return dt.fread(csv_path).to_pandas()


# Training data, test data and the sample submission template.
train = _fread_to_pandas('../input/tabular-playground-series-nov-2021/train.csv')
test  = _fread_to_pandas('../input/tabular-playground-series-nov-2021/test.csv')
sub   = _fread_to_pandas('../input/tabular-playground-series-nov-2021/sample_submission.csv')

In [None]:
# Remove the 'id' column from both the training and the test frame (in place).
for frame in (train, test):
    frame.drop(columns='id', inplace=True)

In [None]:
# Detach the label column: `pop` returns it as a Series and removes it from
# `train` in place, leaving only the feature columns behind.
y = train.pop('target')

# Model hyperparameters

In [None]:
# lgbm params
# Keyword arguments forwarded to LGBMClassifier(**lgbm_params).
#
# NOTE: this dict previously set both "num_iterations": 10000 and its alias
# "n_estimators": 20000. They configure the same thing (the number of boosting
# rounds), so only one value can take effect. LightGBM appears to resolve the
# clash in favour of "num_iterations" (10000) -- TODO confirm against the
# installed version. The single key below keeps that effective value and uses
# the scikit-learn-style name so there is no ambiguity left.
lgbm_params = {
  "objective": "binary",
  "metric": "auc",
  "learning_rate": 0.08,
  "device": "gpu",  # requires a GPU-enabled LightGBM build
  "verbose": 0,
  "feature_pre_filter": False,
  "lambda_l1": 9.314037635261775,
  "lambda_l2": 0.10613573572440353,
  "num_leaves": 7,
  "feature_fraction": 0.4,
  "bagging_fraction": 0.8391963650875751,
  "bagging_freq": 5,
  "min_child_samples": 100,
  # Upper bound on boosting rounds; the training loop relies on early
  # stopping to halt well before this cap.
  "n_estimators": 10000,
  "random_state": 42
}

# 5-fold model training and prediction

In [None]:
# 5-fold cross-validation: fit one LightGBM model per fold, record its
# out-of-fold (OOF) probabilities, and average the per-fold test predictions.
folds = KFold(n_splits = 5, random_state = 102021, shuffle = True)

predictions = np.zeros(len(test))     # running mean of test-set probabilities
lgbm_oof = np.zeros(train.shape[0])   # one OOF probability per training row

for fold, (trn_idx, val_idx) in enumerate(folds.split(train)):

    # NOTE: X_test / y_test hold this fold's *validation* rows taken from
    # `train`; they are unrelated to the competition `test` frame.
    X_train, X_test = train.iloc[trn_idx], train.iloc[val_idx]
    y_train, y_test = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMClassifier(**lgbm_params)
    print(f"model fit started for Fold: {fold}")
    # The `early_stopping_rounds=` / `verbose=` keyword arguments of fit()
    # were deprecated in LightGBM 3.3 and removed in 4.0; the equivalent
    # behaviour is requested through callbacks instead.
    model.fit(X_train,
              y_train,
              eval_set=[(X_test, y_test)],
              callbacks=[
                  # stop once the validation metric has not improved for 400 rounds
                  early_stopping(stopping_rounds=400, verbose=False),
                  # period=0 disables per-iteration evaluation logging
                  log_evaluation(period=0),
              ])
    pred = model.predict_proba(X_test)[:,1]
    lgbm_oof[val_idx] = pred
    roc = roc_auc_score(y_test, pred)
    print(f" roc_auc_score: {roc}")
    print("-"*50)

    # Each fold contributes an equal share to the final test prediction.
    predictions += model.predict_proba(test)[:,1] / folds.n_splits 

# Store submission and out-of-fold (OOF) predictions

In [None]:
# Write the fold-averaged test predictions into the submission template and
# save it as CSV without the index column.
sub = sub.assign(target=predictions)
sub.to_csv('submission_lgbm_1.csv', index=False)

# Persist the out-of-fold predictions as a compressed .npz archive; the array
# is passed positionally, so NumPy stores it under its default key.
np.savez_compressed('oof_lgbm.npz', lgbm_oof)