In [None]:
import pandas as pd
import numpy as np
import random
import time
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from tqdm import tqdm

from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, QuantileTransformer, KBinsDiscretizer
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import StackingRegressor, StackingClassifier
from sklearn.impute import SimpleImputer

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense,   Dropout,  Concatenate, Embedding,  Flatten, Add, Average
from tensorflow.keras.models import Model

tf.config.set_visible_devices([], 'GPU')

import optuna
from optuna.samplers import TPESampler
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import gc

In [None]:
# --- Reproducibility / data ---
SEED = 2021            # shared seed: RNG seeding, fold shuffling, meta-learner
TARGET = "target"      # name of the label column

# --- Cross-validation ---
N_SPLITS = 20          # number of folds passed to run_kfold

# --- Boosting / tuning settings ---
# NOTE(review): none of the constants below are referenced in the visible
# cells; presumably carried over from the base-model notebooks — confirm.
N_ESTIMATORS = 15_000
DEVICE = 'GPU'
LOSS = 'CrossEntropy'
EVAL_METRIC = "AUC"
STUDY_TIME = 8 * 60 * 60   # 8 hours, expressed in seconds

In [None]:
def seed_everything(seed=2021):
    """Seed every random number generator this notebook can touch.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and, when
    TensorFlow is installed, TensorFlow's global seed. Also exports
    ``PYTHONHASHSEED`` to the environment.

    Parameters
    ----------
    seed : int, default 2021
        Seed value applied to all generators.
    """
    random.seed(seed)
    # Setting PYTHONHASHSEED here cannot change hash randomisation of the
    # interpreter that is already running; it is only inherited by child
    # processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    # TensorFlow is imported by this notebook but was never seeded. The import
    # is local and guarded so the helper still works where TF is unavailable.
    try:
        import tensorflow as tf
    except ImportError:
        return
    tf.random.set_seed(seed)

# Seed the random number generators once, using the notebook-wide SEED.
seed_everything(SEED)

In [None]:
# Load the competition tables; the first CSV column is used as the row index.
df_train = pd.read_csv(r"../input/tabular-playground-series-oct-2021/train.csv", index_col=0)
df_test = pd.read_csv(r"../input/tabular-playground-series-oct-2021/test.csv", index_col=0)

# Every column whose name contains the letter 'f' is treated as a feature.
features = [col for col in df_train.columns if 'f' in col]

# Partition the features by their stored dtype in the training table:
# float64 columns are "continuous", everything else is "discrete".
cont_features, disc_features = [], []
for col in features:
    bucket = cont_features if df_train[col].dtype == 'float64' else disc_features
    bucket.append(col)

# Downcast both tables (train first, then test): continuous -> float32,
# discrete -> uint8. Presumably done to reduce memory usage.
for frame in (df_train, df_test):
    frame[cont_features] = frame[cont_features].astype('float32')
    frame[disc_features] = frame[disc_features].astype('uint8')

# Separate the label (kept as a one-column DataFrame) from the predictors.
y = df_train[[TARGET]]
X = df_train.drop(columns=TARGET)
X_test = df_test

# Remember the row indexes; later cells use them to index loaded predictions.
X_index, X_test_index = X.index, X_test.index

del df_train, df_test
gc.collect()

LightGBM base model: the out-of-fold and test predictions loaded below come from BIZEN's notebook,
https://www.kaggle.com/hiro5299834/tps-oct-2021-single-lightgbm?kernelSessionId=76163449

In [None]:
# Level-1 features for the stacker. Every base model contributes ONE column:
# its out-of-fold predictions go into X (train) and its test predictions go
# into X_test. The column carries the same name in both frames, because
# scikit-learn estimators fitted on a DataFrame validate feature names at
# predict time (warning in 1.0/1.1, error from 1.2 on mismatch).
xgb1_oof = pd.read_parquet(r"../input/tps-10-21-single-xgboost-model/xgb_oof.parquet").rename(columns={TARGET: "xgb1"})
xgb1_pred = pd.read_csv(r"../input/tps-10-21-single-xgboost-model/xgb_submission.csv", index_col=0).rename(columns={TARGET: "xgb1"})

cat1_oof = pd.read_parquet(r"../input/tps-10-21-catboost/cat_oof.parquet").rename(columns={TARGET: "cat1"})
cat1_pred = pd.read_csv(r"../input/tps-10-21-catboost/cat_submission.csv", index_col=0).rename(columns={TARGET: "cat1"})

# The LightGBM predictions are stored as bare arrays, so they are wrapped in
# frames indexed like the train / test tables loaded earlier.
lgbm1_oof = pd.DataFrame(np.load(r"../input/tps-oct-2021-single-lightgbm/lgb_oof.npy"), columns=["lgbm1"], index=X_index)
lgbm1_pred = pd.DataFrame(np.load(r"../input/tps-oct-2021-single-lightgbm/lgb_pred.npy"), columns=["lgbm1"], index=X_test_index)

off_ls = [xgb1_oof, cat1_oof, lgbm1_oof]
pred_ls = [xgb1_pred, cat1_pred, lgbm1_pred]

# Column-wise concat aligns on the index; from here on X / X_test hold the
# stacking features instead of the raw competition features.
X = pd.concat(off_ls, axis=1)
X_test = pd.concat(pred_ls, axis=1)

# Sanity checks: identical feature layout in train and test, and no rows
# added by index misalignment between the loaded frames.
assert list(X.columns) == list(X_test.columns), "train/test stacking features differ"
assert len(X) == len(X_index), "OOF frames are not aligned with the training index"
assert len(X_test) == len(X_test_index), "prediction frames are not aligned with the test index"

gc.collect()

In [None]:
def run_kfold(model, n_splits=5, test_data=None):
    """Cross-validate ``model`` on the notebook-level ``X`` / ``y`` frames.

    A shuffled ``KFold`` (seeded with the notebook-level ``SEED``) is used.
    For every fold the model is fitted on the training part, scored with
    ROC AUC on the held-out part and, when ``test_data`` is given, used to
    predict the test set.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        ``predict_proba`` is taken as the positive-class probability.
    n_splits : int, default 5
        Number of folds.
    test_data : DataFrame or array-like, optional
        Data to predict with each fold's model. When ``None`` no test
        predictions are made.

    Returns
    -------
    scores : ndarray of shape (n_splits, 1)
        ROC AUC of each fold.
    y_preds : ndarray of shape (len(test_data), n_splits)
        Per-fold test predictions, one column per fold. Has zero rows when
        ``test_data`` is ``None``.
    y_oof : ndarray of shape (len(X), 1)
        Out-of-fold predictions for every training row.
    """
    kf = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    scores = np.empty((n_splits, 1))

    # The default ``test_data=None`` used to crash on ``len(None)``; size the
    # prediction buffer with zero rows in that case instead.
    n_test_rows = 0 if test_data is None else len(test_data)
    y_preds = np.empty((n_test_rows, n_splits))
    y_oof = np.empty((len(X), 1))

    for i_fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(5*"=" + f" Fold {i_fold} " + 5*"=")
        X_train = X.iloc[train_idx, :]
        # Flatten the one-column target frame to the 1-D array fit() expects.
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        model.fit(X_train, y_train)

        # Positive-class probability as a column vector, written into the
        # rows of the OOF buffer that belong to this fold.
        fold_oof = model.predict_proba(X_val)[:, 1].reshape((-1, 1))
        y_oof[val_idx, :] = fold_oof

        fold_score = roc_auc_score(y_val, fold_oof)
        scores[i_fold, 0] = fold_score
        print(f"*** Fold {i_fold} score :", fold_score, " ***")

        if test_data is not None:
            y_preds[:, i_fold] = model.predict_proba(test_data)[:, 1]

    print('CV auc scores: ', scores.mean(), " +/- ", scores.std())
    return scores, y_preds, y_oof

In [None]:
# Meta-learner of the stack: a LogisticRegression with default settings and a fixed random_state.
model = LogisticRegression(random_state=SEED)

In [None]:
# Cross-validate the meta-learner over N_SPLITS folds, predicting X_test with each fold's model.
scores, y_preds, y_oof = run_kfold(model, n_splits= N_SPLITS, test_data=X_test)

In [None]:
# Display the per-fold test predictions (one column per fold).
y_preds

In [None]:
# Persist the stacker's out-of-fold predictions, indexed like the training rows.
oof_frame = pd.DataFrame(y_oof, index=X.index, columns=[TARGET])
oof_frame.to_parquet("stack_oof.parquet")

# The submission is the row-wise mean of the per-fold test predictions.
fold_average = y_preds.mean(axis=1)
submission_frame = pd.DataFrame(fold_average, index=X_test.index, columns=[TARGET])
submission_frame.to_csv("stack_submission.csv")