In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
import pickle
from pathlib import Path
import gc

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the label column in the training data.
target = 'target'

# Flip to True for a quick smoke run (1 tree, 2 folds, near-immediate early stop).
DEBUG = False

# Both modes use the same seeds: SEED for the RNGs/models, CVSEED for the fold split.
SEED = 2017
CVSEED = 2017

if DEBUG:
    N_SPLITS, N_ESTIMATORS = 2, 1
    EARLY_STOPPING_ROUNDS, VERBOSE = 1, 100
else:
    N_SPLITS, N_ESTIMATORS = 5, 20000
    EARLY_STOPPING_ROUNDS, VERBOSE = 300, 1000

In [None]:
def set_seed(seed=2017):
    """Seed Python's `random`, NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator (default 2017).

    Note:
        PYTHONHASHSEED is read by the interpreter at start-up, so writing it
        here can only influence child processes, not hashing in this kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, up front, before any data is loaded or split.
set_seed(SEED)

# Datasets

In [None]:
# Competition data directory (relative to the notebook's working directory).
INPUT = Path("../input/tabular-playground-series-oct-2021")

# Read the three CSVs in a fixed order: train, test, then the submission template.
train, test, submission = (
    pd.read_csv(INPUT / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Discard the leading column of each frame (presumably a row id — confirm).
# The drop is positional, so re-running this cell removes one more column each time.
train = train.drop(columns=train.columns[0])
test = test.drop(columns=test.columns[0])

# Preprocessing

In [None]:
# Keep every training column whose name contains the letter 'f'
# (a substring test, not a prefix test; 'target' has no 'f' and is excluded).
features = list(filter(lambda name: 'f' in name, train.columns))

In [None]:
# Columns stored as float64 are treated as continuous; everything else as discrete.
cont_features = [name for name in features if train[name].dtype == 'float64']
disc_features = [name for name in features if train[name].dtype != 'float64']

# Canonical column order from here on: discrete block first, then continuous.
features = disc_features + cont_features

In [None]:
# Shrink memory: float64 -> float32 for continuous columns, uint8 for discrete ones.
# NOTE(review): uint8 assumes every discrete value fits in 0..255 — confirm.
for frame in (train, test):
    frame[cont_features] = frame[cont_features].astype('float32')
    frame[disc_features] = frame[disc_features].astype('uint8')

# The label exists only in the training frame.
train[target] = train[target].astype('uint8')

In [None]:
# Preview the test feature matrix (columns ordered discrete-first, then continuous).
test[features]

In [None]:
# Preview the label column after its cast to uint8.
train[target]

# LGB

In [None]:
# Keyword arguments for lgb.LGBMClassifier; 'random_state' is added per seed
# inside the training loop.
lgb_params = dict(
    # --- task / runtime setup ---
    objective='binary',
    n_estimators=N_ESTIMATORS,
    importance_type='gain',
    metric='auc',
    boosting_type='gbdt',
    n_jobs=-1,

    # --- learning rate and row/column subsampling ---
    learning_rate=0.0038511441056118664,
    subsample=0.5827550088149794,
    subsample_freq=1,
    colsample_bytree=0.19599597755538956,

    # --- regularisation ---
    reg_lambda=0.011685550612519125,
    reg_alpha=0.04502045156737212,
    min_child_weight=16.843316711276092,
    min_child_samples=412,

    # --- tree shape (num_leaves exceeds 2**max_depth = 32, so max_depth binds) ---
    num_leaves=546,
    max_depth=5,

    # --- categorical-split smoothing ---
    cat_smooth=36.40200359200525,
    cat_l2=12.979520035205597,
)

In [None]:
# Accumulators filled by the training loop:
#   lgb_oof         - out-of-fold probability for each training row
#   lgb_pred        - test probabilities, summed over folds then divided by N_SPLITS
#   lgb_importances - one row per (feature, fold, seed) with gain importance
lgb_oof = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])
lgb_importances = pd.DataFrame()
# Per-model importance frames are gathered here and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every call.
importance_frames = []

# Keep only the feature matrix of the test set and release the full frame.
X_test = test[features]
del test
gc.collect()


kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
seed_list = [SEED + 1]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train[features], y=train[target])):
    print(f"===== fold {fold} =====")
    # Only fold index 4 is trained in this run; every other fold is announced and
    # skipped. With DEBUG's N_SPLITS = 2 no fold matches, so nothing is trained.
    if fold != 4:
        continue

    X_train = train[features].iloc[trn_idx]
    y_train = train[target].iloc[trn_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = train[target].iloc[val_idx]

    start = time.time()
    for inseed in seed_list:
        lgb_params['random_state'] = inseed

        # Stage 1: fit with the base parameters until early stopping.
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are the
        # LightGBM 3.x API; LightGBM >= 4.0 expects callbacks instead — confirm the
        # pinned version before upgrading.
        pre_model = lgb.LGBMClassifier(**lgb_params)
        pre_model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            categorical_feature=disc_features,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
        )

        # Stage 2: continue from the stage-1 booster with a 10x smaller learning
        # rate and 10% weaker L1/L2 penalties.
        lgb_params2 = lgb_params.copy()
        lgb_params2['reg_lambda'] *= 0.9
        lgb_params2['reg_alpha'] *= 0.9
        lgb_params2['learning_rate'] *= 0.1
        model = lgb.LGBMClassifier(**lgb_params2)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            categorical_feature=disc_features,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
            init_model=pre_model,
        )

        # Persist the refined model so it can be reloaded without retraining.
        with open(f"lgb_model{fold}_seed{inseed}.pkl", 'wb') as f:
            pickle.dump(model, f)

        importance_frames.append(
            pd.DataFrame({
                'feature': X_train.columns,
                'importance': model.feature_importances_,
                'fold': fold,
                'seed': inseed,
            })
        )

        # Average over seeds: each seed contributes 1/len(seed_list).
        lgb_oof[val_idx] += model.predict_proba(X_valid)[:, -1] / len(seed_list)
        lgb_pred += model.predict_proba(X_test)[:, -1] / len(seed_list)

        del pre_model
        del model
        gc.collect()

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, lgb_oof[val_idx])
    print(f"fold {fold} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    del X_train
    del y_train
    del X_valid
    del y_valid
    gc.collect()


# Single concatenation; each frame keeps its own 0..n-1 index, exactly as
# repeated DataFrame.append calls would have produced.
if importance_frames:
    lgb_importances = pd.concat(importance_frames)

del X_test
gc.collect()

# Each fold contributes 1/N_SPLITS of the final test prediction.
# NOTE(review): only the selected fold ran, so this array is a partial sum —
# presumably meant to be added to the other folds' saved outputs; confirm.
lgb_pred /= N_SPLITS
# Rows from folds that were skipped still hold 0.0 in lgb_oof, so this overall
# AUC is only meaningful once every fold has been filled in.
print(f"oof lgb_auc = {roc_auc_score(train[target], lgb_oof)}")

np.save("lgb_oof.npy", lgb_oof)
np.save("lgb_pred.npy", lgb_pred)

# OOF predictions

In [None]:
# Identity line through the label values, with the OOF probabilities scattered
# against the true labels.
fig, ax = plt.subplots()
labels = train[target]
ax.plot(labels, labels)
ax.scatter(labels, lgb_oof)

In [None]:
# The training frame and OOF vector are not used by any later cell; free them.
del train, lgb_oof
gc.collect()

# Feature importances

In [None]:
# Mean importance per feature across all recorded (fold, seed) models.
mean_importance = lgb_importances.groupby('feature').mean()

# Bars are drawn from most to least important.
order = list(mean_importance.sort_values('importance', ascending=False).index)

fig = plt.figure(figsize=(16, 16), tight_layout=True)
sns.barplot(
    x="importance",
    y="feature",
    data=mean_importance.reset_index(),
    order=order,
)
plt.title("LGB feature importances")

# Submission

In [None]:
# Put the test predictions into the submission template's target column and write it out.
submission = submission.assign(**{target: lgb_pred})
submission.to_csv("submission.csv", index=False)

# Show the final table as the cell output.
submission

# Log

Seeds used, and the notebook version each was run in:

- 2017: ver1
- 2018: ver6 (folds 0-3), ver7 (fold 4)
- 2019: ver3
- 2020: ver4
- 2021: ver5