# Libraries
---

In [None]:
import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

from xgboost import XGBClassifier

import warnings
warnings.simplefilter('ignore')

# Parameters
---

In [None]:
N_SPLITS = 5  # number of folds used by the StratifiedKFold cross-validation
N_ESTIMATORS = 25001  # upper bound on boosting rounds (passed to XGBoost as n_estimators)
EARLY_STOPPING_ROUNDS = 3048 # patience handed to model.fit(): training stops once the validation metric has not improved for this many rounds
VERBOSE = 1000 # passed to model.fit(verbose=...): log the evaluation metric only every 1000 rounds to keep the output short
SEED = 42  # shared seed for seed_everything() and the fold shuffling

In [None]:
def seed_everything(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator (default 42).

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting
        it here only influences child processes, not the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed once, right after the configuration cell, so everything below runs with the same seed.
seed_everything(SEED)

# Datasets
---

In [None]:
# Competition data directory and the three CSV files read from it:
# the labelled training set, the unlabelled test set and the submission template.
INPUT = "../input/tabular-playground-series-sep-2021/"

train, test, submission = (
    pd.read_csv(INPUT + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_solution.csv")
)



In [None]:
TARGET = 'claim'

# Raw feature columns: every column of the test frame whose name contains an 'f'.
features = [name for name in test.columns if 'f' in name]

# Detached copy of the labels, so later changes to `train` cannot touch them.
target = train[TARGET].copy()

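Idea taken from https://www.kaggle.com/realtimshady/single-simple-lightgbm
Missing feature values can be replaced depending on the type of each feature's distribution (mean, median or mode). In this version that per-feature mapping is kept only as commented-out code; the active preprocessing imputes every feature with its mean.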

In [None]:
# Row-wise summary features, computed identically for train and test.
# Each statistic is taken over the raw feature columns only: `features` is
# extended after the loop, so the new columns never feed into one another.
# Note that quantile() with its default q=0.5 is the row median.
# Other candidates that were tried and are currently disabled:
# 'mean', 'median', 'std', 'max'.
ROW_STATS = ['min', 'sem', 'quantile']

for frame in (train, test):
    raw_values = frame[features]
    frame['n_missing'] = raw_values.isna().sum(axis=1)
    for stat_name in ROW_STATS:
        frame[stat_name] = getattr(raw_values, stat_name)(axis=1)

features.extend(['n_missing', *ROW_STATS])

# Keep the integer missing-value counts before any later scaling; they are
# used further down as the stratification labels for cross-validation.
n_missing = train['n_missing'].copy()

In [None]:
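# NOTE: disabled experiment, kept for reference only. It filled each raw feature
# with its mean / median / mode according to the mapping below; the notebook
# currently uses the SimpleImputer(strategy='mean') pipeline instead.
# fill_value_dict = {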
#     'f1': 'Mean', 
#     'f2': 'Median', 
#     'f3': 'Median', 
#     'f4': 'Median', 
#     'f5': 'Mode', 
#     'f6': 'Mean', 
#     'f7': 'Median', 
#     'f8': 'Median', 
#     'f9': 'Median', 
#     'f10': 'Median', 
#     'f11': 'Mean', 
#     'f12': 'Median', 
#     'f13': 'Mean', 
#     'f14': 'Median', 
#     'f15': 'Mean', 
#     'f16': 'Median', 
#     'f17': 'Median', 
#     'f18': 'Median', 
#     'f19': 'Median', 
#     'f20': 'Median', 
#     'f21': 'Median', 
#     'f22': 'Mean', 
#     'f23': 'Mode', 
#     'f24': 'Median', 
#     'f25': 'Median', 
#     'f26': 'Median', 
#     'f27': 'Median', 
#     'f28': 'Median', 
#     'f29': 'Mode', 
#     'f30': 'Median', 
#     'f31': 'Median', 
#     'f32': 'Median', 
#     'f33': 'Median', 
#     'f34': 'Mean', 
#     'f35': 'Median', 
#     'f36': 'Mean', 
#     'f37': 'Median', 
#     'f38': 'Median', 
#     'f39': 'Median', 
#     'f40': 'Mode', 
#     'f41': 'Median', 
#     'f42': 'Mode', 
#     'f43': 'Mean', 
#     'f44': 'Median', 
#     'f45': 'Median', 
#     'f46': 'Mean', 
#     'f47': 'Mode', 
#     'f48': 'Mean', 
#     'f49': 'Mode', 
#     'f50': 'Mode', 
#     'f51': 'Median', 
#     'f52': 'Median', 
#     'f53': 'Median', 
#     'f54': 'Mean', 
#     'f55': 'Mean', 
#     'f56': 'Mode', 
#     'f57': 'Mean', 
#     'f58': 'Median', 
#     'f59': 'Median', 
#     'f60': 'Median', 
#     'f61': 'Median', 
#     'f62': 'Median', 
#     'f63': 'Median', 
#     'f64': 'Median', 
#     'f65': 'Mode', 
#     'f66': 'Median', 
#     'f67': 'Median', 
#     'f68': 'Median', 
#     'f69': 'Mean', 
#     'f70': 'Mode', 
#     'f71': 'Median', 
#     'f72': 'Median', 
#     'f73': 'Median', 
#     'f74': 'Mode', 
#     'f75': 'Mode', 
#     'f76': 'Mean', 
#     'f77': 'Mode', 
#     'f78': 'Median', 
#     'f79': 'Mean', 
#     'f80': 'Median', 
#     'f81': 'Mode', 
#     'f82': 'Median', 
#     'f83': 'Mode', 
#     'f84': 'Median', 
#     'f85': 'Median', 
#     'f86': 'Median', 
#     'f87': 'Median', 
#     'f88': 'Median', 
#     'f89': 'Median', 
#     'f90': 'Mean', 
#     'f91': 'Mode', 
#     'f92': 'Median', 
#     'f93': 'Median', 
#     'f94': 'Median', 
#     'f95': 'Median', 
#     'f96': 'Median', 
#     'f97': 'Mean', 
#     'f98': 'Median', 
#     'f99': 'Median', 
#     'f100': 'Mode', 
#     'f101': 'Median', 
#     'f102': 'Median', 
#     'f103': 'Median', 
#     'f104': 'Median', 
#     'f105': 'Median', 
#     'f106': 'Median', 
#     'f107': 'Median', 
#     'f108': 'Median', 
#     'f109': 'Mode', 
#     'f110': 'Median', 
#     'f111': 'Median', 
#     'f112': 'Median', 
#     'f113': 'Mean', 
#     'f114': 'Median', 
#     'f115': 'Median', 
#     'f116': 'Mode', 
#     'f117': 'Median', 
#     'f118': 'Mean'
# }

# for col in tqdm(features):
#     if fill_value_dict.get(col)=='Mean':
#         fill_value = train[col].mean()
#     elif fill_value_dict.get(col)=='Median':
#         fill_value = train[col].median()
#     elif fill_value_dict.get(col)=='Mode':
#         fill_value = train[col].mode().iloc[0]
    
#     train[col].fillna(fill_value, inplace=True)
#     test[col].fillna(fill_value, inplace=True)

# train.dropna(inplace=True)
# test.dropna(inplace=True)

In [None]:
# Preprocessing: fill missing values with the column mean, then standardize.
# The pipeline is fitted on the training features only and the fitted
# transformation is re-used unchanged for the test features.
# (RobustScaler is an alternative scaler that was considered.)
preprocess_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
]
pipeline = Pipeline(steps=preprocess_steps)

train_scaled = pipeline.fit_transform(train[features])
train[features] = train_scaled

test_scaled = pipeline.transform(test[features])
test[features] = test_scaled

In [None]:
# Display the (rows, columns) of both frames after feature engineering and preprocessing.
train.shape, test.shape

#  XGBClassifier
---

In [None]:
# Keyword arguments for XGBClassifier, grouped by purpose.
xgb_params = dict(
    # learning task
    eval_metric='auc',
    objective='binary:logistic',
    # GPU training and prediction
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    # boosting schedule (early stopping is configured in the fit call)
    n_estimators=N_ESTIMATORS,
    learning_rate=0.01,
    # tree shape
    gamma=0.25,
    max_depth=4,
    min_child_weight=378,
    # row / column subsampling
    subsample=0.63,
    colsample_bytree=0.77,
    colsample_bylevel=0.87,
    # L2 / L1 regularization; 'lambda' is a Python keyword, hence the unpacking
    **{'lambda': 0.05},
    alpha=10,
)


In [None]:
# Cross-validated XGBoost training.
#   xgb_oof         - out-of-fold probability for every training row
#   xgb_pred        - test probabilities averaged over the N_SPLITS fold models
#   xgb_importances - one row per (fold, feature) with the model's importance
xgb_oof = np.zeros(train.shape[0])
xgb_pred = np.zeros(test.shape[0])
importance_frames = []  # per-fold tables, concatenated once after the loop

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
X_test = test[features]  # identical for every fold, so build it once

# Folds are stratified on the per-row missing-value count (`n_missing`, saved
# before scaling), not on the target itself.
for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=n_missing)):
    print(f">>> fold {fold} >>>")
    X_train = train[features].iloc[trn_idx]
    y_train = target.iloc[trn_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = target.iloc[val_idx]

    start = time.time()
    model = XGBClassifier(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=VERBOSE,
        # Stop once the validation AUC has not improved for this many rounds.
        early_stopping_rounds=EARLY_STOPPING_ROUNDS
    )

    # Record this fold's feature importances. The previous version assigned
    # only scalar columns to an empty DataFrame, which yields zero rows, so
    # nothing was ever stored.
    importance_frames.append(pd.DataFrame({
        'feature': features,
        'importance': model.feature_importances_,
        'fold': fold,
        'seed': SEED,
    }))

    # Last predict_proba column is the probability of the positive class.
    xgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    xgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, xgb_oof[val_idx])
    print(f"fold {fold} - xgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# Single concat instead of DataFrame.append inside the loop
# (append is deprecated and was removed in pandas 2.0).
xgb_importances = pd.concat(importance_frames, ignore_index=True)

print(f"oof xgb roc = {roc_auc_score(target, xgb_oof)}")

# Submission
---

In [None]:
# Replace the template's target column with the fold-averaged test predictions
# and write the submission file (without the index column).
submission[TARGET] = xgb_pred
submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Per-column count of missing values in the training frame; the feature columns
# were already imputed by the preprocessing pipeline earlier in the notebook.
train.isna().sum()

In [None]:
# Summary statistics of the training frame (feature columns are standardized at this point).
train.describe()