## References
- https://www.kaggle.com/dlaststark/tps-sep-single-xgboost-model
- https://www.kaggle.com/maximkazantsev/tps-09-21-eda-xgboost-with-folds
- https://www.kaggle.com/akihironomura/tps-lightgbm-optuna-kfold

## Import Modules

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from warnings import filterwarnings
import os

# Suppress library warnings so the notebook output stays readable.
filterwarnings('ignore')

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the training set, the test set and the sample submission (in that
# order) from the competition's input directory.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

## Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Print a concise summary of the training frame (columns, dtypes, memory).
train.info()

In [None]:
# Descriptive statistics for the columns of the training data.
train.describe()

In [None]:
# Count the missing values in each training column.
train.isnull().sum()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Count the missing values in each test column.
test.isnull().sum()

## Feature Engineering

In [None]:
# Feature columns are the training columns whose label begins with "f".
features = [name for name in train.columns.values if name[0] == "f"]

In [None]:
# Add row-wise summary statistics of the original feature columns as extra
# model inputs. Train and test run through the same statements so that a
# column can never be computed differently for the two frames (e.g. a 'max'
# column that accidentally holds the row minimum).
for frame in (train, test):
    # Selecting with a list of labels returns a new DataFrame, so the columns
    # added below never leak into the statistics computed after them.
    raw = frame[features]
    frame['n_missing'] = raw.isna().sum(axis=1)  # missing values per row
    frame['abs_sum'] = raw.abs().sum(axis=1)     # sum of absolute values
    frame['sem'] = raw.sem(axis=1)               # standard error of the mean
    frame['std'] = raw.std(axis=1)
    frame['avg'] = raw.mean(axis=1)
    frame['max'] = raw.max(axis=1)
    frame['min'] = raw.min(axis=1)

In [None]:
# Fill missing feature values with the per-column median. The medians are
# learned from the training data only and then reused for the test data.
# SimpleImputer treats every column independently, so a single fit over all
# feature columns yields the same values as refitting column by column.
imputer = SimpleImputer(strategy="median")
train[features] = imputer.fit_transform(train[features])
test[features] = imputer.transform(test[features])

In [None]:
# Scale the feature columns with RobustScaler, fitted on the training data
# and then applied unchanged to the test data. The scaler works column by
# column, so one fit over all feature columns matches a per-column loop.
# StandardScaler and MinMaxScaler are imported as well and can be swapped in
# here if a different scaling is wanted.
r_scaler = RobustScaler()
train[features] = r_scaler.fit_transform(train[features])
test[features] = r_scaler.transform(test[features])

In [None]:
# Model inputs: every training column except the row id and the target.
X = train.drop(columns=['id', 'claim'])
# Target vector: the `claim` column.
y = train['claim']

## XGBClassifier

In [None]:
# Hyper-parameters passed to XGBClassifier as keyword arguments.
xgb_params = dict(
    # learning task and evaluation metric
    objective='binary:logistic',
    eval_metric='auc',
    # ensemble size and tree shape
    n_estimators=2000,
    max_depth=4,
    gamma=0.2465,
    # row / column subsampling
    subsample=0.6423,
    colsample_bytree=0.775,
    colsample_bylevel=0.868,
    # regularisation
    min_child_weight=366,
    reg_lambda=0.05,
    reg_alpha=10,
    # logging and reproducibility
    verbosity=0,
    random_state=42,
)

In [None]:
# Build the classifier from the hyper-parameters defined in `xgb_params`.
model = XGBClassifier(**xgb_params)
# Bare expression: shows the configured estimator as the cell output.
model

## Model Training

In [None]:
# Test inputs: every test column except the row id.
X_test = test.drop(columns=['id'])

In [None]:
# Stratified K-fold cross-validation. Every fold refits `model`, scores the
# held-out rows, and contributes 1/splits of the averaged test prediction,
# feature importance and validation AUC.
splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

oof_preds = np.zeros((X.shape[0],))  # out-of-fold probability per training row
preds = 0                            # fold-averaged test probabilities
model_fi = 0                         # fold-averaged feature importances
total_mean_auc = 0                   # mean of the per-fold validation AUCs

for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # split() yields positional indices, so rows are selected by position
    # (.iloc); label-based .loc is only correct for a default RangeIndex.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # NOTE(review): whether fit() accepts eval_metric / early_stopping_rounds
    # depends on the installed xgboost version — confirm against the
    # environment before upgrading.
    model.fit(X_train, y_train,
              verbose=False,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              early_stopping_rounds=300,
              )

    preds += model.predict_proba(X_test)[:, 1] / splits
    model_fi += model.feature_importances_ / splits

    # valid_idx is positional, which is what the NumPy array expects.
    oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]

    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    print(f"Fold {num} ROC AUC: {fold_auc}")

    total_mean_auc += fold_auc / splits

print(f"\nOverall ROC AUC: {total_mean_auc}")

## Feature Importance

In [None]:
# Rank the features by importance averaged over all cross-validation folds
# (`model_fi`). `model.feature_importances_` on its own only reflects the
# model fitted on the last fold.
importance = pd.DataFrame(model_fi, index=X.columns, columns=['importance'])
importance = importance.sort_values('importance', ascending=False)
# Bare expression: shows the sorted table as the cell output.
importance

## Make Submission

In [None]:
# Store the fold-averaged test probabilities in the `claim` column. Item
# assignment is used because attribute-style assignment only updates a column
# that already exists; otherwise it sets a plain Python attribute instead.
submission['claim'] = preds
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission to disk; index=False leaves the row index out of the file.
submission.to_csv('submission.csv', index=False)

## If you like this kernel, please upvote :)