## References
- https://www.kaggle.com/dwin183287/tps-september-2021-eda
- https://www.kaggle.com/realtimshady/lightgbm-2-0
- https://www.kaggle.com/mohammadkashifunique/hyperparameter-tuning-lgbm-optuna
- https://www.kaggle.com/maximkazantsev/tps-09-21-eda-lightgbm-with-folds

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import optuna

import warnings 
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the competition files: the training set, the test set and the sample
# submission (reused further down as the template for the final submission).
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

## Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Number of rows and columns in the training set.
print('Training data shape (rows, cols): ', train.shape)

In [None]:
# Summary of the training frame as reported by DataFrame.info().
train.info()

In [None]:
# Descriptive statistics for the training columns.
train.describe()

In [None]:
# Total number of missing cells across the whole training frame
# (first sum is per column, second sum adds the columns together).
print('Total null count: ', train.isnull().sum().sum())

In [None]:
# Missing-value count per column, columns with the most missing values first.
train.isnull().sum().sort_values(ascending = False)

In [None]:
# Target distribution (0 or 1): absolute counts on the left, proportions on
# the right.
dist_class = train['claim'].value_counts()
# value_counts() orders its result by frequency, not by class value, so the
# pie labels are derived from its index instead of assuming class 0 is first.
labels = dist_class.index.astype(str).tolist()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(x=dist_class.index, y=dist_class, ax=ax1).set_title("Target Count")

ax2.pie(dist_class,
        labels=labels,
        counterclock=False,
        startangle=90,
        autopct='%1.1f%%',
        pctdistance=0.7)
# Title the pie axes explicitly instead of relying on the "current axes".
ax2.set_title("Target Frequency Proportion")
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

## Feature Engineering (Data Pre-Processing?)

In [None]:
# Names of the raw feature columns: every training column whose name begins
# with the letter "f".
features = [col for col in train.columns.values if col[0] == "f"]

In [None]:
# Add row-wise summary statistics, computed over the raw feature columns only,
# to both the training and the test frame. Running the same code on both
# frames guarantees they receive identically defined columns (the test 'max'
# column was previously filled with the row minimum).
for frame in (train, test):
    # Selecting with a list returns a copy, so the columns added to `frame`
    # below never leak into the statistics that follow.
    raw = frame[features]

    frame['n_missing'] = raw.isna().sum(axis=1)  # missing values per row
    frame['abs_sum'] = raw.abs().sum(axis=1)     # sum of absolute values
    frame['sem'] = raw.sem(axis=1)               # standard error of the mean
    frame['std'] = raw.std(axis=1)               # standard deviation
    frame['avg'] = raw.mean(axis=1)              # mean
    frame['max'] = raw.max(axis=1)               # maximum
    frame['min'] = raw.min(axis=1)               # minimum

In [None]:
# Model inputs: every training column except the row id and the target.
X = train.drop(columns=['id', 'claim'])
# Binary target to predict.
y = train['claim']

## Hyperparameter Tuning using Optuna

Hyperparameter tuning is time-consuming, so the Optuna code below is commented out; the parameters from the best trial are hard-coded in `lgb_params` further down.

In [None]:
# def objective(trial,data=X,target=y):
    
#     X_train, X_valid, y_train, y_valid = train_test_split(data, target, train_size=0.8, test_size=0.2,random_state=0)
    
#     imputer = SimpleImputer(strategy='median')
#     X_train = imputer.fit_transform(X_train)
#     X_valid = imputer.transform(X_valid)
    
#     scaler = RobustScaler()
#     X_train = scaler.fit_transform(X_train)
#     X_valid = scaler.transform(X_valid)
    
#     params = {
#         'objective': 'binary',
#         'metric': 'auc', 
#         'boosting_type': 'gbdt',
#         'n_estimators': 1000,
#         'random_state': 42,
#         'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
#         'subsample': trial.suggest_loguniform('subsample', 0.4, 1.0),
#         'subsample_freq': trial.suggest_loguniform('subsample_freq', 0.4, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 1.0),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
#         'min_child_weight': trial.suggest_int('min_child_weight', 5, 256),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#     }
#     model = lgb.LGBMClassifier(**params) 
#     model.fit(X_train, y_train)
    
#     preds = model.predict(X_valid)
#     auc = roc_auc_score(y_valid, preds)
    
#     return auc

In [None]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=10)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# study.best_trial.params

In [None]:
# lgb_params = {
#     'objective': 'binary',
#     'metric': 'auc', 
#     'boosting_type': 'gbdt',
#     'n_estimators': 1000,
#     'random_state': 42,
#     'learning_rate': 0.02,
#     'subsample': 0.8751761372035946,
#     'subsample_freq': 0.43935171514346294,
#     'colsample_bytree': 0.675151285253419,
#     'reg_alpha': 5.536980361906913,
#     'reg_lambda': 0.3142777516202206,
#     'min_child_weight': 201,
#     'min_child_samples': 9,
#     'bagging_fraction': 0.9678154285091293,
#     'bagging_freq': 6
# }

# LightGBM settings used for the final model; the tuned values are based on
# study.best_trial.params.
# NOTE(review): 'subsample' / 'subsample_freq' appear to be LightGBM aliases
# of 'bagging_fraction' / 'bagging_freq', and 'subsample_freq' is not an
# integer here - confirm which pair actually takes effect before relying on it.
lgb_params = dict(
    # Fixed settings
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    n_estimators=1000,
    random_state=42,
    # Values taken from the tuning study
    learning_rate=0.02,
    subsample=0.46366427250815384,
    subsample_freq=0.9961802289581205,
    colsample_bytree=0.6157140152844784,
    reg_alpha=2.657030645814501,
    reg_lambda=0.015450253262708286,
    min_child_weight=137,
    min_child_samples=54,
    bagging_fraction=0.4169496814206163,
    bagging_freq=1,
)

## LGBMClassifier

In [None]:
# Build the classifier from the parameter dictionary defined above; the bare
# `model` expression displays the estimator as the cell output.
model = lgb.LGBMClassifier(**lgb_params)
model

## Model Training & Evaluation

In [None]:
# Test-set inputs: every test column except the row id.
X_test = test.drop(columns=['id'])

In [None]:
# Stratified K-fold training. For every fold: fit the imputer, the scaler and
# the model on the training split, score the held-out split, and accumulate
# the fold-averaged test predictions and feature importances.
splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

oof_preds = np.zeros((X.shape[0],))  # out-of-fold probability per training row
preds = 0            # test-set probabilities averaged over the folds
model_fi = 0         # feature importances averaged over the folds
total_mean_auc = 0   # mean of the per-fold validation AUCs

for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # split() yields positional indices, so rows are selected with .iloc;
    # .loc would only be correct while the index is the default 0..n-1 range.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Fit the preprocessing on the training split only, then apply it to the
    # validation split.
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(X_train)
    X_valid = imputer.transform(X_valid)

    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    model.fit(X_train, y_train,
              verbose=False,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              early_stopping_rounds=300,
              )

    # Transform the raw test set into a fold-local variable. Rebinding X_test
    # here would pass already imputed and scaled data through the next fold's
    # imputer and scaler, corrupting the test predictions of every later fold.
    X_test_fold = scaler.transform(imputer.transform(X_test))

    preds += model.predict_proba(X_test_fold)[:, 1] / splits
    model_fi += model.feature_importances_ / splits

    oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]

    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    print(f"Fold {num} ROC AUC: {fold_auc}")

    total_mean_auc += fold_auc / splits

print(f"\nOverall ROC AUC: {total_mean_auc}")

## Feature Importance

In [None]:
# Rank the features by importance. model_fi holds the importances averaged
# over all cross-validation folds; model.feature_importances_ would only
# reflect the model fitted in the last fold.
importance = pd.DataFrame(model_fi, index=X.columns, columns=['importance'])
importance = importance.sort_values('importance', ascending=False)
importance

## Make Submission

In [None]:
# Write the fold-averaged test probabilities into the submission template.
# Bracket assignment always targets a column, whereas attribute assignment
# silently creates a plain attribute if the column does not exist.
submission['claim'] = preds
submission.head()

In [None]:
# Save the submission file without writing the DataFrame index as a column.
submission.to_csv('submission.csv', index=False)

## If you like this kernel, please upvote:)