As noted in this [discussion](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327926), the data behind the Private LB and the Public LB appear to be split chronologically.  
We believe that analyzing this split is important for avoiding a shake-down.  
In this notebook, we try Adversarial Validation as one way to do so.  
The results describe differences between the Private and Public LB data; they do not by themselves mean that a shake-down will occur.

In [None]:
import cudf
import cupy
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoost
from catboost import Pool
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the competition test set (integer-dtype parquet export) into a cuDF DataFrame.
test = cudf.read_parquet(
    '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
)

In [None]:
# Keep exactly one row per customer -- the last one that appears for each
# customer_ID -- and renumber the rows 0..n-1.
# NOTE(review): "last" is only the most recent record if the file is ordered
# by S_2 within each customer -- confirm against the data source.
test = (
    test
    .drop_duplicates(subset=["customer_ID"], keep="last")
    .reset_index(drop=True)
)

# Parse S_2 as a datetime and store its calendar month as a compact int8 column.
test['S_2'] = cudf.to_datetime(test['S_2'])
test['month'] = test['S_2'].dt.month.astype('int8')

In [None]:
# Fraction of the remaining (one-per-customer) rows that falls in each month.
test['month'].value_counts(normalize=True)

In [None]:
# Adversarial-validation target: 1 for rows whose month is 4, 0 for all others.
# NOTE(review): treating month 4 as the Private-LB period follows the linked
# discussion -- confirm before relying on it.
test['private'] = 0
test.loc[test['month'].isin([4]), 'private'] = 1

In [None]:
# CatBoost settings for the public-vs-private classifier: log-loss objective,
# AUC as the evaluation metric, trained on GPU with a fixed seed.
cat_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=0.08,
    num_boost_round=5000,
    early_stopping_rounds=100,
    random_state=127,
    task_type='GPU',
)

In [None]:
# Give every row a validation-fold id in {0, 1, 2} using an unshuffled 3-fold split.
# The ids are written by position, which matches the row labels because the
# index was reset to 0..n-1 when the frame was de-duplicated.
kf = KFold(n_splits=3)
fold_ids = np.full(len(test), 999)  # 999 marks "not assigned yet"
for fold, (_, idx_va) in enumerate(kf.split(test)):
    fold_ids[idx_va] = fold
test['fold'] = fold_ids

In [None]:
# Label column for adversarial validation.
TARGET = 'private'

# Columns excluded from the model inputs: the date and the month derived from
# it (the label is built from month), the customer id, the fold id, and the
# label itself.
drop_cols = ['S_2', 'month', 'customer_ID', 'fold', TARGET]

# Every remaining column, in the frame's original column order, is a feature.
use_cols = list(filter(lambda name: name not in drop_cols, test.columns))

In [None]:
# Adversarial validation: for each of the 3 folds, train a CatBoost classifier
# to tell 'private' rows from the rest, score it on the held-out fold, and
# record the out-of-fold predictions and per-fold feature importances.
oof = cupy.zeros(len(test))
fi_df = pd.DataFrame()
for fold in range(3):  # must match the number of folds written to test['fold']
    # Build each mask once and filter only the columns that are needed,
    # instead of filtering the whole frame four times per fold.
    train_mask = test['fold'] != fold
    valid_mask = test['fold'] == fold
    train_x, train_y = test[use_cols][train_mask], test[TARGET][train_mask]
    valid_x, valid_y = test[use_cols][valid_mask], test[TARGET][valid_mask]
    valid_labels = valid_y.to_array()

    # CatBoost consumes host data, so the cuDF frames are copied to pandas/NumPy.
    trn_data = Pool(train_x.to_pandas(), label=train_y.to_array())
    val_data = Pool(valid_x.to_pandas(), label=valid_labels)

    model = CatBoost(cat_params)
    model.fit(
        trn_data,
        eval_set=val_data,
        verbose_eval=500,
        use_best_model=True,
    )

    # Predictions are used only for ranking (AUC), so the default output of
    # predict() is sufficient.
    pred = model.predict(val_data)
    auc_score = roc_auc_score(valid_labels, pred)
    # Report the score: it was previously computed and silently discarded.
    print(f'fold {fold} AUC: {auc_score:.5f}')
    oof[valid_x.index] = pred

    # Reuse the training pool rather than building an identical second one.
    fi_df[f'fold_{fold}'] = model.get_feature_importance(trn_data)

# Overall score from the pooled out-of-fold predictions.
oof_auc = roc_auc_score(test[TARGET].to_array(), cupy.asnumpy(oof))
print(f'OOF AUC: {oof_auc:.5f}')

In [None]:
# Average the per-fold importances and plot the 30 most important features.
# Only the fold_* columns are averaged, so re-running this cell (when
# 'importance' and 'features' already exist in fi_df) gives the same result
# instead of averaging over the previously added columns.
fold_cols = [c for c in fi_df.columns if str(c).startswith('fold_')]
fi_df['importance'] = fi_df[fold_cols].mean(axis=1)
fi_df['features'] = use_cols  # rows of fi_df follow the order of use_cols

top_features = fi_df.sort_values(by="importance", ascending=False).head(30)
plt.figure(figsize=(10, 10))
sns.barplot(x="importance", y="features", data=top_features)
plt.title('CatBoost Features')
plt.tight_layout()

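The AUC score is above 0.99, i.e. the model can separate the private rows from the public rows almost perfectly, and the feature importance points to 'B_29' as a feature that differs significantly between the two.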

In [None]:
# Convert the cuDF frame to pandas for plotting with matplotlib.
test = test.to_pandas()

# Histogram of B_29 for public (private == 0) and private (private == 1) rows,
# restricted to values below 0.02.
b29_small = test.loc[test['B_29'] < 0.02, ['private', 'B_29']]
for flag, label in ((0, 'public'), (1, 'private')):
    plt.hist(b29_small.loc[b29_small['private'] == flag, 'B_29'], label=label)
plt.legend()
plt.xlim(0, 0.03)
plt.show()

In [None]:
# Number of missing B_29 values among public (0) and private (1) rows.
for prefix, flag in (('public B_29:', 0), ('private B_29:', 1)):
    print(prefix, test.loc[test['private'] == flag, 'B_29'].isnull().sum())

The results above show that 'B_29' behaves differently in the public and private data and needs to be analyzed in depth.