# Adversarial Validation
In this notebook, adversarial validation is performed twice, once on train vs. public test data and once on train vs. private test data, and the results are compared.  
The correlation between CV and LB is stable, so there is no need to worry too much. However, the public and private test sets are split by time, so care is needed to avoid a shake-down on the private leaderboard.  

# Load Libraries

In [None]:
import cudf
import cupy
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from catboost import Pool
from catboost import CatBoost
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gc
import warnings
warnings.simplefilter('ignore')

# Load Data

In [None]:
def _read_last_statements(parquet_path):
    """Load a parquet file and keep one row per customer.

    The last row per ``customer_ID`` (in file order) is retained, ``S_2`` is
    parsed to datetime and its month is stored in an int8 ``month`` column.
    NOTE(review): this presumably relies on rows being ordered by statement
    date within each customer so that "last" means "most recent" -- confirm.
    """
    frame = cudf.read_parquet(parquet_path)
    frame = frame.drop_duplicates(subset=["customer_ID"], keep="last")
    frame['S_2'] = cudf.to_datetime(frame['S_2'])
    statement_month = frame['S_2'].dt.month
    frame['month'] = statement_month.astype('int8')
    return frame


# Same load order as before: test first, then train.
test = _read_last_statements('../input/amex-data-integer-dtypes-parquet-format/test.parquet')

train = _read_last_statements('../input/amex-data-integer-dtypes-parquet-format/train.parquet')

In [None]:
# Adversarial label: 1 marks a row that comes from train, 0 a row from test.
train['target'] = 1
test['target'] = 0

# Split the test set by the month of each customer's retained statement.
# NOTE(review): month 4 is treated as the private split and month 10 as the
# public split -- confirm this mapping against the competition's actual split.
test_private = test[test['month'] == 4].reset_index(drop=True)
test_public = test[test['month'] == 10].reset_index(drop=True)

# The full test frame is no longer needed once both splits exist.
del test
gc.collect()

# Train vs Public
In this case, CatBoost (GPU) is used.  
For comparison, you could also use feature importances from another model such as LightGBM.  
Also, to save time, the loop here stops once the AUC falls below 0.75, but we recommend continuing until the AUC is below 0.6.

In [None]:
# Stack train rows (target=1) on top of the public test rows (target=0)
# and renumber the index from zero.
frames = [train, test_public]
data = cudf.concat(frames).reset_index(drop=True)

In [None]:
# Name of the adversarial label column.
TARGET = 'target'
# Columns that must not be used as model features: the statement date, the
# month derived from it, the customer identifier and the label itself.
drop_cols = ['S_2', 'month', 'customer_ID', TARGET]
excluded = set(drop_cols)
# Every remaining column, in the frame's original column order.
use_cols = [name for name in data.columns if name not in excluded]

In [None]:
# CatBoost settings for the train-vs-test classifier: log-loss objective,
# AUC as the evaluation metric, early stopping after 50 rounds, GPU training.
cat_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=0.1,
    num_boost_round=2500,
    early_stopping_rounds=50,
    random_state=127,
    task_type='GPU',
)

In [None]:
# Iterative adversarial feature elimination (train vs. public test).
# Each round fits a classifier that tries to tell train rows from test rows;
# while it still succeeds (AUC >= 0.75), the five most important features are
# removed and the classifier is refit on the remaining ones.
drop_feats = []   # all features removed so far, in removal order
fi_df_all = []    # one sorted feature-importance frame per round

# The device-to-host conversion does not depend on the round, so do it once
# instead of on every iteration; each round only selects the live columns.
features_pd = data[use_cols].to_pandas()
labels_pd = data[TARGET].to_pandas()

# Stop if every feature has been eliminated rather than fitting on nothing.
while use_cols:
    train_x, valid_x, train_y, valid_y = train_test_split(
        features_pd[use_cols], labels_pd, test_size=0.33, random_state=42)

    trn_data = Pool(train_x, label=train_y)
    val_data = Pool(valid_x, label=valid_y)

    model = CatBoost(cat_params)
    model.fit(trn_data,
              eval_set=val_data,
              verbose_eval=500,
              use_best_model=True)
    # AUC only depends on the ranking of the scores, so the model's default
    # prediction output is used directly.
    pred = model.predict(val_data)
    auc_score = roc_auc_score(valid_y, pred)
    print(f'AUC Score : {auc_score}')

    # time savings
    if auc_score < 0.75:
        break

    fi_df = pd.DataFrame()
    # Reuse the training pool instead of building an identical second one.
    fi_df['importance'] = model.get_feature_importance(trn_data)
    fi_df['features'] = use_cols
    fi_df = fi_df.sort_values(by="importance", ascending=False)
    fi_df_all.append(fi_df)

    # Remove the five most important features before the next round.
    drop_feats += fi_df['features'].to_list()[:5]
    use_cols = [c for c in use_cols if c not in drop_feats]
    print(f'Drop Features : {drop_feats}')

# Check Features

In [None]:
# Overlay the train and public-test distributions of every dropped feature,
# one subplot per feature, five subplots per row.
n_cols = 5
# Ceiling division so the grid always has room for every dropped feature;
# a fixed 5x5 grid raises as soon as more than 25 features are dropped.
n_rows = max(1, -(-len(drop_feats) // n_cols))

plt.figure(figsize=(20,15))
for i,feat in enumerate(drop_feats):
    plt.subplot(n_rows,n_cols,i+1)
    sns.distplot(train[feat].to_pandas(),label='Train')
    sns.distplot(test_public[feat].to_pandas(),label='public')
    plt.legend()

## Null Ratio

In [None]:
# For every dropped feature, print the fraction of missing values in train
# and in the public test split.
n_train_rows = len(train)
n_public_rows = len(test_public)
for feat in drop_feats:
    print(f'========================= {feat} =========================')
    train_nan_ratio = train[feat].isnull().sum() / n_train_rows
    print('Train Nan Ratio:', train_nan_ratio)
    public_nan_ratio = test_public[feat].isnull().sum() / n_public_rows
    print('Public Nan Ratio:', public_nan_ratio)

# Train vs Private

In [None]:
# Stack train rows (target=1) on top of the private test rows (target=0)
# and renumber the index from zero.
frames = [train, test_private]
data = cudf.concat(frames).reset_index(drop=True)

In [None]:
# Name of the adversarial label column.
TARGET = 'target'
# Columns that must not be used as model features: the statement date, the
# month derived from it, the customer identifier and the label itself.
drop_cols = ['S_2', 'month', 'customer_ID', TARGET]
excluded = set(drop_cols)
# Rebuild the full feature list from the frame's columns, in original order.
use_cols = [name for name in data.columns if name not in excluded]

In [None]:
# CatBoost settings for the train-vs-test classifier: log-loss objective,
# AUC as the evaluation metric, early stopping after 50 rounds, GPU training.
cat_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=0.1,
    num_boost_round=2500,
    early_stopping_rounds=50,
    random_state=127,
    task_type='GPU',
)

In [None]:
# Iterative adversarial feature elimination (train vs. private test).
# Each round fits a classifier that tries to tell train rows from test rows;
# while it still succeeds (AUC >= 0.75), the five most important features are
# removed and the classifier is refit on the remaining ones.
drop_feats = []   # all features removed so far, in removal order
fi_df_all = []    # one sorted feature-importance frame per round

# The device-to-host conversion does not depend on the round, so do it once
# instead of on every iteration; each round only selects the live columns.
features_pd = data[use_cols].to_pandas()
labels_pd = data[TARGET].to_pandas()

# Stop if every feature has been eliminated rather than fitting on nothing.
while use_cols:
    train_x, valid_x, train_y, valid_y = train_test_split(
        features_pd[use_cols], labels_pd, test_size=0.33, random_state=42)

    trn_data = Pool(train_x, label=train_y)
    val_data = Pool(valid_x, label=valid_y)

    model = CatBoost(cat_params)
    model.fit(trn_data,
              eval_set=val_data,
              verbose_eval=500,
              use_best_model=True)
    # AUC only depends on the ranking of the scores, so the model's default
    # prediction output is used directly.
    pred = model.predict(val_data)
    auc_score = roc_auc_score(valid_y, pred)
    print(f'AUC Score : {auc_score}')

    # time savings
    if auc_score < 0.75:
        break

    fi_df = pd.DataFrame()
    # Reuse the training pool instead of building an identical second one.
    fi_df['importance'] = model.get_feature_importance(trn_data)
    fi_df['features'] = use_cols
    fi_df = fi_df.sort_values(by="importance", ascending=False)
    fi_df_all.append(fi_df)

    # Remove the five most important features before the next round.
    drop_feats += fi_df['features'].to_list()[:5]
    use_cols = [c for c in use_cols if c not in drop_feats]
    print(f'Drop Features : {drop_feats}')

# Check Features

In [None]:
# Overlay the train and private-test distributions of every dropped feature,
# one subplot per feature, five subplots per row.
n_cols = 5
# Ceiling division so the grid always has room for every dropped feature;
# a fixed 3x5 grid raises as soon as more than 15 features are dropped.
n_rows = max(1, -(-len(drop_feats) // n_cols))

plt.figure(figsize=(20,15))
for i,feat in enumerate(drop_feats):
    plt.subplot(n_rows,n_cols,i+1)
    sns.distplot(train[feat].to_pandas(),label='Train')
    sns.distplot(test_private[feat].to_pandas(),label='private')
    plt.legend()

## Null Ratio

In [None]:
# For every dropped feature, print the fraction of missing values in train
# and in the private test split.
for feat in drop_feats:
    print(f'========================= {feat} =========================')
    print('Train Nan Ratio:',train[feat].isnull().sum()/len(train))
    # The value comes from test_private, so label it as private (it was
    # previously mislabelled as "Public").
    print('Private Nan Ratio:',test_private[feat].isnull().sum()/len(test_private))

# Public vs Private

In [None]:
# Hard-coded drop lists for the two comparisons.
# NOTE(review): presumably copied from the output of the two elimination runs
# in this notebook -- refresh them if those runs are repeated.
public_drop_feats = [
    'B_29', 'R_1', 'D_59', 'S_11', 'S_15',
    'S_9', 'S_24', 'D_121', 'S_27', 'S_22',
    'D_45', 'R_27', 'D_62', 'S_13', 'D_91',
    'D_39', 'D_42', 'D_77', 'B_8', 'D_142',
    'P_4', 'B_17', 'P_3', 'D_120', 'S_17',
]
private_drop_feats = [
    'R_1', 'D_59', 'S_11', 'S_9', 'S_27',
    'D_121', 'R_27', 'S_15', 'S_22', 'S_24',
    'D_39', 'D_62', 'D_45', 'B_17', 'D_60',
]

# Features that appear only in the private list, kept in private-list order.
public_lookup = set(public_drop_feats)
only_private_drop_feats = [name for name in private_drop_feats if name not in public_lookup]
only_private_drop_feats