In [None]:
import cudf
import cupy
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoost
from catboost import Pool
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier


In [None]:
def create_adversarial_data(df_train, df_test, cols, N_val=70000, random_state=None):
    """Stack train and test rows and split them at random into two parts.

    The rows of ``df_train[cols]`` and ``df_test[cols]`` are concatenated under a
    fresh 0..n-1 index, ``N_val`` rows are sampled without replacement as the
    holdout part, and every remaining row forms the training part.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Frames to stack (pandas or cudf). Both must contain every name in ``cols``.
    cols : list
        Columns to keep from both frames.
    N_val : int, default 70000
        Number of rows drawn for the holdout part. Sampling is without
        replacement, so this must not exceed the total number of stacked rows.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed seed to make the split
        reproducible; ``None`` (the default) keeps the original unseeded behaviour.

    Returns
    -------
    (adversarial_train, adversarial_val)
        Two disjoint frames that together contain every stacked row.
    """
    frames = [df_train[cols], df_test[cols]]
    # DataFrame.append was removed in pandas 2.0; concat is the supported way to
    # stack frames. Pick the concat that matches the frame library in use.
    concat = pd.concat if isinstance(df_train, pd.DataFrame) else cudf.concat
    df_master = concat(frames, ignore_index=True)

    adversarial_val = df_master.sample(
        n=N_val, replace=False, random_state=random_state
    )
    # Everything that was not sampled into the holdout part becomes training data.
    adversarial_train = df_master[
        ~df_master.index.isin(adversarial_val.index)
    ]
    return adversarial_train, adversarial_val

This notebook was inspired by the following work: https://www.kaggle.com/code/zakopur0/adversarial-validation-private-vs-public/notebook

## This notebook is an extension of the notebook: https://www.kaggle.com/code/mikhaildonskoy/looking-for-risky-features-in-train-data

## In this work, I want to find out which features differ the most between the training and test datasets, since such features can lead to overfitting
## Unlike the previous notebook, I take the first row for each customer

# Data preparation

In [None]:
# Load both parquet files and keep only the first row found for each customer_ID.
train = (
    cudf.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
    .drop_duplicates(subset=["customer_ID"], keep="first")
)
test = (
    cudf.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')
    .drop_duplicates(subset=["customer_ID"], keep="first")
)

In [None]:
# Tag every row with its origin; this flag is the label the classifier predicts.
train["is_train"] = 1
test["is_train"] = 0
target = ["is_train"]

# Columns kept out of the feature set: S_2, customer_ID and the label itself.
drop_cols = ["S_2", "customer_ID", "is_train"]
use_cols = [name for name in train.columns if name not in drop_cols]

# Every column goes through the split; features are selected later via use_cols.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Starting Negative Feature Selection

In [None]:
# Convert the cudf frames to pandas and wrap features plus the is_train label
# into CatBoost pools: one for fitting, one for evaluation.
train_data = Pool(
    adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

In [None]:
# Shallow, short-running GPU model: at most 100 rounds of depth-2 trees,
# stopped early after 10 rounds without improvement, evaluated by AUC.
params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
# Progress is reported every 50 iterations on the holdout pool.
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

In [None]:
# Pair each feature name with its importance and order from most to least important.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten features that best separate train rows from test rows.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop R_1

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and R_1.
drop_cols = ["S_2", "customer_ID", "is_train", "R_1"]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop S_11

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (S_11 is new in this round).
drop_cols = ["S_2", "customer_ID", "is_train", "R_1", "S_11"]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop D_59

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (D_59 is new in this round).
drop_cols = ["S_2", "customer_ID", "is_train", "R_1", "S_11", "D_59"]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop D_121

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (D_121 is new in this round).
drop_cols = ["S_2", "customer_ID", "is_train", "R_1", "D_59", "S_11", "D_121"]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop S_27

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (S_27 is new in this round).
drop_cols = [
    "S_2", "customer_ID", "is_train",
    "R_1", "D_59", "S_11", "D_121", "S_27",
]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop D_118

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (D_118 is new in this round).
drop_cols = [
    "S_2", "customer_ID", "is_train",
    "R_1", "D_59", "S_11", "D_121", "S_27", "D_118",
]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop D_119

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (D_119 is new in this round).
drop_cols = [
    "S_2", "customer_ID", "is_train",
    "R_1", "D_59", "S_11", "D_121", "S_27", "D_118", "D_119",
]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop D_120

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (D_120 is new in this round).
drop_cols = [
    "S_2", "customer_ID", "is_train",
    "R_1", "D_59", "S_11", "D_121", "S_27", "D_118", "D_119", "D_120",
]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop P_4

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (P_4 is new in this round).
drop_cols = [
    "S_2", "customer_ID", "is_train",
    "R_1", "D_59", "S_11", "D_121", "S_27", "D_118", "D_119", "D_120",
    "P_4",
]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop D_39

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (D_39 is new in this round).
drop_cols = [
    "S_2", "customer_ID", "is_train",
    "R_1", "D_59", "S_11", "D_121", "S_27", "D_118", "D_119", "D_120",
    "P_4", "D_39",
]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop R_27

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (R_27 is new in this round).
drop_cols = [
    "S_2", "customer_ID", "is_train",
    "R_1", "D_59", "S_11", "D_121", "S_27", "D_118", "D_119", "D_120",
    "P_4", "D_39", "R_27",
]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Drop D_126

In [None]:
# Excluded from the classifier: S_2, customer_ID, the label, and the features
# flagged so far (D_126 is new in this round).
drop_cols = [
    "S_2", "customer_ID", "is_train",
    "R_1", "D_59", "S_11", "D_121", "S_27", "D_118", "D_119", "D_120",
    "P_4", "D_39", "R_27", "D_126",
]
use_cols = [name for name in train.columns if name not in drop_cols]

# Draw a new random train/holdout split of the stacked train+test rows.
adversarial_train, adversarial_test = create_adversarial_data(
    train, test, list(train.columns)
)

# Convert the cudf frames to pandas before handing them to CatBoost.
train_data = Pool(
    data=adversarial_train[use_cols].to_pandas(),
    label=adversarial_train[target].to_pandas(),
)
holdout_data = Pool(
    data=adversarial_test[use_cols].to_pandas(),
    label=adversarial_test[target].to_pandas(),
)

params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    od_type="Iter",
    num_boost_round=100,
    early_stopping_rounds=10,
    depth=2,
    task_type="GPU",
)

model = CatBoostClassifier(**params)
_ = model.fit(train_data, eval_set=holdout_data, verbose_eval=50)

# Importance table, ordered from most to least important feature.
feature_importance = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": adversarial_train[use_cols].columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

# Bar chart of the ten most important remaining features.
f, ax = plt.subplots(figsize=(10, 10))
sns.set_color_codes("pastel")
sns.barplot(
    x="feature_names",
    y="feature_importance",
    data=feature_importance.sort_values(by="feature_importance", ascending=False)[:10],
    color="b",
)

# Conclusion


### Risky features found using the last row for each customer: "R_1","D_59","S_11","B_29","S_9","S_15","D_121","S_24", "D_62","R_27","S_17","S_13", "S_18","D_45".

### Risky features found using the first row for each customer: "R_1","D_59","S_11","D_121","S_27","D_118","D_119","D_120","P_4","D_39","R_27","D_126".


#### Note that after the D_120 feature was removed, the ROC AUC began to decline more slowly. I decided to stop at about 0.67 for now, because I do not yet see the point in searching for more features. Next, I plan to repeat the same analysis on aggregated features.

### Vote for this notebook if its content was useful or interesting to you