In [None]:
import os, gc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import matplotlib.pyplot as plt

# Hello kagglers
I want to share a finding that I made by performing adversarial validation between the test set and the training set.
*Refer to [this notebook](https://www.kaggle.com/code/carlmcbrideellis/what-is-adversarial-validation/notebook) if you want to understand what adversarial validation is and how to use it as a validation strategy.*

My guess is that the **test set is shifted in time** with respect to the training set, and that some features contain a seasonal component, such as D_59, S_11, D_121 and S_9.

EDIT: I found that https://www.kaggle.com/code/ambrosm/amex-eda-which-makes-sense comes to the same conclusion.

In [None]:
# Data taken from the processed feather files published at
# https://www.kaggle.com/datasets/ruchi798/parquet-files-amexdefault-prediction

train_data = pd.read_feather(path='../input/parquet-files-amexdefault-prediction/train_data.ftr')

test_data = pd.read_feather(path='../input/parquet-files-amexdefault-prediction/test_data.ftr')

In [None]:
# code taken from https://www.kaggle.com/code/lucasmorin/amex-lgbm-features-eng MANY THANKS!!

def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Args:
        y_true: frame holding a ``target`` column.
        y_pred: frame (or named Series) providing a ``prediction`` column;
            it is joined to ``y_true`` column-wise on the index.

    Returns:
        ``0.5 * (normalized_gini + top_four_percent_captured)``.

    Rows whose ``target`` equals 0 get weight 20, every other row weight 1.
    """

    def _ranked(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        """Join labels and scores, order by descending prediction, attach weights."""
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = np.where(ranked['target'] == 0, 20, 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        """Share of positives found before the cumulative weight exceeds 4% of the total."""
        ranked = _ranked(y_true, y_pred)
        weight_budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (is_positive & within_budget).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        """Weighted sum of the gap between the Lorentz curve and the random curve."""
        ranked = _ranked(y_true, y_pred)
        weight = ranked['weight']
        random_curve = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        lorentz_curve = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz_curve - random_curve) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        """Weighted Gini divided by the Gini obtained when the labels are the predictions."""
        perfect_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect_pred)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

# from ambrosm notebook
def lgb_amex_metric(y_true, y_pred):
    """Competition metric exposed through lightgbm's calling convention.

    Wraps the labels in a one-column ``target`` frame and the scores in a
    Series named ``prediction``, then returns ``('amex', score, True)``.
    """
    labels = pd.DataFrame({'target': y_true})
    scores = pd.Series(y_pred, name='prediction')
    return 'amex', amex_metric(labels, scores), True

#see : https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod_lgbm(y_pred: np.ndarray, data: lgb.Dataset):
    """NumPy version of the AMEX metric taking predictions and a labelled dataset.

    Args:
        y_pred: predicted scores.
        data: object whose ``get_label()`` returns the true labels.

    Returns:
        The tuple ``('AMEX', score, True)`` where ``score`` is the mean of the
        normalized weighted Gini and the top-4% capture rate (label 0 rows
        weigh 20, all others 1).
    """
    y_true = data.get_label()

    def _ordered(sort_col):
        """Return (label, prediction) pairs sorted descending by ``sort_col`` plus their weights."""
        pairs = np.transpose(np.array([y_true, y_pred]))
        pairs = pairs[pairs[:, sort_col].argsort()[::-1]]
        return pairs, np.where(pairs[:, 0] == 0, 20, 1)

    def _weighted_gini(sort_col):
        """Weighted Gini when rows are ranked by column ``sort_col``."""
        pairs, weight = _ordered(sort_col)
        random_curve = np.cumsum(weight / np.sum(weight))
        weighted_pos = pairs[:, 0] * weight
        lorentz_curve = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz_curve - random_curve) * weight)

    # Top-4% capture: positives found before the cumulative weight exceeds the budget.
    ranked, weights = _ordered(1)
    within_budget = np.cumsum(weights) <= int(0.04 * np.sum(weights))
    top_four = np.sum(ranked[within_budget][:, 0]) / np.sum(ranked[:, 0])

    # Column 1 ranks by prediction, column 0 by the labels themselves (the normalizer).
    gini_pred = _weighted_gini(1)
    gini_perfect = _weighted_gini(0)

    return 'AMEX', 0.5 * (gini_pred / gini_perfect + top_four), True

In [None]:
# Keep the last stored row of every customer, order the rows by customer_ID,
# drop S_2, and label every training row as class 0 for the adversarial task.
# NOTE(review): "last row" is the latest statement only if rows are stored
# chronologically per customer — confirm against the dataset.
train_data = (
    train_data
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
    .drop(columns=['S_2'])
    .assign(target=0)
    .reset_index(drop=True)
)

In [None]:
# Same reduction as for the training frame (last stored row per customer,
# ordered by customer_ID, S_2 dropped); test rows are labelled as class 1.
test_data = (
    test_data
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
    .drop(columns=['S_2'])
    .assign(target=1)
    .reset_index(drop=True)
)

In [None]:
# Stack the train rows (target == 0) and the test rows (target == 1) into one
# frame for the adversarial classifier.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with default arguments produces the same row-wise stack.
df = pd.concat([train_data, test_data])

# The two source frames are no longer needed; release their memory.
del test_data, train_data
gc.collect()

In [None]:
# Hold out 20% of the combined rows; both parts get a fresh 0..n-1 index so
# that later column-wise concatenations line up row by row.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=1234)
)

In [None]:
# One-hot encode the categorical columns. The encoder is fitted on the
# training split only and then reused for the held-out split.
cat_featurs = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

enc = OneHotEncoder()
enc.fit(train[cat_featurs])
ohe_columns = enc.get_feature_names_out()

train_enc_features = pd.DataFrame(
    enc.transform(train[cat_featurs]).toarray(), columns=ohe_columns
)
test_enc_features = pd.DataFrame(
    enc.transform(test[cat_featurs]).toarray(), columns=ohe_columns
)

# Both splits carry a 0..n-1 index, so the encoded columns align positionally.
train = pd.concat([train, train_enc_features], axis=1)
test = pd.concat([test, test_enc_features], axis=1)

del train_enc_features, test_enc_features
gc.collect()

In [None]:
# Model inputs: every column except the label, the customer id and the raw
# categorical columns (their one-hot encoded versions are used instead).
features = [
    f for f in train.columns
    if f != 'target' and f != 'customer_ID' and f not in cat_featurs
]

parameters = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.05,
    'min_child_samples': 1000,
    'reg_lambda':10,
    'verbose': 25,
    'seed':1234,
    'n_estimators':100
}

# Carve an evaluation set out of the training split for early stopping.
train_df, eval_df = train_test_split(train, test_size=0.2, random_state=1234)


lgb_train_data = lgb.Dataset(train_df[features], label=train_df['target'])
lgb_test_data = lgb.Dataset(eval_df[features], label=eval_df['target'])

# The `early_stopping_rounds` keyword of lgb.train was removed in LightGBM 4.0;
# the early_stopping callback is the supported way to request the same
# behaviour and also works on LightGBM 3.x.
clf = lgb.train(
    parameters,
    lgb_train_data,
    valid_sets=[lgb_test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

In [None]:
# Score the held-out split with the best iteration found during training and
# measure how well train rows and test rows can be told apart.
preds = clf.predict(test[features], num_iteration=clf.best_iteration)
score = roc_auc_score(y_true=test['target'], y_score=preds)

# The training-only objects are not used again; free them.
del train_df, eval_df, lgb_test_data, lgb_train_data
gc.collect()

In [None]:
# ROC AUC of the train-vs-test classifier on the held-out split
# (target is 0 for training-set rows and 1 for test-set rows).
score

In [None]:
# Pair each feature name with its LightGBM importance and rank them,
# most important first.
feat_imp = pd.DataFrame(
    {'feat_imp': clf.feature_importance(), 'feat_name': clf.feature_name()}
)
feat_imp = feat_imp.sort_values('feat_imp', ascending=False)

In [None]:
# The six most important features for telling train rows from test rows.
feat_imp.head(6)

In [None]:
# For each of the six most important features, overlay the normalized
# histogram of test-origin rows (target == 1) and train-origin rows
# (target == 0) taken from the adversarial training split.
for feat_name in feat_imp['feat_name'].iloc[:6]:
    plt.figure()
    plt.title(
        f"distribution for {feat_name}"
    )
    plt.hist(
        train.loc[train['target']==1, feat_name],
        color='orange',
        alpha=0.3,
        label = 'test_set',
        bins=100,
        density=True
    )
    plt.hist(
        train.loc[train['target']==0, feat_name],
        color='blue',
        alpha=0.3,
        label = 'train_set',
        bins=100,
        density=True
    )
    plt.xlabel(feat_name)
    plt.ylabel('density')
    # Without a legend the `label=` arguments above are never displayed and the
    # reader cannot tell which colour is which set.
    plt.legend()
    plt.show()

In [None]:
# Predicted probability, for every row of the adversarial training split,
# of belonging to the test set.
train_valid_prob = clf.predict(train[features], num_iteration=clf.best_iteration)
train['valid_prob'] = train_valid_prob

In [None]:
# Histogram of the predicted test-set probability over the training split.
fig, ax = plt.subplots()
ax.hist(train['valid_prob'], bins=100)
plt.show()

In [None]:
# Reload the raw training file: the earlier `train_data` frame was deleted
# after the combined adversarial frame was built.
inf_data = pd.read_feather(
    '../input/parquet-files-amexdefault-prediction/train_data.ftr'
)

# Last stored row per customer, indexed and ordered by customer_ID, S_2 dropped.
inf_data = (
    inf_data
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
    .drop(['S_2'], axis='columns')
)

# The encoded frame must share inf_data's customer_ID index. With the default
# RangeIndex, pd.concat(axis=1) outer-joins two disjoint indexes and returns
# twice as many rows, each one half filled with NaN.
inf_enc_features = pd.DataFrame(
    data=enc.transform(inf_data[cat_featurs]).toarray(),
    columns=enc.get_feature_names_out(),
    index=inf_data.index
)

inference = pd.concat([inf_data, inf_enc_features], axis=1)

# Clear the index name so that the later reset_index() call still produces a
# column named 'index' (now holding the customer IDs), which the export cell
# selects by that name.
inference.index.name = None

del inf_data, inf_enc_features

In [None]:
# Predicted probability of looking like a test-set row, for every inference row.
inference_valid_prob = clf.predict(inference[features], num_iteration=clf.best_iteration)
inference['valid_prob'] = inference_valid_prob

# Move the current index into a regular column.
inference = inference.reset_index()

In [None]:
# Histogram of the predicted test-set probability over the inference rows.
fig, ax = plt.subplots()
ax.hist(inference['valid_prob'], bins=100)
plt.show()

# Order the rows from least to most test-like.
inference = inference.sort_values(by='valid_prob')

In [None]:
# 90th percentile of the predicted test-set probability.
inference['valid_prob'].quantile(0.9)

In [None]:
# Persist the adversarial score of each inference row for use in later work.
inference.loc[:, ['index', 'valid_prob']].to_pickle('./adversarial_df.pkl')

I hope that you find this interesting and that you can use it to improve your score.

# THANK YOU!!