# What did I change:
- Cloned from https://www.kaggle.com/code/lucasmorin/amex-woe-baseline

- Replaced the original private dataset with my newly created dataset:
https://www.kaggle.com/competitions/amex-default-prediction/discussion/327228

- Note that in the original WoE baseline notebook, the private dataset contains only 354 columns after `prepare_df`, which indicates that some feature selection was done offline.

Simple Weight of Evidence (WoE) baseline. WoE is a target-encoding technique that replaces each feature value with an associated value that has nice additive properties.
It gives a strong baseline, generally at the cost of ignoring feature interactions.

**Don't forget to upvote if you find this interesting or useful**

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt

import gc

from sklearn.base import BaseEstimator, TransformerMixin

# Matplotlib defaults: high-resolution figures without the top/right spines.
mpl.rcParams.update({
    'figure.dpi': 200,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Pandas display limits: show up to 100 rows and 1000 columns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 1000)

## Aggregate_data

In [None]:
def prepare_df(df):
    """Aggregate statement-level rows into one feature row per customer.

    Every column other than ``customer_ID`` is summarised by its per-customer
    mean and standard deviation, producing columns named ``<column>_mean``
    and ``<column>_std``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``customer_ID`` column; the remaining columns are
        expected to be numeric so that they can be averaged.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_ID`` (used as the index), with no NaN left.
    """
    # Use the string aliases rather than np.mean / np.std: pandas deprecated
    # passing the NumPy callables to groupby.agg, and the aliases pin the
    # behaviour they currently map to (groupby ``mean`` and ``std``).
    df_out = df.groupby('customer_ID').agg(['mean', 'std'])
    df_out.columns = [f'{column}_{stat}' for column, stat in df_out.columns]
    # Remaining NaNs (e.g. the std of a customer with a single row) are filled
    # with ONE scalar: the mean over every value of the aggregated table.
    # NOTE(review): a per-column fill may be what was intended -- confirm
    # before changing, since downstream thresholds were tuned on this.
    df_out = df_out.fillna(np.nanmean(df_out.to_numpy()))
    return df_out

In [None]:
%%time
# Load the statement-level training rows from the pre-built pickle.
# NOTE: pd.read_pickle unpickles arbitrary objects; only load trusted files.
train_data = pd.read_pickle('../input/ae-credit-id-encoded-dataset-fp16/id_encoded_fp16_train_data.pkl')
# Drop the S_2 column before aggregating (presumably the statement date,
# which cannot be averaged -- TODO confirm). Done in place to avoid a copy.
train_data.drop(columns=["S_2"], inplace=True)

In [None]:
# Collapse the statement-level rows into per-customer mean/std features.
train_data_grp = prepare_df(train_data)
# The raw frame is no longer needed; release it before loading more data.
del train_data
gc.collect()

In [None]:
# Training labels; later cells read its `target` and `customer_ID` columns.
train_labels = pd.read_pickle('../input/ae-credit-id-encoded-dataset-fp16/id_encoded_train_labels.pkl')

In [None]:
%%time
# Load the statement-level test rows from the pre-built pickle.
# NOTE: pd.read_pickle unpickles arbitrary objects; only load trusted files.
test_data = pd.read_pickle('../input/ae-credit-id-encoded-dataset-fp16/id_encoded_fp16_test_data.pkl')
# Drop the S_2 column, mirroring the training preprocessing (presumably the
# statement date, which cannot be averaged -- TODO confirm).
test_data.drop(columns=["S_2"], inplace=True)

In [None]:
# Same per-customer mean/std aggregation as for the training data.
test_data_grp = prepare_df(test_data)
# The raw test frame is no longer needed once it has been aggregated.
del test_data

In [None]:
# Force a collection pass after deleting the raw test frame.
# (gc is already imported at the top of the notebook; re-importing is harmless.)
import gc
gc.collect()

# weight of evidence

Standard credit-scoring technique: a target encoding that replaces each feature value with an additive value, which helps build credit scorecards. This is a personal sklearn-style implementation (it doesn't handle edge cases very well).

In [None]:
class WoE_Imputer(BaseEstimator, TransformerMixin):
    """Encode one feature by the Weight of Evidence (WoE) of the bin it falls in.

    The feature is binned (quantile bins for numeric features, one bin per
    category for categorical ones) and each bin is replaced by the log-odds
    ``log(positives / negatives)`` observed in that bin during ``fit``.
    Missing values get their own bin, so the transformer also acts as an
    imputer.  The encoding is adapted to the functional form of logistic
    regression.  ``fit`` additionally computes an Information-Value-like
    score (``IV``) that can be used for feature selection.

    Parameters
    ----------
    feature_name : hashable
        Column of ``X`` to fit on and to overwrite in ``transform``.
    n_bin : int, default 100
        Number of quantile bins requested for numeric features; duplicate
        bin edges are dropped, so fewer bins may be produced.
    Categorical : bool, default False
        Treat the feature as categorical (one bin per distinct value).
    verbosity : int, default 1
        When greater than 0, ``fit`` prints the computed ``IV``.

    Attributes
    ----------
    bins : numpy.ndarray or pandas.Index
        Bin edges (numeric, outer edges widened to +/-inf) or the category
        labels (categorical) learnt by ``fit``.
    WoE_values : pandas.Series
        One value per bin in bin order; the last entry is the value of the
        ``'missing_value'`` bin.
    IV : float
        ``sum((positives - negatives) * WoE / n_rows)`` over the bins.  Note
        that this is not the textbook IV normalisation.
    """

    def __init__(self, feature_name, n_bin = 100, Categorical = False, verbosity = 1):
        self.feature_name = feature_name
        self.n_bin = n_bin
        self.bins = []
        self.WoE_values = []
        self.Categorical = Categorical
        self.verbosity = verbosity
        self.IV = 0

    def fit(self, X, y = None):
        """Learn the bins, their WoE values and the IV of the feature.

        ``y`` is matched to the rows of ``X`` by position, not by index.

        Raises
        ------
        ValueError
            If ``y`` is missing, or if no bin contains both classes (so no
            finite log-odds exists to interpolate from).
        """
        if y is None:
            raise ValueError('Woe Imputer is a supervised imputer. It needs a target')

        column = X[self.feature_name]
        if self.Categorical:
            groups = column.astype('category')
            # Store the category labels (not the ``.cat`` accessor) so that
            # ``transform`` can map each label back to its WoE value.
            self.bins = groups.cat.categories
        else:
            _, self.bins = pd.qcut(column, q=self.n_bin, duplicates='drop', retbins=True)
            # Open the outer edges so values outside the training range still
            # land in the first/last bin.  ``np.inf`` replaces the ``np.Inf``
            # alias, which NumPy 2.0 removed.
            self.bins[0] = -np.inf
            self.bins[-1] = np.inf
            groups = pd.cut(column, bins=self.bins)

        # Missing values form their own bin, appended after the regular ones.
        groups = groups.cat.add_categories('missing_value').fillna('missing_value')

        frame = pd.DataFrame({'group': groups, 'target': np.asarray(y).ravel()})

        # observed=False keeps empty bins (including an empty 'missing_value'
        # bin); ``transform`` relies on one WoE value per bin, in bin order.
        by_bin = frame.groupby('group', observed=False)['target']
        positives = by_bin.sum()
        negatives = by_bin.count() - positives

        # Empty bins give 0/0 -> NaN; single-class bins give +/-inf.
        with np.errstate(divide='ignore', invalid='ignore'):
            log_odds = np.log(positives / negatives)

        # Interpolate every non-finite bin from its finite neighbours (by bin
        # position).  Infinite values are treated like NaN so that one pure
        # bin cannot push averaged predictions to +/-inf.
        woe = log_odds.to_numpy(dtype='float64', copy=True)
        unusable = ~np.isfinite(woe)
        if unusable.all():
            raise ValueError(
                'Cannot compute WoE for ' + str(self.feature_name)
                + ': no bin contains both classes'
            )
        woe[unusable] = np.interp(
            np.flatnonzero(unusable), np.flatnonzero(~unusable), woe[~unusable]
        )

        self.WoE_values = pd.Series(woe, index=log_odds.index, name=log_odds.name)

        self.IV = ((positives - negatives) * self.WoE_values / len(frame)).sum()

        if self.verbosity>0:
            print('Information Value ' + str(self.feature_name)+': ' + str(round(self.IV,5)))

        return self

    def transform(self, X):
        """Overwrite ``X[feature_name]`` with its WoE encoding and return ``X``.

        ``X`` is modified in place.  Missing values (and, for categorical
        features, labels unseen during ``fit``) receive the WoE of the
        ``'missing_value'`` bin.  The encoded column is float32.
        """
        column = X[self.feature_name]
        # Positional access: the last entry is the 'missing_value' bin.
        bin_woe = self.WoE_values.iloc[:-1].to_numpy()
        missing_woe = self.WoE_values.iloc[-1]

        if self.Categorical:
            lookup = dict(zip(self.bins, bin_woe))
            encoded = column.astype(object).map(lookup).astype('float32')
        else:
            # ordered=False because interpolation can yield duplicate labels.
            encoded = pd.cut(column, bins=self.bins, labels=bin_woe, ordered=False).astype('float32')

        X[self.feature_name] = encoded.fillna(missing_woe)
        return X

    def __get_val__(self):
        """Return ``(feature_name, n_bin, bins, WoE_values, IV)``."""
        return self.feature_name, self.n_bin, self.bins, self.WoE_values, self.IV

In [None]:
# `tqdm_notebook` at the package top level is deprecated; import the notebook
# progress bar from tqdm.notebook and keep the old name for later cells.
from tqdm.notebook import tqdm as tqdm_notebook

# Every aggregated column except those starting with 'target' is encoded.
Features = [f for f in train_data_grp.columns if not f.startswith('target')]

# IV of each feature, in the same order as `Features`.
IV_list = []

for f in tqdm_notebook(Features):
    # Fit on the training rows only, then apply the same bins and WoE values
    # to both train and test (transform overwrites the column in place).
    # NOTE(review): fit pairs `train_labels.target` with the rows of
    # `train_data_grp` by position -- confirm both are in the same order.
    WoE_imp = WoE_Imputer(f, n_bin = 50, verbosity = 0)
    WoE_imp.fit(train_data_grp, y = train_labels.target)
    train_data_grp = WoE_imp.transform(train_data_grp)
    test_data_grp = WoE_imp.transform(test_data_grp)
    IV_list.append(WoE_imp.IV)

In [None]:
# Rank the features by IV, highest first, and plot the sorted IV curve.
sorted_IV = pd.DataFrame({'Features':Features,'IV':IV_list}).sort_values('IV',ascending=False).reset_index(drop=True)
# The trailing semicolon keeps the notebook from echoing the return value.
plt.plot(sorted_IV.IV);

In [None]:
# Keep only the features whose IV is strictly above the threshold.
IV_threshold = 1.2
list_features = sorted_IV[sorted_IV.IV>IV_threshold].Features.to_list()

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    ``y_true`` must provide a ``target`` column and ``y_pred`` a
    ``prediction`` column; the two are joined on their index.  Rows with
    ``target == 0`` are weighted 20, every other row 1.
    """

    def _ranked(labels, scores):
        # Join labels and scores, rank by descending score, attach weights.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].eq(0).map({True: 20, False: 1})
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Share of positives found within the top 4% of cumulative weight.
        ranked = _ranked(y_true, y_pred)
        budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= budget
        is_positive = ranked['target'] == 1
        return (is_positive & within_budget).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted area between the Lorentz curve and the random baseline.
        ranked = _ranked(y_true, y_pred)
        weight = ranked['weight']
        random_curve = (weight / weight.sum()).cumsum()
        weighted_positives = ranked['target'] * weight
        lorentz = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz - random_curve) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Normalize by the Gini obtained when the labels are the predictions.
        perfect_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect_pred)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

pos / (pos + neg) = 1 / (1 + neg/pos) = 1 / (1 + exp(-log(pos/neg)))

In [None]:
# Average the WoE (log-odds) encodings of the selected features per customer,
# then apply the logistic function to turn the score into a value in (0, 1).
pred_train = train_data_grp[list_features].mean(axis=1)
prob_train = 1/(1+np.exp(-pred_train))

In [None]:
# Competition metric on the training predictions. The WoE encoders were fitted
# on these same rows, so this is an in-sample score.
amex_metric(train_labels.set_index('customer_ID'), prob_train.rename('prediction'))

# submission

In [None]:
# Start from the sample submission frame and overwrite its predictions.
df_sub = pd.read_pickle('../input/ae-credit-id-encoded-dataset-fp16/id_encoded_sample_submission.pkl')

# Same scoring as for the training data: mean WoE, then logistic function.
pred_test = test_data_grp[list_features].mean(axis=1)
prob_test = 1/(1+np.exp(-pred_test))
# NOTE(review): `.values` assigns by position, so this assumes df_sub rows are
# in the same customer order as test_data_grp -- confirm.
df_sub.prediction = prob_test.values

In [None]:
# Rebuild the LabelEncoder from its saved classes and map the encoded
# customer_ID column back to the original identifiers.
from sklearn.preprocessing import LabelEncoder
loaded_encoder = LabelEncoder()
# allow_pickle=True lets np.load unpickle objects; only load trusted files.
loaded_encoder.classes_ = np.load(f"../input/ae-credit-id-encoded-dataset-fp16/id_encodings.npy", allow_pickle=True)
df_sub["customer_ID"] = loaded_encoder.inverse_transform(df_sub["customer_ID"])

In [None]:
# Write the submission file with customer_ID as the index column.
df_sub.set_index('customer_ID').to_csv('submission.csv')

In [None]:
# Display the final submission frame as the cell output.
df_sub