In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# Directory holding the competition CSV files read later in this notebook
# (train_data.csv and train_labels.csv).
input_path = Path('/kaggle/input/amex-default-prediction/')

We copied the Python implementation of the metric from the [competition host's notebook](https://www.kaggle.com/code/inversion/amex-competition-metric-python).

In [None]:
def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Compute the unnormalized weighted Gini term of the metric.

    Rows are ranked by ``prediction`` from highest to lowest. A row whose
    ``target`` equals 0 carries a weight of 20; every other row carries a
    weight of 1. The result is the weighted sum of the gap between the
    cumulative share of weighted positives found so far (the Lorentz curve)
    and the cumulative share of total weight (the random baseline).

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column. It is concatenated column-wise
        with ``y_true``, so the two frames are matched on their index.

    Returns
    -------
    float
        Sum over all rows of ``(lorentz - random) * weight``.
    """
    ranked = pd.concat([y_true, y_pred], axis='columns')
    ranked = ranked.sort_values('prediction', ascending=False)

    target = ranked['target']
    # Negatives (target == 0) count 20 times; everything else counts once.
    weight = pd.Series(np.where(target == 0, 20, 1), index=ranked.index)

    weighted_pos = target * weight
    # Cumulative share of total weight seen so far.
    random_curve = (weight / weight.sum()).cumsum()
    # Cumulative share of weighted positives seen so far.
    lorentz_curve = weighted_pos.cumsum() / weighted_pos.sum()

    return ((lorentz_curve - random_curve) * weight).sum()

In [None]:
def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Scale the weighted Gini of the predictions by that of a perfect ranking.

    The perfect ranking is obtained by using the targets themselves as
    predictions (the ``target`` column renamed to ``prediction``).

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column.

    Returns
    -------
    float
        ``weighted_gini(y_true, y_pred)`` divided by the weighted Gini
        obtained when the targets are used as predictions.
    """
    perfect_pred = y_true.rename(columns={'target': 'prediction'})
    model_gini = weighted_gini(y_true, y_pred)
    perfect_gini = weighted_gini(y_true, perfect_pred)
    return model_gini / perfect_gini

In [None]:
def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Share of positives found within the top 4% of weighted predictions.

    Rows are ranked by ``prediction`` from highest to lowest. A row whose
    ``target`` equals 0 carries a weight of 20; every other row carries a
    weight of 1. Rows are kept while their running weight total stays within
    4% of the overall weight (truncated to an integer).

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column. It is concatenated column-wise
        with ``y_true``, so the two frames are matched on their index.

    Returns
    -------
    float
        Number of rows with ``target == 1`` inside the cutoff divided by the
        number of rows with ``target == 1`` overall.
    """
    ranked = pd.concat([y_true, y_pred], axis='columns')
    ranked = ranked.sort_values('prediction', ascending=False)

    target = ranked['target']
    # Negatives (target == 0) count 20 times; everything else counts once.
    weight = np.where(target == 0, 20, 1)

    # int() truncates, so the budget is the floor of 4% of the total weight.
    weight_budget = int(0.04 * weight.sum())
    within_budget = weight.cumsum() <= weight_budget

    is_positive = target == 1
    return (is_positive & within_budget).sum() / is_positive.sum()

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column.

    Returns
    -------
    float
        ``0.5 * (normalized_weighted_gini + top_four_percent_captured)``.
    """
    gini_score = normalized_weighted_gini(y_true, y_pred)
    capture_rate = top_four_percent_captured(y_true, y_pred)
    return 0.5 * (gini_score + capture_rate)

## Simple Benchmark

We run the metric on a few small sample cases so that the results can be compared against my implementation in R.

### Test 1

In [None]:
# Perfectly separated case: every positive (target == 1) has a higher
# prediction than every negative, so the normalized Gini is 1. The total
# weight is 63, the 4% cutoff is int(2.52) = 2, and 2 of the 3 positives
# fall inside it.
y_true = pd.DataFrame({'target': [0, 1, 0, 1, 0, 1]})
y_pred = pd.DataFrame({'prediction': [0.1, 0.9, 0.2, 0.88, 0.3, 0.75]})

# Each value is printed to six decimals and followed by a blank line.
print(f"top_four_percent_captured: {top_four_percent_captured(y_true, y_pred):.6f}\n")
print(f"normalized_weighted_gini: {normalized_weighted_gini(y_true, y_pred):.6f}\n")
print(f"amex_metric: {amex_metric(y_true, y_pred):.6f}\n")

### Test 2

In [None]:
# Imperfect ranking: six positives followed by nine negatives, with
# predictions that interleave the two classes (for example, the negative
# scored 0.78 outranks four of the positives).
y_true = pd.DataFrame({'target': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
y_pred = pd.DataFrame({'prediction': [0.9, 0.3, 0.8, 0.75, 0.65, 0.6, 0.78, 0.7, 0.05, 0.41, 0.42, 0.05, 0.5, 0.11, 0.12]})

# Each value is printed to six decimals and followed by a blank line.
print(f"top_four_percent_captured: {top_four_percent_captured(y_true, y_pred):.6f}\n")
print(f"normalized_weighted_gini: {normalized_weighted_gini(y_true, y_pred):.6f}\n")
print(f"amex_metric: {amex_metric(y_true, y_pred):.6f}\n")

### Test 3

In [None]:
# Ten rows with four positives. Two positives share the prediction 0.62
# (a tie), and the highest prediction (0.86) belongs to a negative, whose
# weight of 20 already exceeds the 4% cutoff of int(0.04 * 124) = 4.
y_true = pd.DataFrame({'target': [1, 1, 0, 1, 0, 1, 0, 0, 0, 0]})
y_pred = pd.DataFrame({'prediction': [0.11, 0.62, 0.61, 0.62, 0.86, 0.64, 0.01, 0.23, 0.67, 0.51]})

# Each value is printed to six decimals and followed by a blank line.
print(f"top_four_percent_captured: {top_four_percent_captured(y_true, y_pred):.6f}\n")
print(f"normalized_weighted_gini: {normalized_weighted_gini(y_true, y_pred):.6f}\n")
print(f"amex_metric: {amex_metric(y_true, y_pred):.6f}\n")

### AMEX Test

In [None]:
# Read only the customer key and the P_2 feature from the training data;
# every other column is skipped via `usecols`. customer_ID becomes the index
# (presumably several rows per customer, given the per-customer mean that is
# taken from this frame later).
train_data = pd.read_csv(
    input_path / 'train_data.csv',
    index_col='customer_ID',
    usecols=['customer_ID', 'P_2'])

# Labels are indexed by customer_ID as well, so they line up with the
# per-customer predictions when the metric joins the two frames column-wise.
# NOTE(review): the metric functions expect a `target` column here - confirm.
train_labels = pd.read_csv(input_path / 'train_labels.csv', index_col='customer_ID')

In [None]:
# Per-customer mean of P_2, turned into a score in a single chain:
# scale the mean by its maximum and take the complement, so that customers
# with a lower average P_2 receive a higher `prediction`.
ave_p2 = (
    train_data
    .groupby('customer_ID')
    .mean()
    .rename(columns={'P_2': 'prediction'})
    .assign(prediction=lambda frame: 1.0 - (frame['prediction'] / frame['prediction'].max()))
)

In [None]:
# Score the P_2-based baseline against the training labels.
print(amex_metric(train_labels, ave_p2))  # reference value noted by the author: 0.572773