In [None]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import category_encoders as ce
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
def compute_recall_at4(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Share of all positives found in the top 4% of weighted predictions.

    Rows are ranked by ``y_pred`` from highest to lowest. Every row gets the
    weight ``20 - 19 * label`` (20 for a 0 label, 1 for a 1 label); the rows
    whose cumulative normalized weight stays at or below 0.04 form the
    "top 4%" bucket.

    Parameters
    ----------
    y_true : array-like
        Labels, expected to be 0/1. Lists, pandas Series and numpy arrays of
        any numeric dtype are accepted.
    y_pred : array-like
        Scores, same length as ``y_true``; a higher score ranks a row earlier.

    Returns
    -------
    float
        Sum of the labels inside the bucket divided by the sum of all labels.
        With no positive label this is 0/0 and evaluates to ``nan``.
    """
    # Coerce up front: plain positional indexing must work for lists and for
    # Series with an arbitrary index, and float64 keeps the cumulative sum
    # below from running in float32 when the labels arrive as float32.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred)

    # count of positives
    n_pos = y_true.sum()

    # desc sorting by prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_mask = cum_norm_weight <= 0.04

    # default rate captured at 4%
    return target[four_pct_mask].sum() / n_pos

def compute_normalized_gini(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Weighted gini coefficient of the ranking, divided by its maximum.

    Rows are ranked by ``y_pred`` from highest to lowest and weighted with
    ``20 - 19 * label`` (20 for a 0 label, 1 for a 1 label). The gini is the
    weighted sum of the gap between the cumulative share of positives and the
    cumulative share of weight along that ranking.

    Parameters
    ----------
    y_true : array-like
        Labels, expected to be 0/1. Lists, pandas Series and numpy arrays of
        any numeric dtype are accepted.
    y_pred : array-like
        Scores, same length as ``y_true``; a higher score ranks a row earlier.

    Returns
    -------
    float
        ``gini / gini_max``. Evaluates to ``nan`` when ``y_true`` holds no
        positive or no negative label (zero division).
    """
    # Coerce up front: plain positional indexing must work for lists and for
    # Series with an arbitrary index, and float64 keeps the cumulative sums
    # below from running in float32 when the labels arrive as float32.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting desc by prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # weighted gini coefficient
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient (closed form in n_pos / n_neg)
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    return gini / gini_max
    
def compute_amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean of the normalized weighted gini and the recall in the top 4%.

    Rows are ranked by ``y_pred`` from highest to lowest and weighted with
    ``20 - 19 * label`` (20 for a 0 label, 1 for a 1 label). Two quantities
    are derived from that single ranking:

    * ``d`` - share of all positives among the rows whose cumulative
      normalized weight is at most 0.04;
    * ``g`` - weighted gini coefficient divided by its maximum.

    Parameters
    ----------
    y_true : array-like
        Labels, expected to be 0/1. Lists, pandas Series and numpy arrays of
        any numeric dtype are accepted.
    y_pred : array-like
        Scores, same length as ``y_true``; a higher score ranks a row earlier.

    Returns
    -------
    float
        ``0.5 * (g + d)``. Evaluates to ``nan`` when ``y_true`` holds no
        positive label (zero division).
    """
    # Coerce up front: plain positional indexing must work for lists and for
    # Series with an arbitrary index, and float64 keeps the cumulative sums
    # below from running in float32 when the labels arrive as float32.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting desc by prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient (closed form in n_pos / n_neg)
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

In [None]:
# metrics in lgbm format

def metric_recall_at4(y_pred: np.ndarray, data: lgb.Dataset):
    """LightGBM custom-eval wrapper around ``compute_recall_at4``.

    Reads the labels from ``data`` and returns the triple
    ``(metric name, metric value, higher-is-better flag)``.
    """
    score = compute_recall_at4(data.get_label(), y_pred)
    return 'recall_at4', score, True

def metric_normalized_gini(y_pred: np.ndarray, data: lgb.Dataset):
    """LightGBM custom-eval wrapper around ``compute_normalized_gini``.

    Reads the labels from ``data`` and returns the triple
    ``(metric name, metric value, higher-is-better flag)``.
    """
    score = compute_normalized_gini(data.get_label(), y_pred)
    return 'norm_gini', score, True

def metric_amex(y_pred: np.ndarray, data: lgb.Dataset):
    """LightGBM custom-eval wrapper around ``compute_amex_metric``.

    Reads the labels from ``data`` and returns the triple
    ``(metric name, metric value, higher-is-better flag)``.
    """
    score = compute_amex_metric(data.get_label(), y_pred)
    return 'amex_metric', score, True

***
## load and prepare data

In [None]:
# training rows (parquet) and the per-customer labels (csv)
train = pd.read_parquet(
    path="../input/amex-data-integer-dtypes-parquet-format/train.parquet",
)
train_labels = pd.read_csv(
    filepath_or_buffer="../input/amex-default-prediction/train_labels.csv",
)

In [None]:
# one row per customer: order each customer's rows by S_2 descending and keep
# the first, i.e. the row with the largest S_2
train_agg = (
    train
    .sort_values(by=["customer_ID", "S_2"], ascending=[True, False])
    .drop_duplicates(subset="customer_ID", keep="first", ignore_index=True)
)

# the row-level frame is not used past this point; release its memory
del train
gc.collect()

In [None]:
# Model inputs: every column except the customer key, the S_2 ordering column
# and the label. Selecting by name instead of by position keeps the list
# correct if the column order changes, and keeps "target" out of the features
# when this cell is re-run after the labels have been merged into train_agg.
input_feats = (
    train_agg.columns
    .drop(["customer_ID", "S_2", "target"], errors="ignore")
    .tolist()
)
# columns handed to LightGBM as categorical features
categ_feats = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [None]:
# attach the label to each customer row; customers without a label are dropped
train_agg = train_agg.merge(right=train_labels, on="customer_ID", how="inner")

In [None]:
# list the dtype of every model input, one "<column> <dtype>" line each
for col in input_feats:
    print(col, train_agg.dtypes[col])

***
## model training

In [None]:
# 5 shuffled folds stratified on the label; the (train_idx, valid_idx) pairs
# are materialized once so the same split can be iterated repeatedly
skf = StratifiedKFold(shuffle=True, random_state=2112, n_splits=5)
skf_split = [*skf.split(X=train_agg, y=train_agg["target"].values)]

In [None]:
# LightGBM parameters shared by every fold model
model_params = dict(
    objective='binary',
    metric='None',                  # evaluation comes from the custom feval functions
    learning_rate=0.05,
    num_leaves=64,
    force_col_wise=True,
    bagging_freq=1,
    seed=2112,
    verbosity=0,
    first_metric_only=True,
    bin_construct_sample_cnt=100000000,
    feature_pre_filter=False,
    bagging_fraction=0.9,
    feature_fraction=0.2,
    lambda_l1=0.1,
    lambda_l2=0.1,
    min_data_in_leaf=1000,
    path_smooth=10,
    max_bin=255,
)

In [None]:
%%time
models = list()

# dataframe to store the oof predictions
oof = train_agg[["target"]].copy()
oof["pred"] = -1

for fold,(train_idx,valid_idx) in enumerate(skf_split):
    
    print(f" training model {fold+1}/{len(skf_split)} ".center(100, "#"))
    
    train_dset = lgb.Dataset(
        data=train_agg.loc[train_idx,input_feats],
        label=train_agg.loc[train_idx,"target"].values,
        categorical_feature=categ_feats,
        free_raw_data=True
    )
    valid_dset = lgb.Dataset(
        data=train_agg.loc[valid_idx,input_feats],
        label=train_agg.loc[valid_idx,"target"].values,
        categorical_feature=categ_feats,
        free_raw_data=True
    )
    
    model = lgb.train(
        params=model_params,
        train_set=train_dset,
        valid_sets=[valid_dset,],
        feval=[metric_amex, metric_recall_at4, metric_normalized_gini],
        num_boost_round=3000,
        callbacks=[lgb.log_evaluation(period=50), lgb.early_stopping(50)],
    )
    
    lgb.plot_importance(model, figsize=(8,15), importance_type="split", max_num_features=30)
    lgb.plot_importance(model, figsize=(8,15), importance_type="gain", max_num_features=30)
    plt.show()
    
    oof.loc[valid_idx,"pred"] = model.predict(train_agg.loc[valid_idx,input_feats])
    
    models.append(model)
    del train_dset,valid_dset
    gc.collect()

In [None]:
# scores of the out-of-fold predictions, all folds combined
print("OOF recall_at4:", compute_recall_at4(oof["target"].to_numpy(), oof["pred"].to_numpy()))
print("OOF normalized_gini:", compute_normalized_gini(oof["target"].to_numpy(), oof["pred"].to_numpy()))
print("OOF competition metric:", compute_amex_metric(oof["target"].to_numpy(), oof["pred"].to_numpy()))

In [None]:
# the training frame is not used by the cells below; drop it and force a
# garbage-collection pass before the test data is loaded
del train_agg
gc.collect()

***
## make predictions and submit

In [None]:
# test rows (parquet) and the submission template (csv)
test = pd.read_parquet(
    path="../input/amex-data-integer-dtypes-parquet-format/test.parquet",
)
sample_sub = pd.read_csv(
    filepath_or_buffer="../input/amex-default-prediction/sample_submission.csv",
)
sample_sub

In [None]:
# one row per customer: order each customer's rows by S_2 descending and keep
# the first, i.e. the row with the largest S_2 (same reduction as for train)
test_agg = (
    test
    .sort_values(by=["customer_ID", "S_2"], ascending=[True, False])
    .drop_duplicates(subset="customer_ID", keep="first", ignore_index=True)
)

# the row-level frame is not used past this point; release its memory
del test
gc.collect()

In [None]:
%%time
preds = [model.predict(test_agg[input_feats]) for model in models]
test_agg["prediction"] = np.mean(preds, axis=0)

In [None]:
# Left-join the predictions onto the submission template. A left join keeps
# every customer of the template, in template order, so a customer without a
# prediction shows up as NaN instead of being silently dropped (which the
# default inner join would do, making the NaN check that follows pointless).
sub = pd.merge(
    sample_sub[["customer_ID"]],
    test_agg[["customer_ID","prediction"]],
    how="left",
    on="customer_ID",
)

In [None]:
# the submission must have exactly one row per customer of the template ...
assert len(sub) == len(sample_sub), "submission row count differs from sample_submission"
# ... and every row must carry a prediction
assert sub.prediction.isna().sum() == 0, "submission contains missing predictions"

In [None]:
# write the submission file without the DataFrame index column
sub.to_csv(path_or_buf="submission.csv", index=False)

***