In [None]:
import os
import gc
import glob
import numpy as np
import pandas as pd
import xgboost as xgb
# Random seed; passed as `random_state` to the XGBoost classifier defined below.
SEED = 42

In [None]:
# --- Labels -------------------------------------------------------------------
train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
# Encode each string ID as the integer value of its last 16 hex characters and
# use it as the index; the label lookup further down uses the feature frame's
# index values as keys into this frame.
train_labels['customer_ID'] = train_labels['customer_ID'].apply(lambda x: int(x[-16:], 16)).astype(np.int64)
train_labels = train_labels.set_index('customer_ID')

# --- Feature shards -----------------------------------------------------------
train_pkls = sorted(glob.glob('../input/amex-processed-dataset/train_data_*'))
test_pkls = sorted(glob.glob('../input/amex-processed-dataset/test_data_*'))
if not train_pkls:
    raise FileNotFoundError(
        "No training shards match '../input/amex-processed-dataset/train_data_*'"
    )

# Read every shard as float32 and concatenate once. `DataFrame.append` was
# removed in pandas 2.0, and calling it in a loop re-copies the growing frame
# on every iteration.
# NOTE: unpickling can execute arbitrary code; only load shards from a trusted source.
train_chunks = []
for path in train_pkls:
    print(path)
    train_chunks.append(pd.read_pickle(path).astype(np.float32))
train_df = pd.concat(train_chunks)
del train_chunks
gc.collect()

# Look up the labels in the same row order as the features. `.values` on a
# DataFrame is 2-D, so `y` has one column per remaining label column.
y = train_labels.loc[train_df.index.values].values.astype(np.int8)
# Columns excluded from training (presumably one-hot levels that the test shards
# lack — TODO confirm). The frame is already float32, so no further cast is needed.
train_df = train_df.drop(columns=['D_64_-1', 'D_66_0.0', 'D_68_0.0'])
print(train_df.shape, y.shape)

In [None]:
# train_df.head()

In [None]:
def amex_metric(y_true, y_pred):
    """Score predictions with a weighted capture-rate / Gini-ratio metric.

    Rows whose true label is 0 carry weight 20; every other row carries
    weight 1. The result is the mean of two terms:

    * capture rate - the share of all positive labels found among the
      highest-scored rows whose cumulative weight stays within 4% of the
      total weight;
    * Gini ratio - the weighted Gini of the ordering induced by ``y_pred``
      divided by the weighted Gini of the ordering induced by ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth 0/1 labels.
    y_pred : array-like
        Predicted scores; rows are ranked from highest to lowest score.

    Both inputs are flattened first, so ``(n,)`` and ``(n, 1)`` shapes are
    accepted interchangeably.

    Returns
    -------
    float
        ``0.5 * (gini_ratio + capture_rate)``. Not finite when ``y_true``
        contains no positive rows, because both terms divide by positive totals.

    Raises
    ------
    ValueError
        If the flattened inputs differ in length.
    """
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    def _weighted_gini(order):
        # Weighted Gini of the labels when rows are visited in `order`.
        target = y_true[order]
        weight = np.where(target == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        lorentz = np.cumsum(target * weight) / np.sum(target * weight)
        return np.sum((lorentz - weight_random) * weight)

    # Descending orderings by predicted score and by true label.
    by_pred = y_pred.argsort()[::-1]
    by_true = y_true.argsort()[::-1]

    # Capture rate inside the top 4% of cumulative weight.
    target = y_true[by_pred]
    weight = np.where(target == 0, 20, 1)
    in_top = np.cumsum(weight) <= int(0.04 * np.sum(weight))
    top_four = np.sum(target[in_top]) / np.sum(y_true)

    return 0.5 * (_weighted_gini(by_pred) / _weighted_gini(by_true) + top_four)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for validation, stratified on `y` so both splits keep
# the same class balance. Seeding the shuffle with SEED makes the split, and
# therefore every validation score, repeatable across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    train_df,
    y,
    stratify=y,
    test_size=0.25,
    random_state=SEED,
)

# The unsplit frame and labels are not used again; release their memory.
del train_df, y
gc.collect()

In [None]:
# Gradient-boosted tree classifier: many shallow trees with a small learning
# rate, each tree seeing every row but only 20% of the columns.
model = xgb.XGBClassifier(
    # Ensemble size and shape
    n_estimators=5000,
    max_depth=3,
    learning_rate=0.05,
    # Row / column sampling
    subsample=1,
    colsample_bytree=0.2,
    # GPU training and prediction
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    # Custom metric reported on the evaluation sets passed to fit()
    eval_metric=amex_metric,
    random_state=SEED,
)

In [None]:
# Fit on the training split, evaluating on both the training and validation
# splits; verbose=50 logs the evaluation results every 50 boosting rounds.
model.fit(
    X_train,
    y_train,
    eval_set=[
        (X_train, y_train),
        (X_val, y_val),
    ],
    verbose=50,
)

In [None]:
submission = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
# Index the template by the integer value of the last 16 hex characters of each
# ID while keeping the original string ID as a column, so integer-indexed frames
# can be mapped back to string IDs with `.loc`.
# The previous chained assignment also wrote this Series into
# `train_labels['customer_ID']`, which was an unintended side effect, and
# `set_axis(..., inplace=True)` no longer exists in pandas 2.x.
submission.index = pd.Index(
    submission['customer_ID'].apply(lambda x: int(x[-16:], 16)).astype(np.int64),
    name='customer_ID_encoded',
)
submission['prediction'] = submission['prediction'].astype(np.float32)

In [None]:
%%time

customer_ids_list = []
preds_list = []

# Feature names the booster was fitted with (None if it was fitted without
# names). Test shards are aligned to them so extra or re-ordered columns cannot
# cause a feature mismatch at prediction time.
feature_names = model.get_booster().feature_names

for t in test_pkls:
    # NOTE: unpickling can execute arbitrary code; only load trusted shards.
    test_df = pd.read_pickle(t)

    # Map the shard's integer index back to the original string customer IDs.
    customer_ids = submission.loc[test_df.index.values, 'customer_ID'].values
    customer_ids_list.append(customer_ids)

    if feature_names is not None:
        test_df = test_df[feature_names]

    # Column 1 of predict_proba is the probability of the positive class.
    preds = model.predict_proba(test_df)[:, 1]
    preds_list.append(preds)

    del test_df
    gc.collect()

# Stack the per-shard arrays once, as (n, 1) column vectors.
if preds_list:
    preds_list = np.concatenate(preds_list).reshape(-1, 1)
    customer_ids_list = np.concatenate(customer_ids_list).reshape(-1, 1)
else:
    # No test shards found: keep the same empty (0, 1) shape as before.
    preds_list = np.array(preds_list).reshape(-1, 1)
    customer_ids_list = np.array(customer_ids_list).reshape(-1, 1)

In [None]:
# Build the frame column by column so each column keeps its own dtype.
# Concatenating the string IDs with the float predictions into one array
# would coerce the predictions to strings/objects.
sub = pd.DataFrame({
    'customer_ID': np.asarray(customer_ids_list).ravel(),
    'prediction': np.asarray(preds_list).ravel(),
})
sub.to_csv('submission.csv', index=False)