In [None]:
# Import libraries

import gc
import time  # sleep()
from itertools import cycle
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import KFold, GridSearchCV, ParameterGrid, train_test_split

import lightgbm as lgb  # LightGBM, fast.
import xgboost as xgb  # XGBoost

In [None]:
# Notebook-wide constants and display options

RANDOM_STATE = 22  # seed handed to every component that takes a random_state
pd.options.display.float_format = '{:.4f}'.format  # show floats with four decimals

In [None]:
# class CustomUtil:
#     def infer_by_knn(_cls):
        

In [None]:
# https://www.kaggle.com/code/inversion/amex-competition-metric-python
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Args:
        y_true: frame with a ``target`` column (0 or 1 labels).
        y_pred: frame with a ``prediction`` column; it is joined to ``y_true``
            column-wise, so both frames must share the same index.

    Returns:
        ``0.5 * (gini + capture)`` where rows with ``target == 0`` carry a
        weight of 20 and all other rows a weight of 1.
    """

    def _ranked(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score, attach row weights.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Share of all positives found before the cumulative weight exceeds
        # 4% of the total weight.
        ranked = _ranked(y_true, y_pred)
        weight_budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (is_positive & within_budget).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted area between the cumulative-positives curve and the
        # cumulative-weight ("random") curve, taken in score order.
        ranked = _ranked(y_true, y_pred)
        weights = ranked['weight']
        positive_weight = ranked['target'] * weights
        random_share = (weights / weights.sum()).cumsum()
        lorentz = positive_weight.cumsum() / positive_weight.sum()
        return ((lorentz - random_share) * weights).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Scale by the Gini obtained when the labels themselves are the scores.
        ideal_scores = y_true.rename({'target': 'prediction'}, axis='columns')
        achieved = weighted_gini(y_true, y_pred)
        best_possible = weighted_gini(y_true, ideal_scores)
        return achieved / best_possible

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)


In [None]:
# Load data

# Reading the raw CSVs raised an out-of-memory exception, so these stay disabled:
# train_df = pd.read_csv('../input/amex-default-prediction/train_data.csv')
# test_df = pd.read_csv('../input/amex-default-prediction/test_data.csv')

# Feather files are read instead, see
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327400
train_df = pd.read_feather(
    '../input/amex-default-prediction-feather/train.feather'
)
test_df = pd.read_feather(
    '../input/amex-default-prediction-feather/test.feather'
)

# Labels come from the original CSV; `target` is parsed as a pandas category.
train_labels = pd.read_csv(
    '../input/amex-default-prediction/train_labels.csv',
    dtype={'target': 'category'},
)

In [None]:
# (rows, columns) of the three loaded tables, in load order
tuple(table.shape for table in (train_df, test_df, train_labels))

# Preprocessing

In [None]:
# Cast types: the statement date becomes a real datetime in both frames
for frame in (train_df, test_df):
    frame['S_2'] = pd.to_datetime(frame['S_2'])

# Categorical features
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
category_dtypes = dict.fromkeys(cat_cols, 'category')
train_df = train_df.astype(category_dtypes)
test_df = test_df.astype(category_dtypes)

# NaN cells: drop every training column that contains at least one missing value
train_df = train_df.dropna(axis=1)

In [None]:
# categorical_df = pd.concat([train_df[['customer_ID', 'S_2', *cat_cols]], test_df[['customer_ID', 'S_2', *cat_cols]]], axis='index')
# categorical_df = pd.get_dummies(categorical_df, sparse=True)

In [None]:
# train_df = train_df.drop(columns=cat_cols).merge(categorical_df, how='left')
# test_df = test_df.drop(columns=cat_cols).merge(categorical_df, how='left')

In [None]:
# Keep only the columns that survived in the training frame, in the same order
test_df = test_df.loc[:, train_df.columns]

In [None]:
# Attach the label to every training row (left join on customer_ID)
train_df = pd.merge(train_df, train_labels, how='left', on='customer_ID')

In [None]:
# Only use one record per customer_ID (for brevity): the last row of each group
# in the frame's current order.
# NOTE(review): this is the *latest* record only if rows are already ordered by
# S_2 within each customer — confirm against the input files.
def _last_row_per_customer(frame: pd.DataFrame) -> pd.DataFrame:
    """Return the final row of every ``customer_ID`` group with a fresh 0..n-1 index."""
    per_customer = frame.groupby('customer_ID')
    return per_customer.tail(1).reset_index(drop=True)


train_df = _last_row_per_customer(train_df)
test_df = _last_row_per_customer(test_df)

In [None]:
# Preview the training frame after the label merge and per-customer reduction
train_df

# Feature engineering

In [None]:
# Print pandas' DataFrame.info() summary of the training frame
train_df.info()

# Modeling

In [None]:
# Label column, followed by every column that must not be used as a feature
target_col = 'target'
droppable_cols = ['customer_ID', 'S_2'] + [target_col]

## LightGBM

In [None]:
# Remove submission files left over from a previous run.
# Done in Python rather than with a bare `rm` line so the cell does not depend
# on IPython automagic and stays quiet when there is nothing to delete.
for stale_submission in Path('.').glob('submission_*.*'):
    if stale_submission.is_file():
        stale_submission.unlink()

In [None]:
# Split data
X_train = train_df.drop(columns=droppable_cols)
# The label is loaded as a pandas category; cast it to integers once here so the
# classifier and the metric both see plain 0/1 values (same cast as the XGBoost cell).
y_train = train_df[target_col].astype('int8')
X_test = test_df.drop(columns=droppable_cols, errors='ignore')

# Hyperparameters shared by the grid search and the direct fit.
# `objective` must be a plain string when handed straight to the estimator.
lgbm_params = {'learning_rate': 0.015, 'n_estimators': 1_500, 'num_leaves': 500, 'objective': 'binary'}

# Modeling
is_modeling = False  # True: cross-validated grid search; False: single fit on all rows
if is_modeling:
    # GridSearchCV expects a list of candidate values per parameter
    param_grid = {name: [value] for name, value in lgbm_params.items()}
    kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    model = GridSearchCV(estimator=lgb.LGBMClassifier(), param_grid=param_grid, verbose=3, cv=kf)
    model.fit(X_train, y_train)

    # Output result
    print('* Best params :', model.best_params_, sep='\n')
    print('* Best score :', model.best_score_, sep='\t')

    # Feature importances live on the refitted best estimator, not on the
    # GridSearchCV wrapper itself.
    feature_importance_df = pd.DataFrame({
        'column': X_train.columns,
        'importance': model.best_estimator_.feature_importances_,
    })
    plt.figure(figsize=(20, 10))
    sns.barplot(data=feature_importance_df, x='importance', y='column')
    plt.show()
else:
    model = lgb.LGBMClassifier(**lgbm_params)
    model.fit(X_train, y_train)

# Calculate amex metric.
# The metric ranks rows by `prediction`, so it needs the positive-class
# probability; hard 0/1 labels would tie almost every row.
# Note this is an in-sample score: X_train is the data the model was fitted on.
train_scores = model.predict_proba(X_train)[:, 1]
am = amex_metric(
    y_train.to_frame(),
    pd.DataFrame({'prediction': train_scores}, index=y_train.index),  # same index so the column-wise join lines up
)
print('amex metric :', am)

# Inference: write the same kind of score (positive-class probability) per customer
test_scores = model.predict_proba(X_test)[:, 1]
submission_df = pd.DataFrame({'customer_ID': test_df['customer_ID'], 'prediction': test_scores})
submission_df.to_csv('submission_lgbm.csv.zip', index=False)

## XGBoost

In [None]:
# https://xgboost.readthedocs.io/en/stable/parameter.html

In [None]:
# Run a garbage-collection pass; the cell output is the value gc.collect() returns.
# (`gc` is imported with the other libraries in the first cell.)
gc.collect()

In [None]:
# Sleep for GC: pause for a fixed interval before continuing
_GC_PAUSE_SECONDS = 120
time.sleep(_GC_PAUSE_SECONDS)

In [None]:
# Split data: feature matrices first, then the label vector
X_train = train_df.drop(columns=droppable_cols)
X_test = test_df.drop(columns=droppable_cols, errors='ignore')

# XGBClassifier requires integer type labels
y_train = train_df[target_col].astype(np.int8)

In [None]:
# # Modeling
# is_modeling = False
# if is_modeling:
#     param_grid = {'learning_rate': [0.01, 0.015], 'max_depth': [50, 100], 'n_estimators': [50, 100]}
#     scores = []
    
#     for param in ParameterGrid(param_grid):
#         model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='error', tree_method='approx',
#                                   enable_categorical=True, random_state=RANDOM_STATE, verbosity=1, **param)
#         model.fit(X_train, y_train)
#         score = model.score(X_train, y_train)
#         scores.append((param, score))
#         print('param & score', param, score, sep='\n')
        
#     # Output result
#     score_best = min(scores, key=lambda s: s[1])
#     print(*scores, sep='\n')
# #     print('* Best score :', model.best_score, sep='\t')
    
#     # Feature importances
#     feature_importance_df = pd.DataFrame({'column': model.feature_names_in_, 'importance': model.feature_importances_})
#     plt.figure(figsize=(20, 10))
#     sns.barplot(data=feature_importance_df, x='importance', y='column')
#     plt.show()
# else:
#     params = {'learning_rate': 0.01, 'max_depth': 50, 'n_estimators': 100}
#     model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='error', tree_method='approx',
#                               enable_categorical=True, random_state=RANDOM_STATE, verbosity=1, **params)
#     model.fit(X_train, y_train)

# # Inference
# test_df['prediction'] = model.predict(X_test)
# test_df[['customer_ID', 'prediction']].to_csv('submission_xgb.csv.zip', index=False)