## **Import Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier, early_stopping, log_evaluation

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OrdinalEncoder

import gc

In [None]:
class CONFIG:
    """Notebook-wide settings referenced by the cells below."""

    # Directory prefix prepended to the data file names when reading.
    path = '../input/amexfeather/'
    # Presumably an alternative data location for runs outside Kaggle; it is
    # not read anywhere in the visible cells.
    local_path = ''
    # Presumably marks a run inside the Kaggle environment; not read anywhere
    # in the visible cells.
    kaggle = True
    # Seed handed to the LightGBM model.
    random_state = 69420

## **Data Preprocessing**

In [None]:
%%time

# Read the training table from the Feather file under CONFIG.path and preview
# its first rows.
train = pd.read_feather(CONFIG.path + 'train_data.ftr')
train.head()

In [None]:
# Dimensions of the training frame as loaded: (rows, columns).
train.shape

In [None]:
%%time

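# Keep only the last statement row per customer.
# NOTE: the chained draft below is kept for reference only. As written it would
# bind None to train_data, because set_index(..., inplace=True) returns None;
# a chained version has to call set_index('customer_ID') without inplace.

# train_data =  (train
#             .groupby('customer_ID')
#             .tail(1)
#             .drop(['S_2'], axis=1)
#             .set_index('customer_ID', inplace=True)
# #             .sort_index()
#             )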

In [None]:
# Collapse the statement history to one row per customer: take the final row
# of each customer_ID group, discard the S_2 column and key the frame by
# customer_ID.
# NOTE(review): "final row" is the last row in the frame's current order; this
# assumes rows are already ordered by statement date within each customer —
# TODO confirm.
train = (
    train.groupby('customer_ID')
    .tail(1)
    .drop(columns=['S_2'])
    .set_index('customer_ID')
)
            

reference: https://www.kaggle.com/competitions/amex-default-prediction/discussion/327094

In [None]:
# Run a garbage-collection pass now that `train` has been rebound to the
# reduced frame; the return value is discarded.
_ = gc.collect()

In [None]:
# Dimensions after keeping a single row per customer and dropping S_2.
train.shape

In [None]:
# Every column currently in the training frame, in frame order.
total_cols = train.columns.to_list()

# Columns treated as categorical; they are ordinal-encoded further down.
cat_features = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

# All remaining columns except the label, keeping the frame's column order.
_not_numeric = set(cat_features) | {"target"}
num_features = [col for col in total_cols if col not in _not_numeric]

In [None]:
# Feature matrix (categorical columns first, then the remaining features) and
# label vector.
# The explicit .copy() makes `x` an independent frame, so overwriting its
# categorical columns with encoded values later does not raise pandas'
# SettingWithCopyWarning about writing to a slice of `train`.
x = train[cat_features + num_features].copy()
y = train['target']

x.shape, y.shape

#### Apply OrdinalEncoder

In [None]:
%%time

# Fit an ordinal encoder on the categorical columns and overwrite them with
# their encoded values; the fitted encoder is reused for the test set later.
enc = OrdinalEncoder()
x[cat_features] = enc.fit_transform(x[cat_features])
# gc.collect has to be *called*: a bare `gc.collect` only references the
# function object and no collection runs.
_ = gc.collect()

## **Competition Metric**

In [None]:
def amex_metrix(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of two rank-based components.

    The result is ``0.5 * (G + D)``, where ``G`` is the normalized weighted
    Gini coefficient and ``D`` is the fraction of all ``target == 1`` rows
    that fall inside the top 4% of total weight when rows are ranked by
    descending prediction. Rows with ``target == 0`` carry weight 20, every
    other row weight 1.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column. The two frames are joined
            column-wise, so they are matched on their index.

    Returns:
        The combined score.
    """

    def _row_weight(label) -> int:
        # Rows labelled 0 are weighted 20x relative to all other rows.
        return 20 if label == 0 else 1

    def _rank(labels: pd.DataFrame, scores: pd.DataFrame):
        """Join labels and scores, order by descending score.

        Returns the ordered ``target`` series and the matching weight series.
        """
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        target = ranked['target']
        return target, target.apply(_row_weight)

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        target, weight = _rank(y_true, y_pred)
        cutoff = int(0.04 * weight.sum())
        # Rows whose running weight total is still within the 4% budget.
        inside = weight.cumsum() <= cutoff
        is_positive = target == 1
        return is_positive[inside].sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        target, weight = _rank(y_true, y_pred)
        weighted_pos = target * weight
        # Cumulative share of total weight seen so far (the "random" baseline).
        random = (weight / weight.sum()).cumsum()
        # Cumulative share of weighted positives found so far.
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # A perfect ranking is obtained by using the labels themselves as scores.
        perfect = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

## **Model Training**

In [None]:
# Stratified 70/30 hold-out split. Seeding it with the shared CONFIG seed makes
# the split, and therefore the validation score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=CONFIG.random_state
)

In [None]:
# Shapes of the training/hold-out feature matrices and label vectors.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Constructor arguments gathered in one mapping and unpacked into the model.
lgbm_params = {
    # Upper bound on boosting rounds; the early-stopping callback passed at
    # fit time is what normally ends training.
    'n_estimators': 50000,
    # Requests LightGBM's GPU device.
    'device': 'gpu',
    'random_state': CONFIG.random_state,
    # Enables LightGBM's extra_trees option.
    'extra_trees': True,
}
model = LGBMClassifier(**lgbm_params)

In [None]:
%%time
# Train with the hold-out split as the evaluation set. early_stopping(50) is
# LightGBM's early-stopping callback with a 50-round setting.
# log_evaluation(0) presumably disables per-iteration logging — confirm
# against the LightGBM docs.
# NOTE(review): the same hold-out split is scored with amex_metrix later, so
# that score comes from data that also drove early stopping.
model.fit(
    X_train, y_train, 
    eval_set=[(X_test,y_test)],
    callbacks=[early_stopping(50), log_evaluation(0)]
)

In [None]:
# Scaffold for the predictions: a deep copy of the hold-out labels wrapped in
# a frame whose 'target' column is renamed to 'prediction'. The values are
# placeholders that get overwritten once the model has predicted.
y_pred = pd.DataFrame(y_test.copy(deep=True)).rename(
    columns={'target': 'prediction'}
)
y_pred

In [None]:
%%time
# Overwrite the placeholder values with column 1 of predict_proba
# (presumably the probability of class 1 — confirm against model.classes_).
y_pred["prediction"] = model.predict_proba(X_test)[:,1]

In [None]:
# Display the hold-out predictions.
y_pred

In [None]:
# amex_metrix reads y_true as a DataFrame with a 'target' column, so wrap the
# hold-out labels in a frame.
y_test = pd.DataFrame(y_test)

In [None]:
%%time
# Score the hold-out predictions with the metric defined in amex_metrix.
amex_metrix(y_test, y_pred)

## **Submission**

In [None]:
# Drop the training-phase objects and run a garbage-collection pass before the
# test set is loaded.
del train, x, y, X_test, X_train, y_train, y_test, y_pred
_ = gc.collect()

In [None]:
%%time
# Read the test table from the Feather file under CONFIG.path and preview its
# first rows.
test = pd.read_feather(CONFIG.path+'test_data.ftr')
test.head()

In [None]:
# Same reduction as for the training data: take the final row of each
# customer_ID group, discard the S_2 column and key the frame by customer_ID.
# NOTE(review): "final row" is the last row in the frame's current order; this
# assumes rows are already ordered by statement date within each customer —
# TODO confirm.
test = (
    test.groupby('customer_ID')
    .tail(1)
    .drop(columns=['S_2'])
    .set_index('customer_ID')
)

In [None]:
# Run a garbage-collection pass now that `test` has been rebound to the
# reduced frame; the return value is discarded.
_ = gc.collect()

In [None]:
%%time
# Encode the test categorical columns with the encoder fitted on the training
# data (transform only, no refit).
# NOTE(review): OrdinalEncoder() was built with default settings; confirm how
# it treats categories that were not seen during fit.
test[cat_features] = enc.transform(test[cat_features])
_ = gc.collect()

In [None]:
# Predict on the test rows with the columns in the same order used to build
# the training matrix (cat_features + num_features), keep column 1 of
# predict_proba, and preview the result.
test["prediction"] = model.predict_proba(test[cat_features + num_features])[:,1]
test.head()

In [None]:
# Write the prediction column to submission.csv. index=True also writes the
# frame's index (customer_ID, set as the index above) as the first column.
test["prediction"].to_csv("submission.csv", index=True)
