In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

import optuna

from sklearn.model_selection import train_test_split
import sklearn.metrics

from xgboost import XGBClassifier

import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

import gc
import warnings
warnings.filterwarnings('ignore')

In [None]:
def read_train_file(path = '', usecols = None):
    """Read a parquet file with cudf and normalise its key columns.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list or None
        Columns to read; when None the ``columns`` argument is not passed
        to ``cudf.read_parquet`` at all.

    Returns
    -------
    cudf.DataFrame
        Frame in which ``customer_ID`` holds the last 16 hex characters of
        the original id converted to int64, ``S_2`` is a datetime column,
        and missing values are replaced by 0.
    """
    # LOAD DATAFRAME (only forward `columns` when a subset was requested)
    read_kwargs = {} if usecols is None else {'columns': usecols}
    frame = cudf.read_parquet(path, **read_kwargs)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    hex_tail = frame['customer_ID'].str[-16:]
    frame['customer_ID'] = hex_tail.str.hex_to_int().astype('int64')
    frame['S_2'] = cudf.to_datetime(frame['S_2'])

    # FILL NAN
    frame = frame.fillna(0)
    print('shape of data:', frame.shape)

    return frame

# Path of the training parquet file inside the Kaggle input directory.
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'

print('Reading train data...')
train = read_train_file(path = TRAIN_PATH)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
def process_and_feature_engineer(df):
    """Aggregate the input frame into one row of features per customer_ID.

    Every column except ``customer_ID`` and ``S_2`` is aggregated per
    customer: the listed categorical columns with count / last / nunique,
    all remaining columns with mean / std / min / max / last.

    Parameters
    ----------
    df : cudf.DataFrame
        Frame containing ``customer_ID``, ``S_2`` and the feature columns.

    Returns
    -------
    cudf.DataFrame
        Frame indexed by ``customer_ID`` whose columns are named
        ``<column>_<aggregation>``.
    """
    # FEATURE ENGINEERING FROM
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
    categorical = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    key_cols = ('customer_ID', 'S_2')
    numeric = [
        name for name in df.columns
        if name not in key_cols and name not in categorical
    ]

    def _flatten(agg_frame):
        # Collapse the (column, aggregation) MultiIndex into "column_aggregation".
        agg_frame.columns = ['_'.join(pair) for pair in agg_frame.columns]
        return agg_frame

    numeric_agg = _flatten(
        df.groupby("customer_ID")[numeric].agg(['mean', 'std', 'min', 'max', 'last'])
    )
    categorical_agg = _flatten(
        df.groupby("customer_ID")[categorical].agg(['count', 'last', 'nunique'])
    )

    features = cudf.concat([numeric_agg, categorical_agg], axis=1)
    # Drop the intermediate frames as soon as they have been combined.
    del numeric_agg, categorical_agg
    print('shape after engineering', features.shape )

    return features

# Replace the loaded frame with its per-customer aggregate features.
# NOTE: `train` is rebound here, so re-running this cell without re-running
# the load cell would operate on the already aggregated frame.
train = process_and_feature_engineer(train)

In [None]:
# Preview the aggregated feature frame.
train.head()

In [None]:
# ADD TARGETS
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
# Apply the same id reduction as read_train_file (last 16 hex chars -> int64)
# so the label index matches the customer_ID index of `train`.
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')
# Index-on-index left join: every feature row is kept and gains a `target` column.
train = train.merge(targets, left_index=True, right_index=True, how='left')
train.target = train.target.astype('int8')
del targets

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
# reset_index() also turns the customer_ID index into the first column.
train = train.sort_index().reset_index()

# FEATURES
# Skip the first column (customer_ID) and the last column (target).
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

In [None]:
# Convert the cudf frame to a pandas DataFrame for the scikit-learn / XGBoost
# steps below, then drop the cudf copy and trigger a garbage collection.
train_pd = train.to_pandas()
del train
_ = gc.collect()

In [None]:
# Hold out 25% of the rows, stratified on `target` so both parts keep the same
# class balance. random_state pins the shuffle: without it the hold-out set
# (and therefore every score computed on it) changes on each Restart & Run All.
train_df, test_df = train_test_split(
    train_pd,
    test_size=0.25,
    stratify=train_pd['target'],
    random_state=99,
)
del train_pd
_ = gc.collect()

In [None]:
# Row counts of the training part and the hold-out part.
len(train_df),len(test_df)

In [None]:
# Feature matrices: both splits without the identifier and the label column.
X_train, X_test = (
    part.drop(columns=['customer_ID', 'target'])
    for part in (train_df, test_df)
)

In [None]:
# Display the training feature matrix.
X_train

In [None]:
# Label vectors for the two splits.
y_train, y_test = train_df['target'], test_df['target']

In [None]:
# Display the training labels.
y_train

In [None]:
# X_*/y_* have been derived, so drop the split frames' names and collect garbage.
del train_df, test_df
_ = gc.collect()

In [None]:
# optuna

def objective(trial):
    """Optuna objective: fit one XGBoost classifier and score it on the hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy of the hard class predictions on ``X_test`` / ``y_test``.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The parameter names and the order in which they are sampled
    are kept as they were, so ``study.best_trial.params`` has the same keys.
    """
    param = {
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic',
        # suggest_loguniform is deprecated; suggest_float(..., log=True)
        # samples the same range on a log scale.
        'lambda': trial.suggest_float('lambda', 0.01, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 5, 20.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9, step=0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1, step=0.1),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.001),
        # `step` is passed by keyword; passing it positionally is deprecated.
        'n_estimators': trial.suggest_int('n_estimators', 800, 1200, step=20),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'random_state': 99,
        'min_child_weight': trial.suggest_int('min_child_weight', 64, 256),
    }

    model = XGBClassifier(**param, enable_categorical=True)
    model.fit(X_train, y_train)

    # accuracy_score accepts the label Series and the prediction array
    # directly; no DataFrame wrapping or index reset is needed.
    preds = model.predict(X_test)
    return sklearn.metrics.accuracy_score(y_test, preds)

In [None]:
%%time
# TPE is already the default sampler; creating it explicitly with a seed makes
# the sequence of suggested hyperparameters repeatable across kernel restarts
# (as far as the objective values themselves are deterministic).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=99),
)
study.optimize(objective, n_trials= 200)

In [None]:
# best_trial.params only holds the values sampled through trial.suggest_*.
# The fixed settings used inside objective() must be added back, otherwise the
# final model is built with a different seed/objective than the scored trial.
best_params = dict(study.best_trial.params)  # work on a copy, not the trial's own dict
best_params['tree_method'] = 'gpu_hist'
best_params['booster'] = 'gbtree'
best_params['objective'] = 'binary:logistic'
best_params['random_state'] = 99
print(best_params)

In [None]:
# Build the final classifier from the best trial's hyperparameters.
final_model = XGBClassifier(**best_params,enable_categorical = True)

In [None]:
# Fit the final model on the training part only; the hold-out rows are not used here.
final_model.fit(X_train,y_train)

In [None]:
# The model is fitted, so drop the feature/label objects and collect garbage.
del X_train,X_test,y_train,y_test
_ = gc.collect()

In [None]:
# Persist the fitted model to disk with joblib.
# NOTE(review): joblib.dump writes a pickle, not an HDF5 file, despite the
# ".h5" suffix; it has to be read back with joblib.load.
# NOTE(review): this import sits outside the notebook's top import cell.
import joblib
joblib.dump(final_model, "xgb_classifier_v1.h5")

In [None]:
# def read_test_file(path = '', usecols = None):
#     # LOAD DATAFRAME
#     if usecols is not None: df = cudf.read_parquet(path, columns=usecols)
#     else: df = cudf.read_parquet(path)
#     # REDUCE DTYPE FOR CUSTOMER AND DATE
#     #df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
#     df.S_2 = cudf.to_datetime( df.S_2 )
#     # SORT BY CUSTOMER AND DATE (so agg('last') works correctly)
#     #df = df.sort_values(['customer_ID','S_2'])
#     #df = df.reset_index(drop=True)
#     # FILL NAN
#     df = df.fillna(0) 
#     print('shape of data:', df.shape)
    
#     return df

# print('Reading test data...')
# TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
# test = read_test_file(path = TEST_PATH)

In [None]:
# test.head()

In [None]:
# test = process_and_feature_engineer(test)

In [None]:
# test['prediction'] = final_model.predict_proba(test)[:,1]

In [None]:
# final = pd.DataFrame(test['prediction'].to_pandas())

In [None]:
# final.to_csv("submission.csv", index=True)