Refer to this notebook for the modeling part: https://www.kaggle.com/awaldeep/xgboost-optuna-baseline

In [None]:
# Import libraries
import os
import warnings

import numpy as np
import pandas as pd

import gc  # Garbage collector


# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')

In [None]:
# GPU libraries
import cupy, cudf 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from xgboost import XGBClassifier

In [None]:
# Restore the previously trained classifier from the notebook's input data.
# The artifact carries an .h5 suffix but is read back with joblib.
import joblib

# Alternative artifact that was used earlier:
#   ../input/01-starter-xgboost-implementation/xgb_classifier_v1.h5
final_model = joblib.load(
    "../input/xgboost-optuna-baseline/xgb_classifier_v1.h5"
)

In [None]:
# Inspect the hyperparameters stored on the loaded model; as the last
# expression of the cell, its value is what the notebook displays.
final_model.get_xgb_params()

In [None]:
def process_and_feature_engineer(df):
    """Collapse per-statement rows into one feature row per customer.

    Feature engineering adapted from
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    Every column except ``customer_ID`` and ``S_2`` is aggregated per
    ``customer_ID``. The categorical columns listed below get
    count / last / nunique; all remaining columns get
    mean / std / min / max / last. The two-level column labels produced by
    the aggregation are flattened to ``<column>_<statistic>``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``customer_ID`` and every categorical column listed
        below, plus the columns to aggregate numerically.

    Returns
    -------
    DataFrame
        The numeric aggregates followed by the categorical aggregates,
        joined column-wise, one row per ``customer_ID`` group.
    """
    categorical = [
        "B_30", "B_38", "D_114", "D_116", "D_117", "D_120",
        "D_126", "D_63", "D_64", "D_66", "D_68",
    ]
    non_features = ['customer_ID', 'S_2']
    # Anything that is neither an identifier/date nor categorical is numeric.
    numeric = [
        name for name in df.columns
        if name not in non_features and name not in categorical
    ]

    def summarise(columns, statistics):
        # Aggregate the chosen columns per customer and flatten the labels.
        summary = df.groupby("customer_ID")[columns].agg(statistics)
        summary.columns = ['_'.join(label) for label in summary.columns]
        return summary

    numeric_summary = summarise(numeric, ['mean', 'std', 'min', 'max', 'last'])
    categorical_summary = summarise(categorical, ['count', 'last', 'nunique'])

    # Numeric block first, then categorical: the resulting column order is
    # part of the output and must not change.
    engineered = cudf.concat([numeric_summary, categorical_summary], axis=1)
    # Drop the intermediate frames as soon as they have been combined.
    del numeric_summary, categorical_summary
    print('shape after engineering', engineered.shape )

    return engineered

In [None]:
def read_test_file(path = '', usecols = None):
    """Load a parquet file with cudf and apply light preprocessing.

    Parameters
    ----------
    path : str
        Location of the parquet file, passed to ``cudf.read_parquet``.
    usecols : list of str, optional
        Subset of columns to read. ``None`` (the default) reads every column.

    Returns
    -------
    cudf.DataFrame
        The loaded frame with ``S_2`` converted to datetimes (when that
        column was read) and missing values replaced by 0.
    """
    # columns=None makes cudf read the whole file, so a single call covers
    # both the "all columns" and the "subset" case.
    df = cudf.read_parquet(path, columns=usecols)

    # Only parse the statement date when it was actually loaded; a `usecols`
    # selection that leaves S_2 out would otherwise fail on the column lookup.
    if 'S_2' in df.columns:
        df['S_2'] = cudf.to_datetime(df['S_2'])

    # NOTE: customer_ID is kept as stored and the rows are NOT re-sorted by
    # customer_ID / S_2 here, so any order-dependent aggregation applied later
    # (e.g. 'last') relies on the row order of the parquet file itself.

    # FILL NAN
    df = df.fillna(0)
    print('shape of data:', df.shape)

    return df

# Location of the parquet export of the test set.
TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'

# Announce the load, then read the full test set (no column subset).
print('Reading test data...')
test = read_test_file(TEST_PATH)

In [None]:
# Replace the raw rows with the per-customer aggregate table returned by
# process_and_feature_engineer; the raw frame is no longer referenced afterwards.
test = process_and_feature_engineer(test)

In [None]:
# Score every row of the engineered table and keep the second probability
# column (presumably the positive class - confirm against the model's classes_).
class_probabilities = final_model.predict_proba(test)
test['prediction'] = class_probabilities[:, 1]

In [None]:
# Copy the prediction column to host memory as a one-column pandas frame;
# the index of `test` is carried over unchanged.
final = test['prediction'].to_pandas().to_frame()

In [None]:
# Write the submission file; index=True emits the frame's index as the first
# CSV column (presumably customer_ID, inherited from the groupby - confirm).
final.to_csv(path_or_buf="submission.csv", index=True)