In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cupy, cudf
import gc,os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from xgboost import XGBClassifier

In [None]:
# Load the statement-level training data from parquet into a cudf DataFrame.
train_data = cudf.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train.parquet")
# Preview the first rows.
train_data.head()

In [None]:
# (rows, columns) of the raw training frame, before any aggregation.
train_data.shape

In [None]:
def process_and_feature_engineer(df):
    """Aggregate statement-level rows into one feature row per customer.

    The aggregation scheme follows
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    Args:
        df: cudf DataFrame holding a ``customer_ID`` column, an ``S_2`` column
            that ``cudf.to_datetime`` can parse, and the eleven categorical
            columns listed in ``cat_features``. The frame is modified in
            place: ``S_2`` is converted to datetime and ``S_2_month``,
            ``S_2_year`` and ``S_2_day`` are appended.

    Returns:
        A new DataFrame produced by grouping on ``customer_ID``. Each
        non-categorical column yields ``<col>_mean``, ``<col>_std``,
        ``<col>_min``, ``<col>_max`` and ``<col>_last``; each categorical
        column yields ``<col>_count``, ``<col>_last`` and ``<col>_nunique``.
    """
    df["S_2"] = cudf.to_datetime(df["S_2"])
    # Calendar parts of the statement date are treated as numeric features.
    for part in ("month", "year", "day"):
        df[f"S_2_{part}"] = getattr(df["S_2"].dt, part)

    cat_features = [
        "B_30", "B_38", "D_114", "D_116", "D_117", "D_120",
        "D_126", "D_63", "D_64", "D_66", "D_68",
    ]
    non_feature_cols = ("customer_ID", "S_2")
    # Everything that is neither an identifier/date nor categorical is numeric.
    num_features = [
        name for name in list(df.columns)
        if name not in non_feature_cols and name not in cat_features
    ]

    def _aggregate(columns, stats):
        # Group per customer, then flatten the (column, stat) header
        # into single "column_stat" names.
        summary = df.groupby("customer_ID")[columns].agg(stats)
        summary.columns = ["_".join(pair) for pair in summary.columns]
        return summary

    numeric_summary = _aggregate(num_features, ["mean", "std", "min", "max", "last"])
    categorical_summary = _aggregate(cat_features, ["count", "last", "nunique"])

    # Place both summaries side by side, then drop the intermediate references.
    df = cudf.concat([numeric_summary, categorical_summary], axis=1)
    del numeric_summary, categorical_summary
    print('shape after engineering', df.shape )

    return df

In [None]:
# Replace the raw statement-level frame with its per-customer aggregates.
train_data = process_and_feature_engineer(train_data)

In [None]:
# Load the training labels with cudf.
targets = cudf.read_csv("../input/amex-default-prediction/train_labels.csv")
# Preview the first rows.
targets.head()

In [None]:
# (rows, columns) of the label frame.
targets.shape

In [None]:
# Replace every missing value in the aggregated features with the sentinel -999 (in place).
train_data.fillna(-999,inplace=True)

In [None]:
# Turn the customer_ID group index into an ordinary column (in place).
train_data.reset_index(inplace=True)

In [None]:
# Attach the label to each customer's feature row by matching on the
# customer_ID key. Joining on the key (rather than on the positional index)
# keeps features and labels aligned regardless of the row order of either
# frame. `on` must not be combined with left_index/right_index.
n_customers = len(train_data)
train_data = train_data.merge(targets, on="customer_ID", how="inner")

# Every aggregated customer is expected to receive exactly one label; a
# different row count means unmatched or duplicated IDs.
assert len(train_data) == n_customers, (
    f"label merge changed the row count: {n_customers} -> {len(train_data)}"
)

# customer_ID is an identifier, not a model input.
train_data = train_data.drop(columns=["customer_ID"])

In [None]:
# Preview the merged training frame.
train_data.head()

In [None]:
def amex_metric_mod(y_true, y_pred):
    """Score predictions as the mean of a Gini ratio and a top-4% capture rate.

    Rows whose true label is 0 carry weight 20 and all other rows weight 1.

    Args:
        y_true: 1-D sequence of true labels.
        y_pred: 1-D sequence of predicted scores, same length as ``y_true``.

    Returns:
        ``0.5 * (G + D)``. ``D`` is the share of all positive labels found
        among the highest-scored rows whose cumulative weight stays within 4%
        of the total weight. ``G`` is the weighted Gini obtained when ranking
        by ``y_pred`` divided by the one obtained when ranking by ``y_true``.
    """

    def _ranked_by(column):
        # Pair (label, score) per row and order rows by `column`, descending.
        pairs = np.transpose(np.array([y_true, y_pred]))
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weights_for(ranked):
        return np.where(ranked[:, 0] == 0, 20, 1)

    def _weighted_gini(column):
        ranked = _ranked_by(column)
        row_weight = _weights_for(ranked)
        # Cumulative share of total weight seen so far (the "random" curve).
        random_curve = np.cumsum(row_weight / np.sum(row_weight))
        positives_total = np.sum(ranked[:, 0] * row_weight)
        positives_seen = np.cumsum(ranked[:, 0] * row_weight)
        lorentz_curve = positives_seen / positives_total
        return np.sum((lorentz_curve - random_curve) * row_weight)

    # Capture rate inside the top 4% of total weight, ranked by prediction.
    by_score = _ranked_by(1)
    score_weights = _weights_for(by_score)
    inside_cutoff = np.cumsum(score_weights) <= int(0.04 * np.sum(score_weights))
    captured = by_score[inside_cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    # Gini under the model's ranking, relative to the ranking by true label.
    gini_by_score = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)

    return 0.5 * (gini_by_score / gini_by_label + top_four)

In [None]:
# Every column except the label is used as a model input.
useful_features = [feature for feature in train_data.columns if feature != "target"]

In [None]:
# 5-fold shuffled splitter; the fixed random_state makes the fold assignment repeatable.
kfold = KFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
# Hyper-parameters forwarded to every per-fold XGBClassifier via **XGBOOST_PARAMS.
XGBOOST_PARAMS = dict(
    learning_rate=0.0170074900458309,
    reg_lambda=5.9599291346341776e-05,
    reg_alpha=0.015370240971015697,
    subsample=0.7027704916274289,
    colsample_bytree=0.5705441270528481,
    max_depth=4,
    n_estimators=6542,
)

In [None]:
# Convert the engineered cudf frame to pandas; the positional .iloc fold
# selection below operates on that pandas frame.
train_data = train_data.to_pandas()

for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_data, train_data.target)):

    # Slice each side of the split once, then separate inputs from the label.
    train_rows = train_data.iloc[train_idx]
    valid_rows = train_data.iloc[valid_idx]
    X_train, y_train = train_rows[useful_features], train_rows["target"]
    X_valid, y_valid = valid_rows[useful_features], valid_rows["target"]
    del train_rows, valid_rows

    # Settings fixed for every fold, except the seed, which follows the fold number.
    fold_settings = dict(
        random_state=fold,
        objective='binary:logistic',
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        n_jobs=-1,
    )
    model = XGBClassifier(**fold_settings, **XGBOOST_PARAMS)

    # The validation fold is passed as eval_set; no early-stopping argument
    # is given and per-round logging is silenced with verbose=0.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=0,
    )

    # Probability of the positive class for the held-out rows.
    preds_valid = model.predict_proba(X_valid)[:, 1]
    fold_score = amex_metric_mod(y_valid, preds_valid)
    print(f"the kaggle metric score after {fold} fold is : {fold_score}")

    # Persist the fold model; the inference cell loads it back by this name.
    model.save_model(f'XGB_fold{fold}.xgb')
    print(f"Model saved for {fold} fold")

    # Release the fold's data and model before the next iteration.
    del X_train, X_valid, y_train, y_valid, model
    _ =  gc.collect()

In [None]:
# Drop the reference to the training frame; it is not used after this point.
del train_data

In [None]:
# Select device 0, close the CUDA context through numba, then select device 0
# again. The three calls must stay in this order.
# NOTE(review): presumably done to release GPU memory held after training,
# before the test set is loaded — confirm that cudf and XGBoost remain usable
# after the context has been closed and re-selected.
from numba import cuda
cuda.select_device(0)
cuda.close()
cuda.select_device(0)

In [None]:
def load_preprocess_test():
    """Read the test parquet and return its model-ready feature frame.

    The raw frame is run through ``process_and_feature_engineer`` and every
    missing value in the result is replaced with the sentinel ``-999``.

    Returns:
        The aggregated cudf DataFrame with missing values filled.
    """
    test_path = "../input/amex-data-integer-dtypes-parquet-format/test.parquet"

    features = process_and_feature_engineer(cudf.read_parquet(test_path))
    features.fillna(-999, inplace=True)

    return features

In [None]:
# Build the aggregated feature frame for the test set.
test_data = load_preprocess_test()

In [None]:
# Turn the customer_ID group index into an ordinary column (in place).
test_data.reset_index(inplace=True)
# Preview the first rows.
test_data.head()

In [None]:
# Keep the customer IDs aside; they are written to the submission file later.
customer_id = test_data["customer_ID"]

In [None]:
# Remove the identifier column from the frame that is passed to the models (in place).
test_data.drop("customer_ID",axis=1,inplace=True)

In [None]:
# Preview the test frame after the identifier column was removed.
test_data.head()

In [None]:
# One array of positive-class probabilities per saved fold model.
final_predictions = []

for fold in range(5):

    # Build a classifier shell and load the saved fold model into it.
    model = XGBClassifier(
        random_state=fold,
        objective='binary:logistic',
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        n_jobs=-1,
    )
    model.load_model(f'XGB_fold{fold}.xgb')

    # Column 1 of predict_proba is the probability of the positive class.
    test_preds = model.predict_proba(test_data)[:, 1]
    final_predictions.append(test_preds)

    # The model is no longer needed once its predictions are stored.
    del model

    print(f"{fold} fold is completed and prediction is appended into final prediction list")

In [None]:
# Display the list of per-fold prediction arrays.
final_predictions

In [None]:
# Stack the per-fold arrays as columns and average across them: one blended value per test row.
target = np.mean(np.column_stack(final_predictions), axis=1)

In [None]:
# Display the blended predictions.
target

In [None]:
# Pair each customer ID (converted from cudf to pandas) with its blended prediction.
output_data = pd.DataFrame({"customer_ID":customer_id.to_pandas(),"prediction":target})
# Display the submission frame.
output_data

In [None]:
# Write the submission file without the index column.
output_data.to_csv("third_submission.csv",index=False)