# Acknowledgement

* XGBoost Starter - [0.793] [link](https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793)

# Load Libraries

In [None]:
# LOAD LIBRARIES
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

import sys
import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from tqdm.auto import tqdm
from functools import partial


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.optim.optimizer import Optimizer
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau


from torch.cuda.amp import autocast, GradScaler

import warnings
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings('ignore')

# FILL NAN VALUE
# Sentinel that read_file() writes in place of missing values.
NAN_VALUE = -127 # will fit in int8

print('RAPIDS version', cudf.__version__)

# Process and Feature Engineer Train Data

In [None]:
%%time
def read_file(path = '', usecols = None):
    """Load a parquet file into a cudf DataFrame and normalise it.

    Args:
        path: Location of the parquet file to read.
        usecols: Optional list of column names to load; ``None`` loads
            every column.

    Returns:
        The loaded frame with ``customer_ID`` reduced to int64, ``S_2``
        parsed as datetime and missing values replaced by the module-level
        ``NAN_VALUE``.
    """
    # LOAD DATAFRAME
    # columns=None is read_parquet's default, so one call covers both cases.
    df = cudf.read_parquet(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # Only the last 16 hex characters of the ID are kept and stored as int64.
    id_suffix = df['customer_ID'].str[-16:]
    df['customer_ID'] = id_suffix.str.hex_to_int().astype('int64')
    df.S_2 = cudf.to_datetime(df.S_2)

    # NOTE: the rows are NOT re-sorted here (the sort below is disabled), so
    # any later agg('last') depends on the row order stored in the file --
    # presumably already sorted by customer and date; confirm for new inputs.
    #df = df.sort_values(['customer_ID','S_2'])
    #df = df.reset_index(drop=True)

    # FILL NAN
    df = df.fillna(NAN_VALUE)
    print('shape of data:', df.shape)

    return df

# Location of the raw train parquet file.
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'

print('Reading train data...')
train = read_file(TRAIN_PATH)

In [None]:
# Preview the first rows of the raw train frame returned by read_file().
train.head()

In [None]:
# (rows, columns) of the raw train frame, before any aggregation.
train.shape

In [None]:
# Column names of the raw train frame.
train.columns

In [None]:
%%time
def process_and_feature_engineer(df):
    """Collapse the per-row data into one row of aggregates per customer.

    Args:
        df: cudf DataFrame holding ``customer_ID``, ``S_2``, the eleven
            categorical columns listed below and any number of other
            columns, which are treated as numeric.

    Returns:
        A cudf DataFrame indexed by ``customer_ID`` whose columns are named
        ``<feature>_<statistic>``: mean/std/min/max/last for numeric
        features, followed by count/last/nunique for categorical ones.
    """
    # FEATURE ENGINEERING FROM
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    key_cols = ['customer_ID', 'S_2']
    # Everything that is neither a key column nor categorical is numeric.
    num_features = [
        name for name in df.columns
        if name not in key_cols and name not in cat_features
    ]

    def _aggregate(columns, stats):
        # Group by customer, apply every statistic to every column, then
        # flatten the (feature, statistic) column pairs into single names.
        agg = df.groupby("customer_ID")[columns].agg(stats)
        agg.columns = ['_'.join(pair) for pair in agg.columns]
        return agg

    numeric_agg = _aggregate(num_features, ['mean', 'std', 'min', 'max', 'last'])
    categorical_agg = _aggregate(cat_features, ['count', 'last', 'nunique'])

    engineered = cudf.concat([numeric_agg, categorical_agg], axis=1)
    # Drop the intermediate frames as soon as they have been combined.
    del numeric_agg, categorical_agg
    print('shape after engineering', engineered.shape)

    return engineered

# Replace the raw rows with one row of aggregated features per customer.
train = process_and_feature_engineer(train)

# Clean Ram
# NAN_VALUE is deliberately kept: read_file() looks it up at call time, so
# deleting it would make every later read_file() call raise NameError, and
# removing a single int frees no meaningful memory anyway.
del TRAIN_PATH
_ = gc.collect()

In [None]:
# (rows, columns) after aggregation: one row per customer_ID.
train.shape

In [None]:
# Load the training labels onto the GPU.
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
# Reduce customer_ID the same way read_file() does (last 16 hex chars as
# int64) so the labels can be matched against the engineered train frame.
targets['customer_ID'] = (
    targets['customer_ID']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)

In [None]:
# Preview the first rows of the labels after the customer_ID conversion.
targets.head()

In [None]:
# ADD TARGETS
# The labels are re-read here, so this cell does not depend on an earlier
# cell having left `targets` in memory.
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = (
    targets['customer_ID']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)
# Left-join on the customer_ID index so every engineered row is kept.
train = train.merge(
    targets.set_index('customer_ID'),
    how='left',
    left_index=True,
    right_index=True,
)
del targets
# Turn customer_ID back into a regular column, then
# convert train to CPU DataFrame
train = train.reset_index().to_pandas()

# Impute missing values

In [None]:
# "_last" aggregates of the raw categorical columns; they count as
# categorical even though they share the suffix with numeric features.
_CATEGORICAL_LAST = [
    "B_30_last", "B_38_last", "D_114_last", "D_116_last", "D_117_last",
    "D_120_last", "D_126_last", "D_63_last", "D_64_last", "D_66_last",
    "D_68_last",
]

# Categorical: every count/nunique aggregate plus the "_last" columns above.
CAT_FEATURES = [
    col for col in train.columns
    if col in _CATEGORICAL_LAST
    or col.split("_")[-1] in ['count', 'nunique']
]

# Numeric: every mean/std/min/max/last aggregate not listed as categorical.
NUM_FEATURES = [
    col for col in train.columns
    if col not in _CATEGORICAL_LAST
    and col.split("_")[-1] in ['mean', 'std', 'min', 'max', 'last']
]

In [None]:
# Scan the frame for missing values once and reuse the per-column result,
# instead of repeating the full scan and copying two wide sub-frames just to
# read their column names.
null_mask = train.isnull().any()

# Restrict the mask to each feature group (keeping that group's column
# order) and keep only the names of the columns that contain a NaN.
num_null_mask = null_mask[NUM_FEATURES]
cat_null_mask = null_mask[CAT_FEATURES]
num_na_columns = num_null_mask.index[num_null_mask.to_numpy()]
cat_na_columns = cat_null_mask.index[cat_null_mask.to_numpy()]

# Number of numeric / categorical columns that still need imputation.
print(len(num_na_columns))
print(len(cat_na_columns))

In [None]:
%%time
# Fill num_na_cols using KNNImputer
# The imputer is fitted on, and applied to, only the numeric columns that
# contain NaNs; with weights='uniform' the 5 neighbours count equally.
knn_imputer = KNNImputer(weights='uniform', n_neighbors=5).fit(train[num_na_columns])
train_num_na_cols = knn_imputer.transform(train[num_na_columns])
# Write the imputed values back over the original columns.
train[num_na_columns] = train_num_na_cols

In [None]:
# Largest per-column NaN count left in train; 0 means nothing is missing.
train.isna().sum().max()

In [None]:
# Model inputs are every column except the first and the last one
# (presumably customer_ID and the merged label column -- confirm).
FEATURES = train.columns[1:-1]

# Persist the imputed frame, without the pandas index, for later steps.
train.to_parquet("./train.parquet", index=False)

print(f'There are {len(FEATURES)} features!')