In [1]:
import pandas as pd, numpy as np
import cupy, cudf
import matplotlib.pyplot as plt, gc, os

print('RAPIDS version', cudf.__version__)

RAPIDS version 21.10.01


In [2]:
VER = 1

SEED = 17

NAN_VALUE = -127

FOLDS = 5

In [3]:
def read_file(path = '', usecols = None):
    
    if usecols is not None:
        df = cudf.read_parquet(path, columns = usecols)
    else:
        df = cudf.read_parquet(path)
    df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    df.S_2 = cudf.to_datetime(df.S_2)
    
    df = df.fillna(NAN_VALUE)
    print('shape of data:', df.shape)
    
    return df

print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 190)


In [4]:
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,-4532153018459703766,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,-127.0,0,0.00061,0
1,-4532153018459703766,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,-127.0,0,0.005492,0
2,-4532153018459703766,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,-127.0,0,0.006986,0
3,-4532153018459703766,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,-127.0,0,0.006527,0
4,-4532153018459703766,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,-127.0,0,0.008126,0


In [5]:
def process_and_feature_engineer(df):
    
    all_cols = [col for col in list(df.columns) if not (col == 'cutomer_ID' or col == 'S_2')]
    
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    num_features = [col for col in all_cols if col not in cat_features]
    
    test_num_agg = df.groupby('customer_ID')[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
    print('Num_Agg Columns:', test_num_agg.columns)
    
    test_cat_agg = df.groupby('customer_ID')[cat_features].agg(['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    print('Cat_Agg Columns:', test_num_agg.columns)

    
    df = cudf.concat([test_num_agg, test_cat_agg], axis = 1)
    
    del test_num_agg, test_cat_agg
    print('Shape after engineering:', df.shape)
    
    return df
train = process_and_feature_engineer(train)

Num_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_144_mean', 'D_144_std', 'D_144_min', 'D_144_max', 'D_144_last',
       'D_145_mean', 'D_145_std', 'D_145_min', 'D_145_max', 'D_145_last'],
      dtype='object', length=885)
Cat_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_144_mean', 'D_144_std', 'D_144_min', 'D_144_max', 'D_144_last',
       'D_145_mean', 'D_145_std', 'D_145_min', 'D_145_max', 'D_145_last'],
      dtype='object', length=885)
Shape after engineering: (458913, 918)


In [6]:
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')
train = train.merge(targets, left_index = True, right_index = True, how = 'left')
del targets
train = train.reset_index()
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

There are 918 features!


In [7]:
from sklearn.model_selection import KFold
import xgboost as xgb
print('XGB Version', xgb.__version__)

xgb_params = {
    'max_depth': 4,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'random_state': SEED
}

XGB Version 1.6.1


In [8]:
class IterLoadForDMatrix(xgb.core.DataIter):
    def __init__(self, df = None, features = None, target = None, batch_size = 256 * 1024):
        self.features = features
        self.target = target
        self.df = df
        self.it = 0
        self.batch_size = batch_size
        self.batches = int(np.ceil(len(df) / self.batch_size))
        super().__init__()
    
    def reset(self):
        self.it = 0
    
    def next(self, input_data):
        if self.it == self.batches:
            return 0
        
        a = self.it * self.batch_size
        b = min((self.it + 1) * self.batch_size, len(self.df))
        
        dt = cudf.DataFrame(self.df.iloc[a:b])
        
        input_data(data = dt[self.features], label = dt[self.target])
        
        self.it += 1
        return 1

In [9]:
def amex_metric_mod(y_true, y_pred):

    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)

In [10]:
importances = []
oof = np.zeros(len(train))
train = train.to_pandas()
TRAIN_SUBSAMPLE = 1.0
gc.collect()

skf = KFold(n_splits = FOLDS)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):
    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx, int(len(train_idx)*TRAIN_SUBSAMPLE), replace = False)
        np.random.seed(None)
    
    print('#' * 25)
    print('### Fold', fold + 1)
    print('### Train size', len(train_idx), 'Valid size', len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#' * 25)
    
    Xy_train = IterLoadForDMatrix(train.loc[train_idx], FEATURES, 'target')
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, 'target']
    
    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin = 256)
    dvalid = xgb.DMatrix(data = X_valid, label = y_valid)
    
    model = xgb.train(xgb_params,
                      dtrain = dtrain,
                      evals = [(dtrain, 'dtrain'), (dvalid, 'dvalid')],
                      num_boost_round = 9999,
                      early_stopping_rounds = 100,
                      verbose_eval = 100
                     )
    model.save_model(f'XGB_v{VER}_fold{fold}.xgb')
    
    dd = model.get_score(importance_type = 'weight')
    df = pd.DataFrame({'feature': dd.keys(), f'importance_{fold}': dd.values()})
    importances.append(df)
    
    oof_preds = model.predict(dvalid)
    acc = amex_metric_mod(y_valid.values, oof_preds)
    print('Kaggle Metric =', acc, '\n')
    
    oof[valid_idx] = oof_preds
    
    del dtrain, Xy_train, dd, df
    del X_valid, y_valid, dvalid, model
    _ = gc.collect()
    
print('#' * 25)
acc = amex_metric_mod(train.target.values, oof)
print('OVERALL CV Kaggle Metric =', acc)

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
[0]	dtrain-logloss:0.66203	dvalid-logloss:0.66213
[100]	dtrain-logloss:0.23650	dvalid-logloss:0.23997
[200]	dtrain-logloss:0.22231	dvalid-logloss:0.22796
[300]	dtrain-logloss:0.21624	dvalid-logloss:0.22392
[400]	dtrain-logloss:0.21226	dvalid-logloss:0.22194
[500]	dtrain-logloss:0.20908	dvalid-logloss:0.22082
[600]	dtrain-logloss:0.20638	dvalid-logloss:0.22003
[700]	dtrain-logloss:0.20386	dvalid-logloss:0.21946
[800]	dtrain-logloss:0.20155	dvalid-logloss:0.21910
[900]	dtrain-logloss:0.19940	dvalid-logloss:0.21873
[1000]	dtrain-logloss:0.19728	dvalid-logloss:0.21853
[1100]	dtrain-logloss:0.19531	dvalid-logloss:0.21835
[1200]	dtrain-logloss:0.19338	dvalid-logloss:0.21819
[1300]	dtrain-logloss:0.19148	dvalid-logloss:0.21809
[1400]	dtrain-logloss:0.18963	dvalid-logloss:0.21795
[1500]	dtrain-logloss:0.18784	dvalid-logloss:0.21783
[1600]	dtrain-logloss:0.18

In [11]:
del train
_ = gc.collect

In [12]:


# CALCULATE SIZE OF EACH SEPARATE TEST PART
def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
    chunk = len(customers)//NUM_PARTS
    if verbose != '':
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')
    rows = []

    for k in range(NUM_PARTS):
        if k==NUM_PARTS-1: cc = customers[k*chunk:]
        else: cc = customers[k*chunk:(k+1)*chunk]
        s = test.loc[test.customer_ID.isin(cc)].shape[0]
        rows.append(s)
    if verbose != '': print( rows )
    return rows,chunk

# COMPUTE SIZE OF 4 PARTS FOR TEST DATA
NUM_PARTS = 4
TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'

print(f'Reading test data...')
test = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
rows,num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')

Reading test data...
shape of data: (11363762, 2)
We will process test data as 4 separate parts.
There will be 231155 customers in each part (except the last part).
Below are number of rows in each part:
[2841209, 2839857, 2842105, 2840591]


In [13]:
# INFER TEST DATA IN PARTS
skip_rows = 0
skip_cust = 0
test_preds = []

for k in range(NUM_PARTS):
    
    # READ PART OF TEST DATA
    print(f'\nReading test data...')
    test = read_file(path = TEST_PATH)
    test = test.iloc[skip_rows:skip_rows+rows[k]]
    skip_rows += rows[k]
    print(f'=> Test part {k+1} has shape', test.shape )
    
    # PROCESS AND FEATURE ENGINEER PART OF TEST DATA
    test = process_and_feature_engineer(test)
    if k==NUM_PARTS-1: test = test.loc[customers[skip_cust:]]
    else: test = test.loc[customers[skip_cust:skip_cust+num_cust]]
    skip_cust += num_cust
    
    # TEST DATA FOR XGB
    X_test = test[FEATURES]
    dtest = xgb.DMatrix(data=X_test)
    test = test[['P_2_mean']] # reduce memory
    del X_test
    gc.collect()

    # INFER XGB MODELS ON TEST DATA
    model = xgb.Booster()
    model.load_model(f'XGB_v{VER}_fold0.xgb')
    preds = model.predict(dtest)
    for f in range(1,FOLDS):
        model.load_model(f'XGB_v{VER}_fold{f}.xgb')
        preds += model.predict(dtest)
    preds /= FOLDS
    test_preds.append(preds)

    # CLEAN MEMORY
    del dtest, model
    _ = gc.collect()


Reading test data...
shape of data: (11363762, 190)
=> Test part 1 has shape (2841209, 190)
Num_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_144_mean', 'D_144_std', 'D_144_min', 'D_144_max', 'D_144_last',
       'D_145_mean', 'D_145_std', 'D_145_min', 'D_145_max', 'D_145_last'],
      dtype='object', length=885)
Cat_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_144_mean', 'D_144_std', 'D_144_min', 'D_144_max', 'D_144_last',
       'D_145_mean', 'D_145_std', 'D_145_min', 'D_145_max', 'D_145_last'],
      dtype='object', length=885)
Shape after engineering: (231155, 918)

Reading test data...
shape of data: (11363762, 190)
=> Test part 2 has shape (2839857, 190)
Num_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_

In [14]:
# WRITE SUBMISSION FILE
test_preds = np.concatenate(test_preds)
test = cudf.DataFrame(index=customers,data={'prediction':test_preds})
sub = cudf.read_csv('../input/amex-default-prediction/sample_submission.csv')[['customer_ID']]
sub['customer_ID_hash'] = sub['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
sub = sub.set_index('customer_ID_hash')
sub = sub.merge(test[['prediction']], left_index=True, right_index=True, how='left')
sub = sub.reset_index(drop=True)

# DISPLAY PREDICTIONS
sub.to_csv(f'submission_xgb_v{VER}.csv',index=False)
print('Submission file shape is', sub.shape )
sub.head()

Submission file shape is (924621, 2)


Unnamed: 0,customer_ID,prediction
0,0359e97c244bbbbe2db7c21e891debe80e82291f2e470e...,0.002059
1,035b3479c9020483c00b7dac8f816759bb3aa6fdd8dfab...,0.000367
2,035a556cc13aae13de7bdcc71c81a1ab27f586f2ddf50e...,0.00286
3,035bca6744c2fe912b15a0bc6011f3ec679cbc7c60e049...,0.039637
4,0359f31145b54da7258ed5ff894cbe50dd4302d3d4a1e9...,0.036869
