In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

print('RAPIDS version', cudf.__version__)
# Input data files live in the read-only "../input/" directory.
# Walk that tree and print the full path of every file found, so the paths
# used by later cells can be checked against what is actually available.

import os
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8

def read_file(path = '', usecols = None):
    """Load a parquet file with cuDF and normalise its key columns.

    Parameters
    ----------
    path : str
        Location of the parquet file to read.
    usecols : list of str, optional
        Subset of columns to load. ``None`` (the default) loads every column.

    Returns
    -------
    DataFrame
        The loaded frame in which
        * ``customer_ID`` (when present) is replaced by the int64 value of the
          last 16 hexadecimal characters of the original string,
        * ``S_2`` (when present) is parsed to a datetime column,
        * every missing value is replaced by ``NAN_VALUE``.

    Notes
    -----
    Rows are returned in file order; no sorting is done here. Order-dependent
    aggregations such as ``agg('last')`` need an explicit sort by
    ``customer_ID`` and ``S_2`` first.
    """
    # LOAD DATAFRAME (columns=None means "read all columns")
    df = cudf.read_parquet(path, columns=usecols)
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # Guarded so that a `usecols` selection without these columns still loads.
    if 'customer_ID' in df.columns:
        df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    if 'S_2' in df.columns:
        df['S_2'] = cudf.to_datetime(df['S_2'])
    # FILL NAN
    df = df.fillna(NAN_VALUE)
    print('shape of data:', df.shape)

    return df

# Load the complete training parquet through read_file (all columns).
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
print('Reading train data...')
X_train = read_file(TRAIN_PATH)

In [None]:
# Training labels from the competition's train_labels.csv, read with cuDF.
LABELS_PATH = '../input/amex-default-prediction/train_labels.csv'
y_train = cudf.read_csv(LABELS_PATH)

In [None]:
# Quick look at the first five rows of the loaded training frame.
X_train.head(n=5)

In [None]:
# Hard-coded list of the categorical feature columns.
cat_cols = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
# Every feature column, i.e. everything except the customer key and the S_2 date.
all_cols = [name for name in X_train.columns if name not in ('customer_ID', 'S_2')]
# Whatever is a feature but not categorical is treated as numerical.
num_cols = [name for name in all_cols if name not in cat_cols]

In [None]:
# Split the column names into families by the prefix of the name.
# str.startswith() is used instead of a substring test so that only a leading
# "D_" / "S_" / ... counts; a substring test would also match a prefix that
# merely appears somewhere inside another column name.
# NOTE(review): spend_cols includes S_2, the column read_file parses as a date.
# Confirm whether it should be excluded here.
delinquency_cols = [col for col in X_train.columns if col.startswith('D_')]
spend_cols = [col for col in X_train.columns if col.startswith('S_')]
payment_cols = [col for col in X_train.columns if col.startswith('P_')]
balance_cols = [col for col in X_train.columns if col.startswith('B_')]
risk_cols = [col for col in X_train.columns if col.startswith('R_')]

In [None]:
# Categorical columns within the delinquency family (order follows delinquency_cols).
d_cat_matches = list(filter(lambda name: name in cat_cols, delinquency_cols))

# Categorical columns within the balance family (order follows balance_cols).
b_cat_matches = list(filter(lambda name: name in cat_cols, balance_cols))

In [None]:
# Per-customer value counts for the delinquency categorical columns.
#
# Result layout:
#     val_cnt[<customer id as str>][<column name>] = {value: number of rows}
#
# Fixes relative to the previous version of this cell:
#   * it referenced `j` / `d_col` before the loop that defines them and used
#     `Counter` without importing it, so it failed on a fresh kernel;
#   * it counted the output of agg('unique'), where every value appears exactly
#     once, so every "count" was 1. Group sizes give the real counts.
val_cnt = {}
for j, d_col in enumerate(d_cat_matches):
    # One group-by per column: number of rows for each (customer, value) pair.
    # The result is moved to host memory once, so the Python loop below does not
    # index into the GPU series row by row.
    pair_counts = X_train.groupby(['customer_ID', d_col]).size().to_pandas()
    for (cst_id, value), n_rows in pair_counts.items():
        per_customer = val_cnt.setdefault(str(int(cst_id)), {})
        per_customer.setdefault(str(d_col), {})[value] = int(n_rows)
    print(f'column {d_col} processed, {j+1} of {len(d_cat_matches)} columns processed.')
        

In [None]:
import pickle
# ...
# Serialise the current val_cnt dictionary to the working directory, using the
# newest pickle protocol this interpreter supports.
with open('/kaggle/working/d_cat_val_cnts.pickle', mode='wb') as pickle_file:
    pickle.dump(val_cnt, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Per-customer value counts for the balance categorical columns.
#
# Result layout:
#     val_cnt[<customer id as str>][<column name>] = {value: number of rows}
#
# Fixes relative to the previous version of this cell:
#   * it referenced `j`, `d_col` and `d_cat_matches` (names belonging to the
#     delinquency cell) and used `Counter` without importing it;
#   * it counted the output of agg('unique'), where every value appears exactly
#     once, so every "count" was 1. Group sizes give the real counts.
val_cnt = {}
for j, b_col in enumerate(b_cat_matches):
    # One group-by per column: number of rows for each (customer, value) pair.
    # The result is moved to host memory once, so the Python loop below does not
    # index into the GPU series row by row.
    pair_counts = X_train.groupby(['customer_ID', b_col]).size().to_pandas()
    for (cst_id, value), n_rows in pair_counts.items():
        per_customer = val_cnt.setdefault(str(int(cst_id)), {})
        per_customer.setdefault(str(b_col), {})[value] = int(n_rows)
    print(f'column {b_col} processed, {j+1} of {len(b_cat_matches)} columns processed.')
        

In [None]:
import pickle
# ...
# Serialise the current val_cnt dictionary to the working directory, using the
# newest pickle protocol this interpreter supports.
with open('/kaggle/working/b_cat_val_cnts.pickle', mode='wb') as pickle_file:
    pickle.dump(val_cnt, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# val_cnt holds one entry per customer, so echoing it whole floods the output.
# Show only the first five entries instead.
{customer: val_cnt[customer] for customer in list(val_cnt)[:5]}

In [None]:
# The inner dictionaries are keyed by column name, so the integer key 0 used
# before raised KeyError. Look up the first balance categorical column for one
# customer, taken from val_cnt itself rather than from a leftover loop variable.
sample_customer = next(iter(val_cnt))
val_cnt[sample_customer][b_cat_matches[0]]