Notebook to prepare features for the AMEX competition.

In [None]:
import numpy as np
import pandas as pd
import gc

# Debug switch: when True, the train and test frames loaded further down are
# truncated to their first 100000 rows before any feature engineering.
DEBUG = False

In [None]:
# Load the training labels CSV from the competition input directory.
# NOTE(review): train_labels is not referenced again in the visible cells.
train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')

In [None]:
# Read a 10-row sample of the raw training CSV. Its column list is used
# further down to build the aggregation dictionaries, so it receives the same
# derived S_2 columns as the full datasets.
test_read = pd.read_csv(
    '../input/amex-default-prediction/train_data.csv',
    nrows=10,
)

# Parse the S_2 dates.
test_read['S_2'] = pd.to_datetime(test_read['S_2'])
# Latest S_2 date of each customer, broadcast to every row of that customer.
test_read['S_2_max'] = (
    test_read['S_2'].groupby(test_read['customer_ID']).transform('max')
)
# Days between consecutive rows of the same customer (NaT -> NaN on the first row).
test_read['S_2_diff'] = (
    test_read['S_2'].groupby(test_read['customer_ID']).transform('diff').dt.days
)
# Replace the date by the number of days before the customer's latest date.
test_read['S_2'] = (test_read['S_2_max'] - test_read['S_2']).dt.days

In [None]:
# Columns that get no entry in the per-column aggregation dictionaries.
remove = ['customer_ID', 'S_2_max']

# Column name -> list of aggregations, for numeric and categorical/binary columns.
agg_dict_num = {}
agg_dict_cat = {}

# Number of rows per customer.
agg_dict_cat['customer_ID'] = ['count']

# Mean of the successive differences, ignoring NaN. The explicit __name__
# replaces the default '<lambda>' name of the callable.
mean_diff = lambda x: np.nanmean(np.diff(x.values))
mean_diff.__name__ = 'mean_diff'

for c in test_read.columns:
    if c in remove:
        continue
    if c in cat_features + bin_features:
        agg_dict_cat[c] = ['nunique', 'last']
    else:
        # 'median', 'skew' and mean_diff were left out: too slow with pandas.
        agg_dict_num[c] = ['mean', 'std', 'min', 'max', 'last', 'nunique', 'first']
            

In [None]:
def prepare_df_num(df):
    """Aggregate every column of ``df`` per ``customer_ID``.

    Steps:
      1. If ``S_2`` is present and still date-like (not numeric), replace it by
         the number of days between the latest ``S_2`` in ``df`` and each row.
         A numeric ``S_2`` is taken to be an already-computed day offset and is
         left untouched.
      2. Add the "after pay" difference columns ``<bcol>-<pcol>`` for every
         listed balance/delinquency/spend column that exists in ``df``.
      3. Group by ``customer_ID`` and compute mean/std/min/max/last of every
         remaining column.

    Args:
        df: frame with a ``customer_ID`` column and, when any of the listed
            ``bcol`` columns is present, the ``P_2`` and ``P_3`` columns.
            The caller's frame is not modified.

    Returns:
        DataFrame indexed by ``customer_ID`` whose columns are named
        ``<column>_<aggregation>``.
    """
    # Work on a copy: callers pass a filtered slice of a larger frame, and the
    # column assignments below must not leak back into it.
    df = df.copy()

    # pd.to_datetime reads integers as nanoseconds since the epoch, so running
    # it on day offsets would collapse every value to 0 days. Only convert
    # when S_2 still holds dates.
    if 'S_2' in df.columns and not pd.api.types.is_numeric_dtype(df['S_2']):
        dates = pd.to_datetime(df['S_2'])
        df['S_2'] = (dates.max() - dates).dt.days

    # compute "after pay" features
    after_pay_cols = (
        [f'B_{i}' for i in (11, 14, 17)]
        + ['D_39', 'D_131']
        + [f'S_{i}' for i in (16, 23)]
    )
    for bcol in after_pay_cols:
        if bcol not in df.columns:
            continue
        for pcol in ('P_2', 'P_3'):
            df[f'{bcol}-{pcol}'] = df[bcol] - df[pcol]

    # NOTE(review): every non-key column is aggregated, including any
    # non-numeric helper column (e.g. a datetime 'S_2_max') present in df —
    # confirm this is intended.
    df_agg = df.groupby('customer_ID').agg(['mean', 'std', 'min', 'max', 'last'])
    # Flatten the (column, aggregation) MultiIndex into 'column_aggregation'.
    df_agg.columns = [str(c[0]) + '_' + str(c[1]) for c in df_agg.columns]

    return df_agg

def prepare_df_cat(df):
    """Build per-customer features for the categorical and binary columns.

    The columns named in the module-level ``cat_features`` and
    ``bin_features`` lists are cast to ``str`` in place on ``df``. The result
    is the column-wise concatenation of:

      * the aggregations listed in the module-level ``agg_dict_cat``, with
        columns named ``<column>_<aggregation>``;
      * for each categorical/binary column, one count column per observed
        level, named ``<column>_<level>``, with 0 where a customer never
        shows that level.

    Returns a DataFrame indexed by ``customer_ID``.
    """
    cat_cols = cat_features + bin_features
    df.loc[:, cat_cols] = df.loc[:, cat_cols].astype(str)

    summary = df.groupby('customer_ID').agg(agg_dict_cat)
    summary.columns = ['_'.join((str(name), str(func))) for name, func in summary.columns]

    def _level_counts(col):
        # Rows per (customer, level), pivoted so each level becomes a column.
        counts = df.groupby(['customer_ID', col])[col].count().unstack()
        counts.columns = [counts.columns.name + '_' + level for level in counts.columns]
        return counts.fillna(0)

    count_tables = [_level_counts(col) for col in cat_cols]
    return pd.concat([summary, *count_tables], axis=1)

In [None]:

# For every training column, fit an optimal binning against the target and
# record the `iv` score of the resulting binning table.
# NOTE(review): tqdm, optbinning, train_cols, cat_cols and train_df are not
# defined in the visible cells — confirm they are set up elsewhere.
iv_score_dict = {}
for col in tqdm(train_cols):
    # Only the dtype passed to OptimalBinning depends on the column kind.
    optb = optbinning.OptimalBinning(
        dtype='categorical' if col in cat_cols else 'numerical'
    )
    optb.fit(train_df[col], train_df['target'])
    binning_table = optb.binning_table
    binning_table.build()
    iv_score_dict[col] = binning_table.iv

In [None]:
# Bare expression: displays train_data as the cell output.
# NOTE(review): train_data is only assigned in a later cell of this file, so
# this cell depends on the notebook's execution order.
train_data

In [None]:
%%time

# Load the training data from the integer-dtype parquet dataset.
train_data = pd.read_parquet(
    '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
)

if DEBUG:
    # around 2% of data
    train_data = train_data.iloc[:100000]

# Parse the S_2 dates.
train_data['S_2'] = pd.to_datetime(train_data['S_2'])
# Latest S_2 date of each customer, broadcast to every row of that customer.
train_data['S_2_max'] = (
    train_data['S_2'].groupby(train_data['customer_ID']).transform('max')
)
# Days between consecutive rows of the same customer (NaT -> NaN on the first row).
train_data['S_2_diff'] = (
    train_data['S_2'].groupby(train_data['customer_ID']).transform('diff').dt.days
)
# Replace the date by the number of days before the customer's latest date.
train_data['S_2'] = (train_data['S_2_max'] - train_data['S_2']).dt.days

In [None]:
%%time

# https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
def split(a, n):
    """Lazily yield ``n`` contiguous slices of ``a`` of near-equal length.

    The first ``len(a) % n`` slices hold one element more than the others.
    When ``n`` exceeds ``len(a)`` the trailing slices are empty.
    """
    base, extra = divmod(len(a), n)
    # Slice boundaries: the i-th part spans bounds[i]:bounds[i + 1].
    bounds = [i * base + min(i, extra) for i in range(n + 1)]
    return (a[lo:hi] for lo, hi in zip(bounds, bounds[1:]))

# Partition the unique customer IDs into 10 contiguous chunks so the
# aggregation runs on one subset of rows at a time.
split_ids = split(train_data.customer_ID.unique(),10)

# Per-chunk aggregated frames; concatenated in a later cell.
df_list_train = []

for (i,ids) in enumerate(split_ids):
    print(i)  # progress: index of the chunk being processed
    # All rows that belong to the customers of this chunk.
    train_data_ids = train_data[train_data.customer_ID.isin(ids)]
    # Aggregate per customer, then downcast the result to float16.
    train_data_num = prepare_df_num(train_data_ids).astype('float16')
    train_data_cat = prepare_df_cat(train_data_ids).astype('float16')
    # Both frames come from a groupby on customer_ID, so join them column-wise.
    df_list_train.append(pd.concat([train_data_num,train_data_cat],axis=1))
    gc.collect()

In [None]:
# Stack the per-chunk frames row-wise and write the result to a pickle file.
# NOTE(review): concat aligns columns by union, so a per-level count column
# that is missing from a whole chunk is filled with NaN (not 0) for that
# chunk's customers — confirm downstream code handles this.
pd.concat(df_list_train,axis=0).astype('float16').to_pickle('train_data_agg.pkl')

# Drop the references to the training frames and run a garbage collection
# before the test data is loaded.
del train_data, train_data_num, train_data_cat, train_data_ids, df_list_train
gc.collect()

In [None]:
%%time

# Load the test data from the integer-dtype parquet dataset.
test_data = pd.read_parquet(
    '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
)

# Debug runs keep only the first 100000 rows.
if DEBUG:
    test_data = test_data.iloc[:100000]

# Parse the S_2 dates.
test_data['S_2'] = pd.to_datetime(test_data['S_2'])
# Latest S_2 date of each customer, broadcast to every row of that customer.
test_data['S_2_max'] = (
    test_data['S_2'].groupby(test_data['customer_ID']).transform('max')
)
# Days between consecutive rows of the same customer (NaT -> NaN on the first row).
test_data['S_2_diff'] = (
    test_data['S_2'].groupby(test_data['customer_ID']).transform('diff').dt.days
)
# Replace the date by the number of days before the customer's latest date.
test_data['S_2'] = (test_data['S_2_max'] - test_data['S_2']).dt.days

In [None]:
%%time

# Partition the unique test customer IDs into 10 contiguous chunks so the
# aggregation runs on one subset of rows at a time.
split_ids = split(test_data.customer_ID.unique(),10)

# Per-chunk aggregated frames; concatenated after the loop.
df_list_test = []

for (i,ids) in enumerate(split_ids):
    print(i)  # progress: index of the chunk being processed
    # All rows that belong to the customers of this chunk.
    test_data_ids = test_data[test_data.customer_ID.isin(ids)]
    # Aggregate per customer, then downcast the result to float16.
    test_data_num = prepare_df_num(test_data_ids).astype('float16')
    test_data_cat = prepare_df_cat(test_data_ids).astype('float16')
    # Both frames come from a groupby on customer_ID, so join them column-wise.
    df_list_test.append(pd.concat([test_data_num,test_data_cat],axis=1))
    gc.collect()
    
# Stack the per-chunk frames row-wise and write the result to a pickle file.
# NOTE(review): concat aligns columns by union, so a per-level count column
# that is missing from a whole chunk is filled with NaN (not 0) for that
# chunk's customers — confirm downstream code handles this.
pd.concat(df_list_test,axis=0).astype('float16').to_pickle('test_data_agg.pkl')

# Drop the references to the test frames and run a garbage collection.
del test_data, test_data_num, test_data_cat, test_data_ids, df_list_test
gc.collect()