# Customer Loyalty Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
import time
from datetime import datetime 
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the encoded transactions and order them by purchase_date (ascending).
# Row order matters later: the `changes` aggregation compares consecutive rows.
df = pd.read_csv('./Data/encoded_df.csv').sort_values(by='purchase_date')

# Aggregating Boolean Columns

In [5]:
# Shorten column names and isolate boolean features

# Maps the flag column names found in df to the short names used for every
# derived feature below.
# NOTE(review): 'category_3_C' is shortened to 'c3_A', not 'c3_C'. This looks
# like a typo, but the short name is baked into all downstream feature names,
# so it is left unchanged here -- confirm before renaming.
renaming = {
    'new_merchant_flag': 'new_flag',
    'authorized_flag': 'auth_flag',
    'category_1': 'c1',
    'category_2_2.0': 'c2_2',
    'category_2_3.0': 'c2_3',
    'category_2_4.0': 'c2_4',
    'category_2_5.0': 'c2_5',
    'category_2_NA': 'c2_NA',
    'category_3_B': 'c3_B',
    'category_3_C': 'c3_A',
    'category_3_NA': 'c3_NA',
}

bool_cols = list(renaming.values())

# Select and rename in one expression so bool_df is a fresh frame; an in-place
# rename on a selection of df is a chained assignment that pandas warns about
# (the warning was hidden by warnings.filterwarnings("ignore")).
bool_df = df[['card_id'] + list(renaming.keys())].rename(columns=renaming)
bool_df.head()

Unnamed: 0,card_id,new_flag,auth_flag,c1,c2_2,c2_3,c2_4,c2_5,c2_NA,c3_B,c3_A,c3_NA
7258152,C_ID_da2090f28e,False,True,False,False,False,False,False,False,False,False,False
18429662,C_ID_efced389a0,False,True,False,False,True,False,False,False,True,False,False
14876034,C_ID_83561fe74a,False,True,False,False,False,False,False,False,True,False,False
28523797,C_ID_479fd6392a,False,True,True,False,False,False,False,True,True,False,False
19914410,C_ID_1cf6056088,False,True,False,False,False,False,False,False,False,False,False


In [43]:
# Defining aggregation functions

def sum0(series):
    """Return ``len(series) - series.sum()``.

    For a boolean series this is the number of False entries (the complement
    of the 'sum' aggregation, which counts the True entries).
    """
    return len(series) - series.sum()

def quantile25(series):
    """Return the 0.25 quantile (first quartile) of ``series``."""
    lower_quartile = series.quantile(q=0.25)
    return lower_quartile

def quantile75(series):
    """Return the 0.75 quantile (third quartile) of ``series``."""
    upper_quartile = series.quantile(q=0.75)
    return upper_quartile

def mode(series):
    """Return the most frequent value of ``series``.

    When several values tie, the first entry of ``series.mode()`` is returned
    (pandas returns the modes sorted, so this is the smallest one).

    Returns ``np.nan`` when there is no mode at all. That covers an empty
    series and also a series containing only NaN: ``Series.mode()`` ignores
    NaN, so it is empty there even though the series itself is not, and
    indexing it with ``iloc[0]`` would raise IndexError.
    """
    modes = series.mode()
    return modes.iloc[0] if not modes.empty else np.nan

def changes(series):
    """Count the positions where a value differs from its predecessor.

    The first element is compared against the NaN introduced by ``shift()``
    and therefore always counts, so a constant non-empty series yields 1.
    The result depends on the row order of ``series``.
    """
    previous = series.shift()
    return series.ne(previous).sum()

In [89]:
# Per-card summary statistics of every boolean flag

bool_features = bool_df.groupby('card_id')[bool_cols].agg(
    ['mean', 'median', 'sum', sum0, mode, 'std', 'skew', changes]
)

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names
bool_features.columns = [f"{col}_{agg}" for col, agg in bool_features.columns]
bool_features = bool_features.reset_index()

# Downcast 64-bit floats/ints to 32 bits; every other dtype is kept as is
bool_features = bool_features.astype({
    name: ('float32' if dtype == 'float64' else 'int32' if dtype == 'int64' else dtype)
    for name, dtype in bool_features.dtypes.items()
})

bool_features.head()

Unnamed: 0_level_0,new_flag_mean,new_flag_median,new_flag_sum,new_flag_sum0,new_flag_mode,new_flag_std,new_flag_skew,new_flag_changes,auth_flag_mean,auth_flag_median,...,c3_A_skew,c3_A_changes,c3_NA_mean,c3_NA_median,c3_NA_sum,c3_NA_sum0,c3_NA_mode,c3_NA_std,c3_NA_skew,c3_NA_changes
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_00007093c1,0.013245,0.0,2,149,False,0.114703,8.60116,2,0.768212,1.0,...,1.884419,34,0.0,0.0,0,151,False,0.0,0.0,1
C_ID_0001238066,0.168919,0.0,25,123,False,0.375953,1.785419,2,0.97973,1.0,...,1.209177,51,0.027027,0.0,4,144,False,0.162713,5.893231,9
C_ID_0001506ef0,0.014925,0.0,1,66,False,0.122169,8.185352,2,0.940298,1.0,...,0.0,1,0.0,0.0,0,67,False,0.0,0.0,1
C_ID_0001793786,0.125506,0.0,31,216,False,0.331965,2.274647,2,0.890688,1.0,...,0.0,1,0.0,0.0,0,247,False,0.0,0.0,1
C_ID_000183fdda,0.070968,0.0,11,144,False,0.257603,3.374495,2,0.954839,1.0,...,1.078235,56,0.032258,0.0,5,150,False,0.177257,5.346532,8


In [113]:
# Fix one boolean at a time to either 0 or 1 and run statistics on the other booleans

def bool_agg(bool_df, bool_cols):
    """Compute statistics of the boolean flags conditioned on each single flag.

    For every flag ``group`` in ``bool_cols`` the rows of ``bool_df`` are
    grouped by ``(card_id, group)`` and mean / median / sum / sum0 are computed
    for the remaining flags. The rows where ``group`` is True and where it is
    False are then placed side by side per card, with column names
    ``<flag>_<stat>_<group>_1`` and ``<flag>_<stat>_<group>_0``.

    Parameters
    ----------
    bool_df : DataFrame with a ``card_id`` column and the boolean columns
        listed in ``bool_cols``.
    bool_cols : list of str, the flag column names.

    Returns
    -------
    DataFrame with one row per card_id, or None when ``bool_cols`` is empty.
    Cards that never take one of the two values of a flag get NaN in the
    corresponding columns (outer merge of the two halves).

    Progress (step, shape, elapsed seconds) is printed after every flag.
    """
    merged_features = None
    steps = len(bool_cols)
    start_time = time.time()

    for i, group in enumerate(bool_cols):

        # Leave out every flag sharing the conditioning flag's leading name
        # token (e.g. all 'c2_*' when conditioning on 'c2_3'); presumably these
        # are one-hot siblings of the same category. Compare the prefixes
        # themselves rather than testing for a substring, so an unrelated
        # column that merely contains the prefix text is not excluded.
        prefix = group.split('_')[0]
        filtered_cols = [value for value in bool_cols if value.split('_')[0] != prefix]

        features = bool_df.groupby(['card_id', group])[filtered_cols].agg([
            'mean',
            'median',
            'sum',
            sum0
        ]).reset_index()

        features.columns = ['card_id', group] + [f"{col}_{agg}_{group}" for col, agg in features.columns[2:]]

        # Downcast 64-bit floats/ints to 32 bits to limit memory
        features = features.astype({
            col: 'float32' if features[col].dtype == 'float64'
            else 'int32' if features[col].dtype == 'int64'
            else features[col].dtype
            for col in features.columns
        })

        # Split into the "flag is True" and "flag is False" halves. drop()
        # returns new frames, so nothing is mutated in place on a filtered
        # selection (which pandas flags as chained assignment).
        is_set = features[group].to_numpy(dtype=bool)
        features1 = features[is_set].drop(columns=group)
        features0 = features[~is_set].drop(columns=group)
        features1.columns = ['card_id'] + [f"{col}_1" for col in features1.columns[1:]]
        features0.columns = ['card_id'] + [f"{col}_0" for col in features0.columns[1:]]

        features = pd.merge(features1, features0, on='card_id', how='outer')

        if merged_features is None:
            merged_features = features
        else:
            merged_features = pd.merge(merged_features, features, on='card_id')

        elapsed_time = time.time() - start_time
        print(f"[{i+1}/{steps}]\tshape:", merged_features.shape, "\ttime:", elapsed_time)

    return merged_features

In [85]:
# Build the flag-conditioned boolean statistics (one bool_agg pass per flag in
# bool_cols) and preview the result. bool_agg prints its running shape and
# elapsed time after each flag.

merged = bool_agg(bool_df, bool_cols)
merged.head()

[1/11]	shape: (325540, 81) 	time: 165.7909939289093
[2/11]	shape: (325540, 161) 	time: 330.08673906326294
[3/11]	shape: (325540, 241) 	time: 463.56731629371643
[4/11]	shape: (325540, 289) 	time: 531.3272840976715
[5/11]	shape: (325540, 337) 	time: 606.7221009731293
[6/11]	shape: (325540, 385) 	time: 676.7104630470276
[7/11]	shape: (325540, 433) 	time: 754.7787525653839
[8/11]	shape: (325540, 481) 	time: 838.8245973587036
[9/11]	shape: (325540, 545) 	time: 960.0623030662537
[10/11]	shape: (325540, 609) 	time: 1074.687548160553
[11/11]	shape: (325540, 673) 	time: 1175.1983017921448


Unnamed: 0,card_id,auth_flag_mean_new_flag_1,auth_flag_median_new_flag_1,auth_flag_sum_new_flag_1,auth_flag_sum0_new_flag_1,c1_mean_new_flag_1,c1_median_new_flag_1,c1_sum_new_flag_1,c1_sum0_new_flag_1,c2_2_mean_new_flag_1,...,c2_4_sum_c3_NA_0,c2_4_sum0_c3_NA_0,c2_5_mean_c3_NA_0,c2_5_median_c3_NA_0,c2_5_sum_c3_NA_0,c2_5_sum0_c3_NA_0,c2_NA_mean_c3_NA_0,c2_NA_median_c3_NA_0,c2_NA_sum_c3_NA_0,c2_NA_sum0_c3_NA_0
0,C_ID_00007093c1,1.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0,151,0.006623,0.0,1,150,0.18543,0.0,28,123
1,C_ID_0001238066,1.0,1.0,25.0,0.0,0.08,0.0,2.0,23.0,0.0,...,0,144,0.152778,0.0,22,122,0.0625,0.0,9,135
2,C_ID_0001506ef0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,67,0.0,0.0,0,67,0.0,0.0,0,67
3,C_ID_0001793786,1.0,1.0,31.0,0.0,0.0,0.0,0.0,31.0,0.258065,...,0,247,0.004049,0.0,1,246,0.469636,0.0,116,131
4,C_ID_000183fdda,1.0,1.0,11.0,0.0,0.0,0.0,0.0,11.0,0.0,...,0,150,0.006667,0.0,1,149,0.026667,0.0,4,146


In [98]:
# Join the per-card statistics with the flag-conditioned ones, then shrink dtypes.
# NOTE: bool_features is rebound from itself here, so run this cell once per
# kernel session; running it again would merge `merged` in a second time.

bool_features = pd.merge(bool_features, merged, on='card_id', how='outer')

for col in bool_features.columns:

    column = bool_features[col]

    if set(column.unique()) == {0.0, 1.0}:
        # Exactly the values 0 and 1 occur (a NaN would break the equality)
        bool_features[col] = column.astype(bool)
    elif column.dtype == 'int64':
        bool_features[col] = column.astype('int32')
    elif column.dtype == 'float64':
        bool_features[col] = column.astype('float32')

bool_features.shape

(325540, 761)

In [101]:
# Report dtypes and memory usage of the boolean feature table;
# memory_usage='deep' also measures the contents of object columns.

bool_features.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325540 entries, 0 to 325539
Columns: 761 entries, card_id to c2_NA_sum0_c3_NA_0
dtypes: bool(11), float32(660), int32(89), object(1)
memory usage: 955.9 MB


In [103]:
# Write the boolean feature table to CSV (index=False: the row index is not written)

bool_features.to_csv('./Data/bool_features.csv', index=False)

# Aggregating Numerical Columns

In [105]:
# Numerical columns to summarise; the boolean flags are kept alongside them
# because they are used as conditioning groups later on

num_cols = ['installments', 'month_lag', 'purchase_amount', 'year', 'month', 'day', 'hour']

# Select and rename in a single expression (no in-place rename on a selection of df)
num_df = df[['card_id'] + num_cols + list(renaming.keys())].rename(columns=renaming)
num_df.head()

Unnamed: 0,card_id,installments,month_lag,purchase_amount,year,month,day,hour,new_flag,auth_flag,c1,c2_2,c2_3,c2_4,c2_5,c2_NA,c3_B,c3_A,c3_NA
7258152,C_ID_da2090f28e,0,-11,-0.686802,2017,1,1,0,False,True,False,False,False,False,False,False,False,False,False
18429662,C_ID_efced389a0,1,-12,-0.56659,2017,1,1,0,False,True,False,False,True,False,False,False,True,False,False
14876034,C_ID_83561fe74a,1,-13,-0.559227,2017,1,1,0,False,True,False,False,False,False,False,False,True,False,False
28523797,C_ID_479fd6392a,1,-1,-0.737892,2017,1,1,0,False,True,True,False,False,False,False,True,True,False,False
19914410,C_ID_1cf6056088,0,-4,0.004418,2017,1,1,0,False,True,False,False,False,False,False,False,False,False,False


In [111]:
# More aggregation functions

def range(series):
    """Return the spread of ``series``: its maximum minus its minimum.

    NOTE: this deliberately keeps the name ``range`` (it becomes the
    ``*_range`` suffix of the aggregated columns), which shadows the builtin
    ``range`` for every cell that runs after this one.
    """
    lowest, highest = series.min(), series.max()
    return highest - lowest

In [112]:
# Per-card summary statistics of every numerical column

num_features = num_df.groupby('card_id')[num_cols].agg(
    ['mean', 'median', 'sum', mode, 'std', 'skew', 'min', 'max', range, 'var']
)

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names
num_features.columns = [f"{col}_{agg}" for col, agg in num_features.columns]
num_features = num_features.reset_index()

# Downcast 64-bit integers to 32 bits; every other dtype is kept as is
num_features = num_features.astype({
    name: ('int32' if dtype == 'int64' else dtype)
    for name, dtype in num_features.dtypes.items()
})

num_features.head()

Unnamed: 0_level_0,installments_mean,installments_median,installments_sum,installments_mode,installments_std,installments_skew,installments_min,installments_max,installments_range,installments_var,...,hour_mean,hour_median,hour_sum,hour_mode,hour_std,hour_skew,hour_min,hour_max,hour_range,hour_var
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_00007093c1,1.284768,1.0,194,1,0.760504,3.339258,1,6,5,0.578366,...,14.403974,15.0,2175,16,4.209796,-0.513693,0,22,22,17.722384
C_ID_0001238066,1.614865,1.0,239,1,1.601545,3.036936,-1,10,11,2.564948,...,14.844595,16.0,2197,19,5.873627,-0.939797,0,23,23,34.499494
C_ID_0001506ef0,0.014925,0.0,1,0,0.122169,8.185353,0,1,1,0.014925,...,12.552239,12.0,841,12,3.163636,-0.609968,0,21,21,10.008593
C_ID_0001793786,0.020243,0.0,5,0,0.141116,6.854971,0,1,1,0.019914,...,15.080972,15.0,3725,13,5.103156,-1.059279,0,23,23,26.042197
C_ID_000183fdda,1.806452,1.0,280,1,2.0705,2.681747,-1,10,11,4.286971,...,16.393548,18.0,2541,22,5.408623,-1.125058,0,23,23,29.253205


In [133]:
# Fix one boolean at a time to either 0 or 1 and run statistics on all the numerical features

def num_agg(num_df, num_cols, bool_cols):
    """Compute statistics of the numerical columns conditioned on each flag.

    For every flag in ``bool_cols`` the rows of ``num_df`` are grouped by
    ``(card_id, flag)`` and ten statistics are computed for each column in
    ``num_cols``. The flag-True and flag-False results are placed side by side
    per card as ``<col>_<stat>_<flag>_1`` / ``<col>_<stat>_<flag>_0``.

    Returns a DataFrame with one row per card_id (None if ``bool_cols`` is
    empty). Cards that never take one of the two flag values get NaN in the
    corresponding columns. Progress is printed after every flag.
    """
    stats = ['mean', 'median', 'sum', mode, 'std', 'skew', 'min', 'max', range, 'var']

    merged_features = None
    steps = len(bool_cols)
    start_time = time.time()

    for step, group in enumerate(bool_cols, start=1):

        per_flag = num_df.groupby(['card_id', group])[num_cols].agg(stats).reset_index()
        per_flag.columns = ['card_id', group] + [
            f"{col}_{agg}_{group}" for col, agg in per_flag.columns[2:]
        ]

        # Downcast 64-bit integers to 32 bits
        per_flag = per_flag.astype({
            name: ('int32' if dtype == 'int64' else dtype)
            for name, dtype in per_flag.dtypes.items()
        })

        # Separate the rows by the value of the conditioning flag
        flag_on = per_flag[group].to_numpy()
        when_true = per_flag[flag_on].drop(columns=group)
        when_false = per_flag[~flag_on].drop(columns=group)
        when_true.columns = ['card_id'] + [f"{col}_1" for col in when_true.columns[1:]]
        when_false.columns = ['card_id'] + [f"{col}_0" for col in when_false.columns[1:]]

        both = pd.merge(when_true, when_false, on='card_id', how='outer')

        if merged_features is None:
            merged_features = both
        else:
            merged_features = pd.merge(merged_features, both, on='card_id')

        elapsed_time = time.time() - start_time
        print(f"[{step}/{steps}]\tshape:", merged_features.shape, "\ttime:", elapsed_time)

    return merged_features

In [134]:
# Build the flag-conditioned numerical statistics (one num_agg pass per flag in
# bool_cols) and preview the result. num_agg prints its running shape and
# elapsed time after each flag.

num_merged = num_agg(num_df, num_cols, bool_cols)
num_merged.head()

[1/11]	shape: (325540, 141) 	time: 450.485467672348
[2/11]	shape: (325540, 281) 	time: 899.884345293045
[3/11]	shape: (325540, 421) 	time: 1253.3202724456787
[4/11]	shape: (325540, 561) 	time: 1533.6162810325623
[5/11]	shape: (325540, 701) 	time: 1840.314953327179
[6/11]	shape: (325540, 841) 	time: 2125.7898766994476
[7/11]	shape: (325540, 981) 	time: 2451.9139795303345
[8/11]	shape: (325540, 1121) 	time: 2827.1897642612457
[9/11]	shape: (325540, 1261) 	time: 3243.872863292694
[10/11]	shape: (325540, 1401) 	time: 3612.9010438919067
[11/11]	shape: (325540, 1541) 	time: 3926.4364240169525


Unnamed: 0,card_id,installments_mean_new_flag_1,installments_median_new_flag_1,installments_sum_new_flag_1,installments_mode_new_flag_1,installments_std_new_flag_1,installments_skew_new_flag_1,installments_min_new_flag_1,installments_max_new_flag_1,installments_range_new_flag_1,...,hour_mean_c3_NA_0,hour_median_c3_NA_0,hour_sum_c3_NA_0,hour_mode_c3_NA_0,hour_std_c3_NA_0,hour_skew_c3_NA_0,hour_min_c3_NA_0,hour_max_c3_NA_0,hour_range_c3_NA_0,hour_var_c3_NA_0
0,C_ID_00007093c1,1.0,1.0,2.0,1.0,0.0,,1.0,1.0,0.0,...,14.403974,15.0,2175,16,4.209796,-0.513693,0,22,22,17.722384
1,C_ID_0001238066,1.64,1.0,41.0,1.0,2.118962,3.087259,-1.0,10.0,11.0,...,14.881944,16.0,2143,19,5.93378,-0.954651,0,23,23,35.209742
2,C_ID_0001506ef0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,...,12.552239,12.0,841,12,3.163636,-0.609968,0,21,21,10.008593
3,C_ID_0001793786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.080972,15.0,3725,13,5.103156,-1.059279,0,23,23,26.042197
4,C_ID_000183fdda,1.454545,1.0,16.0,1.0,1.29334,0.291348,-1.0,4.0,5.0,...,16.52,18.0,2478,22,5.438762,-1.19294,0,23,23,29.580134


In [135]:
# Join the per-card statistics with the flag-conditioned ones, then shrink dtypes.
# NOTE: num_features is rebound from itself here, so run this cell once per
# kernel session; running it again would merge `num_merged` in a second time.

num_features = pd.merge(num_features, num_merged, on='card_id', how='outer')

for col in num_features.columns:

    column = num_features[col]

    if set(column.unique()) == {0.0, 1.0}:
        # Exactly the values 0 and 1 occur (a NaN would break the equality)
        num_features[col] = column.astype(bool)
    elif column.dtype == 'int64':
        num_features[col] = column.astype('int32')
    elif column.dtype == 'float64' and "purchase" not in col:
        # Columns with "purchase" in their name stay at float64
        num_features[col] = column.astype('float32')

num_features.shape

(325540, 1611)

In [136]:
# Report dtypes and memory usage of the numerical feature table;
# memory_usage='deep' also measures the contents of object columns.

num_features.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325540 entries, 0 to 325539
Columns: 1611 entries, card_id to hour_var_c3_NA_0
dtypes: bool(4), float32(1260), float64(230), int32(116), object(1)
memory usage: 2.2 GB


In [137]:
# Write the numerical feature table to CSV (index=False: the row index is not written)

num_features.to_csv('./Data/num_features.csv', index=False)

# Aggregating by ID Columns

In [141]:
# Identifier columns to summarise; the boolean flags are kept alongside them
# because they are used as conditioning groups later on

id_cols = ['city_id', 'merchant_category_id', 'merchant_id', 'state_id', 'subsector_id']

# Select and rename in a single expression (no in-place rename on a selection of df)
id_df = df[['card_id'] + id_cols + list(renaming.keys())].rename(columns=renaming)
id_df.head()

Unnamed: 0,card_id,city_id,merchant_category_id,merchant_id,state_id,subsector_id,new_flag,auth_flag,c1,c2_2,c2_3,c2_4,c2_5,c2_NA,c3_B,c3_A,c3_NA
7258152,C_ID_da2090f28e,69,623,M_ID_f001319a61,9,4,False,True,False,False,False,False,False,False,False,False,False
18429662,C_ID_efced389a0,76,842,M_ID_18038b5ae7,2,37,False,True,False,False,True,False,False,False,True,False,False
14876034,C_ID_83561fe74a,233,661,M_ID_52d3026407,9,8,False,True,False,False,False,False,False,False,True,False,False
28523797,C_ID_479fd6392a,-1,839,M_ID_e5374dabc0,-1,29,False,True,True,False,False,False,False,True,True,False,False
19914410,C_ID_1cf6056088,69,278,M_ID_2cf6dc1f6f,9,37,False,True,False,False,False,False,False,False,False,False,False


In [142]:
# More aggregation functions

def modelen(series):
    """Return how many distinct values tie for most frequent in ``series``."""
    tied_modes = series.mode()
    return tied_modes.shape[0]

def unique(series):
    """Return the number of distinct values in ``series`` (as given by ``Series.unique()``)."""
    distinct = series.unique()
    return len(distinct)

In [148]:
# Per-card summary of every identifier column: most frequent value,
# number of tied modes, and number of distinct values

id_features = id_df.groupby('card_id')[id_cols].agg([mode, modelen, unique])

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names
id_features.columns = [f"{col}_{agg}" for col, agg in id_features.columns]
id_features = id_features.reset_index()

# Downcast 64-bit integers to 32 bits; every other dtype is kept as is
id_features = id_features.astype({
    name: ('int32' if dtype == 'int64' else dtype)
    for name, dtype in id_features.dtypes.items()
})

id_features.head()

Unnamed: 0,card_id,city_id_mode,city_id_modelen,city_id_unique,merchant_category_id_mode,merchant_category_id_modelen,merchant_category_id_unique,merchant_id_mode,merchant_id_modelen,merchant_id_unique,state_id_mode,state_id_modelen,state_id_unique,subsector_id_mode,subsector_id_modelen,subsector_id_unique
0,C_ID_00007093c1,244,1,5,307,1,19,M_ID_9400cf2342,1,31,2,1,4,19,1,13
1,C_ID_0001238066,314,1,19,307,1,34,M_ID_d17aabd756,1,90,9,1,6,19,1,19
2,C_ID_0001506ef0,137,1,3,705,1,19,M_ID_b1fc88154d,1,29,19,1,2,33,1,12
3,C_ID_0001793786,179,1,11,278,1,57,M_ID_923d57de8d,1,150,-1,1,5,37,1,25
4,C_ID_000183fdda,161,1,10,367,1,38,M_ID_f9cfe0a43b,1,84,3,1,7,16,1,21


In [149]:
# Fix one boolean at a time to either 0 or 1 and run statistics on all the id features

def id_agg(id_df, id_cols, bool_cols):
    """Compute mode / modelen / unique of the id columns conditioned on each flag.

    For every flag in ``bool_cols`` the rows of ``id_df`` are grouped by
    ``(card_id, flag)`` and the three statistics are computed for each column
    in ``id_cols``. The flag-True and flag-False results are placed side by
    side per card as ``<col>_<stat>_<flag>_1`` / ``<col>_<stat>_<flag>_0``.

    Returns a DataFrame with one row per card_id (None if ``bool_cols`` is
    empty). Cards that never take one of the two flag values get NaN in the
    corresponding columns. Progress is printed after every flag.
    """
    merged_features = None
    steps = len(bool_cols)
    start_time = time.time()

    for step, group in enumerate(bool_cols, start=1):

        grouped = id_df.groupby(['card_id', group])[id_cols]
        per_flag = grouped.agg([mode, modelen, unique]).reset_index()
        per_flag.columns = ['card_id', group] + [
            f"{col}_{agg}_{group}" for col, agg in per_flag.columns[2:]
        ]

        # Downcast 64-bit integers to 32 bits
        per_flag = per_flag.astype({
            name: ('int32' if dtype == 'int64' else dtype)
            for name, dtype in per_flag.dtypes.items()
        })

        # Separate the rows by the value of the conditioning flag
        flag_on = per_flag[group].to_numpy()
        halves = []
        for mask, suffix in ((flag_on, '1'), (~flag_on, '0')):
            half = per_flag[mask].drop(columns=group)
            half.columns = ['card_id'] + [f"{col}_{suffix}" for col in half.columns[1:]]
            halves.append(half)

        both = pd.merge(halves[0], halves[1], on='card_id', how='outer')

        if merged_features is None:
            merged_features = both
        else:
            merged_features = pd.merge(merged_features, both, on='card_id')

        elapsed_time = time.time() - start_time
        print(f"[{step}/{steps}]\tshape:", merged_features.shape, "\ttime:", elapsed_time)

    return merged_features

In [150]:
# Build the flag-conditioned id statistics (one id_agg pass per flag in
# bool_cols) and preview the result. id_agg prints its running shape and
# elapsed time after each flag.

id_merged = id_agg(id_df, id_cols, bool_cols)
id_merged.head()

[1/11]	shape: (325540, 31) 	time: 458.8687789440155
[2/11]	shape: (325540, 61) 	time: 900.7084941864014
[3/11]	shape: (325540, 91) 	time: 1257.4173288345337
[4/11]	shape: (325540, 121) 	time: 1535.8870153427124
[5/11]	shape: (325540, 151) 	time: 1850.7451286315918
[6/11]	shape: (325540, 181) 	time: 2138.4059381484985
[7/11]	shape: (325540, 211) 	time: 2463.44172334671
[8/11]	shape: (325540, 241) 	time: 2834.83682346344
[9/11]	shape: (325540, 271) 	time: 3246.853098154068
[10/11]	shape: (325540, 301) 	time: 3613.6353969573975
[11/11]	shape: (325540, 331) 	time: 3926.5266625881195


Unnamed: 0,card_id,city_id_mode_new_flag_1,city_id_modelen_new_flag_1,city_id_unique_new_flag_1,merchant_category_id_mode_new_flag_1,merchant_category_id_modelen_new_flag_1,merchant_category_id_unique_new_flag_1,merchant_id_mode_new_flag_1,merchant_id_modelen_new_flag_1,merchant_id_unique_new_flag_1,...,merchant_category_id_unique_c3_NA_0,merchant_id_mode_c3_NA_0,merchant_id_modelen_c3_NA_0,merchant_id_unique_c3_NA_0,state_id_mode_c3_NA_0,state_id_modelen_c3_NA_0,state_id_unique_c3_NA_0,subsector_id_mode_c3_NA_0,subsector_id_modelen_c3_NA_0,subsector_id_unique_c3_NA_0
0,C_ID_00007093c1,69.0,2.0,2.0,222.0,2.0,2.0,M_ID_00a6ca8a8a,2.0,2.0,...,19,M_ID_9400cf2342,1,31,2,1,4,19,1,13
1,C_ID_0001238066,314.0,1.0,8.0,278.0,1.0,14.0,M_ID_00a6ca8a8a,25.0,25.0,...,32,M_ID_d17aabd756,1,86,9,1,5,19,1,18
2,C_ID_0001506ef0,137.0,1.0,1.0,705.0,1.0,1.0,M_ID_ab756f937e,1.0,1.0,...,19,M_ID_b1fc88154d,1,29,19,1,2,33,1,12
3,C_ID_0001793786,69.0,1.0,7.0,278.0,1.0,21.0,M_ID_0360f86430,31.0,31.0,...,57,M_ID_923d57de8d,1,150,-1,1,5,37,1,25
4,C_ID_000183fdda,161.0,1.0,2.0,367.0,1.0,9.0,M_ID_113378fe3b,11.0,11.0,...,37,M_ID_f9cfe0a43b,1,81,3,1,6,16,1,21


In [151]:
# Join the per-card statistics with the flag-conditioned ones, then shrink dtypes.
# NOTE: id_features is rebound from itself here, so run this cell once per
# kernel session; running it again would merge `id_merged` in a second time.

id_features = pd.merge(id_features, id_merged, on='card_id', how='outer')

for col in id_features.columns:

    column = id_features[col]

    if set(column.unique()) == {0.0, 1.0}:
        # Exactly the values 0 and 1 occur (a NaN would break the equality)
        id_features[col] = column.astype(bool)
    elif column.dtype == 'int64':
        id_features[col] = column.astype('int32')
    elif column.dtype == 'float64' and "purchase" not in col:
        id_features[col] = column.astype('float32')

id_features.shape

(325540, 346)

In [152]:
# Report dtypes and memory usage of the id feature table;
# memory_usage='deep' also measures the contents of object columns.

id_features.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325540 entries, 0 to 325539
Columns: 346 entries, card_id to subsector_id_unique_c3_NA_0
dtypes: float32(266), int32(56), object(24)
memory usage: 862.8 MB


In [153]:
# Write the id feature table to CSV (index=False: the row index is not written)

id_features.to_csv('./Data/id_features.csv', index=False)

# Aggregating By Time

In [168]:
# Keep only the purchase timestamp and the boolean flags (used as conditioning groups)

time_source_cols = ['card_id', 'purchase_date'] + list(renaming.keys())

# Select and rename in a single expression (no in-place rename on a selection of df)
time_df = df[time_source_cols].rename(columns=renaming)
time_df.head()

Unnamed: 0,card_id,purchase_date,new_flag,auth_flag,c1,c2_2,c2_3,c2_4,c2_5,c2_NA,c3_B,c3_A,c3_NA
7258152,C_ID_da2090f28e,2017-01-01 00:00:08,False,True,False,False,False,False,False,False,False,False,False
18429662,C_ID_efced389a0,2017-01-01 00:00:59,False,True,False,False,True,False,False,False,True,False,False
14876034,C_ID_83561fe74a,2017-01-01 00:01:41,False,True,False,False,False,False,False,False,True,False,False
28523797,C_ID_479fd6392a,2017-01-01 00:02:03,False,True,True,False,False,False,False,True,True,False,False
19914410,C_ID_1cf6056088,2017-01-01 00:02:12,False,True,False,False,False,False,False,False,False,False,False


In [179]:
# More aggregation functions

def daysactive(series):
    """Return the whole days between the earliest and latest timestamp in ``series``.

    ``series`` is parsed with ``pd.to_datetime``; partial days are truncated
    by ``Timedelta.days``.
    """
    stamps = pd.to_datetime(series)
    first, last = stamps.min(), stamps.max()
    return (last - first).days

def frequency(series):
    """Return the number of entries per whole day spanned by ``series``.

    The span is ``(max - min).days`` after parsing with ``pd.to_datetime``.
    When the span is zero whole days, 0 is returned instead of dividing by zero.
    """
    stamps = pd.to_datetime(series)
    span_days = (stamps.max() - stamps.min()).days
    return len(stamps) / span_days if span_days != 0 else 0

In [175]:
# Per-card transaction count, first/last purchase_date, active span and frequency

time_features = time_df.groupby('card_id')['purchase_date'].agg(
    ['count', 'max', 'min', daysactive, frequency]
)

# Replace the statistic names with descriptive feature names (same order as above)
time_features.columns = ['count', 'latest_transaction', 'earliest_transaction', 'days_active', 'transaction_frequency']
time_features = time_features.reset_index()

time_features.head()

Unnamed: 0,card_id,count,latest_transaction,earliest_transaction,days_active,transaction_frequency
0,C_ID_00007093c1,151,2018-04-09 16:23:59,2017-02-14 14:00:43,419,0.360382
1,C_ID_0001238066,148,2018-04-30 19:57:30,2017-09-28 22:25:14,213,0.694836
2,C_ID_0001506ef0,67,2018-03-22 09:14:30,2017-01-14 16:16:01,431,0.155452
3,C_ID_0001793786,247,2017-12-31 17:35:56,2017-01-21 10:15:21,344,0.718023
4,C_ID_000183fdda,155,2018-04-30 14:59:53,2017-08-07 09:49:14,266,0.582707


In [183]:
# Fix one boolean at a time to either 0 or 1 and run statistics on purchase_date

def time_agg(time_df, bool_cols):
    """Compute purchase_date statistics conditioned on each flag.

    For every flag in ``bool_cols`` the rows of ``time_df`` are grouped by
    ``(card_id, flag)`` and count / max / min / daysactive / frequency of
    ``purchase_date`` are computed. The flag-True and flag-False results are
    placed side by side per card, with ``_<flag>_1`` / ``_<flag>_0`` suffixes.

    Returns a DataFrame with one row per card_id (None if ``bool_cols`` is
    empty). Cards that never take one of the two flag values get NaN in the
    corresponding columns. Progress is printed after every flag.
    """
    merged_features = None
    steps = len(bool_cols)
    start_time = time.time()

    for step, group in enumerate(bool_cols, start=1):

        grouped = time_df.groupby(['card_id', group])['purchase_date']
        per_flag = grouped.agg(['count', 'max', 'min', daysactive, frequency]).reset_index()

        per_flag.columns = ['card_id', group] + [f'count_{group}', f'latest_transaction_{group}', f'earliest_transaction_{group}', f'days_active_{group}', f'transaction_frequency_{group}']

        # Separate the rows by the value of the conditioning flag
        flag_on = per_flag[group].to_numpy()
        when_true = per_flag[flag_on].drop(columns=group)
        when_false = per_flag[~flag_on].drop(columns=group)
        when_true.columns = ['card_id'] + [f"{col}_1" for col in when_true.columns[1:]]
        when_false.columns = ['card_id'] + [f"{col}_0" for col in when_false.columns[1:]]

        both = pd.merge(when_true, when_false, on='card_id', how='outer')

        if merged_features is None:
            merged_features = both
        else:
            merged_features = pd.merge(merged_features, both, on='card_id')

        elapsed_time = time.time() - start_time
        print(f"[{step}/{steps}]\tshape:", merged_features.shape, "\ttime:", elapsed_time)

    return merged_features

In [184]:
# Build the flag-conditioned time statistics (one time_agg pass per flag in
# bool_cols) and preview the result. time_agg prints its running shape and
# elapsed time after each flag.

time_merged = time_agg(time_df, bool_cols)
time_merged.head()

[1/11]	shape: (325540, 11) 	time: 575.4006333351135
[2/11]	shape: (325540, 21) 	time: 1139.6197907924652
[3/11]	shape: (325540, 31) 	time: 1596.5925059318542
[4/11]	shape: (325540, 41) 	time: 1957.7057888507843
[5/11]	shape: (325540, 51) 	time: 2376.2382147312164
[6/11]	shape: (325540, 61) 	time: 2775.269195318222
[7/11]	shape: (325540, 71) 	time: 3214.710313796997
[8/11]	shape: (325540, 81) 	time: 3710.1078159809113
[9/11]	shape: (325540, 91) 	time: 4257.502503156662
[10/11]	shape: (325540, 101) 	time: 4750.073505163193
[11/11]	shape: (325540, 111) 	time: 5172.783169031143


Unnamed: 0,card_id,count_new_flag_1,latest_transaction_new_flag_1,earliest_transaction_new_flag_1,days_active_new_flag_1,transaction_frequency_new_flag_1,count_new_flag_0,latest_transaction_new_flag_0,earliest_transaction_new_flag_0,days_active_new_flag_0,...,count_c3_NA_1,latest_transaction_c3_NA_1,earliest_transaction_c3_NA_1,days_active_c3_NA_1,transaction_frequency_c3_NA_1,count_c3_NA_0,latest_transaction_c3_NA_0,earliest_transaction_c3_NA_0,days_active_c3_NA_0,transaction_frequency_c3_NA_0
0,C_ID_00007093c1,2.0,2018-04-09 16:23:59,2018-04-03 11:13:35,6.0,0.333333,149,2018-02-27 05:14:57,2017-02-14 14:00:43,377,...,,,,,,151,2018-04-09 16:23:59,2017-02-14 14:00:43,419,0.360382
1,C_ID_0001238066,25.0,2018-04-30 19:57:30,2018-03-01 16:48:27,60.0,0.416667,123,2018-02-27 16:18:59,2017-09-28 22:25:14,151,...,4.0,2018-03-16 18:49:21,2018-01-06 12:14:16,69.0,0.057971,144,2018-04-30 19:57:30,2017-09-28 22:25:14,213,0.676056
2,C_ID_0001506ef0,1.0,2018-03-22 09:14:30,2018-03-22 09:14:30,0.0,0.0,66,2018-02-17 12:33:56,2017-01-14 16:16:01,398,...,,,,,,67,2018-03-22 09:14:30,2017-01-14 16:16:01,431,0.155452
3,C_ID_0001793786,31.0,2017-12-31 17:35:56,2017-11-15 15:44:20,46.0,0.673913,216,2017-10-31 20:20:18,2017-01-21 10:15:21,283,...,,,,,,247,2017-12-31 17:35:56,2017-01-21 10:15:21,344,0.718023
4,C_ID_000183fdda,11.0,2018-04-30 14:59:53,2018-03-02 12:26:26,59.0,0.186441,144,2018-02-25 20:57:08,2017-08-07 09:49:14,202,...,5.0,2018-04-30 14:59:53,2017-12-28 16:56:20,122.0,0.040984,150,2018-04-26 13:46:18,2017-08-07 09:49:14,262,0.572519


In [190]:
# Join the per-card statistics with the flag-conditioned ones, then shrink dtypes.
# NOTE: time_features is rebound from itself here, so run this cell once per
# kernel session; running it again would merge `time_merged` in a second time.

time_features = pd.merge(time_features, time_merged, on='card_id', how='outer')

for col in time_features.columns:

    column = time_features[col]

    if column.dtype == 'int64':
        time_features[col] = column.astype('int32')
    elif column.dtype == 'float64':
        time_features[col] = column.astype('float32')

time_features.shape

(325540, 116)

In [191]:
# Report dtypes and memory usage of the time feature table;
# memory_usage='deep' also measures the contents of object columns.

time_features.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325540 entries, 0 to 325539
Columns: 116 entries, card_id to transaction_frequency_c3_NA_0
dtypes: float32(61), int32(8), object(47)
memory usage: 1.0 GB


In [194]:
# Write the time feature table to CSV (index=False: the row index is not written)

time_features.to_csv('./Data/time_features.csv', index=False)

# Misc Aggregation

In [195]:
# Get which id of each type the user has spent the most money on

def _max_spent(transactions, id_col, out_col):
    """Find, per card, the value of ``id_col`` with the largest total spend.

    Spend is the sum of ``abs(purchase_amount)`` over all rows sharing the same
    ``(card_id, id_col)`` pair.
    NOTE(review): the absolute value is kept from the original code; whether
    it is the right notion of "money spent" for this column should be confirmed.

    Parameters
    ----------
    transactions : DataFrame with ``card_id``, ``purchase_amount`` and ``id_col``.
    id_col : name of the identifier column to rank.
    out_col : name given to the resulting column.

    Returns
    -------
    (spend, top) where ``spend`` has columns card_id / id_col / purchase_amount
    (one row per pair) and ``top`` is indexed by card_id with the single column
    ``out_col``. Ties are resolved in favour of the first row in ``spend``.
    """
    # Vectorized equivalent of grouping and summing np.abs(x) in a Python lambda
    spend = (
        transactions['purchase_amount'].abs()
        .groupby([transactions['card_id'], transactions[id_col]])
        .sum()
        .reset_index()
    )
    # idxmax gives, per card, the row label of the largest spend; selecting those
    # rows directly avoids a Python-level apply over every card.
    top_rows = spend.loc[spend.groupby('card_id')['purchase_amount'].idxmax()]
    top = top_rows.set_index('card_id')[[id_col]].rename(columns={id_col: out_col})
    return spend, top

city_sum, max_city = _max_spent(df, 'city_id', 'city_max_spent')
merchant_category_sum, max_merchant_category = _max_spent(df, 'merchant_category_id', 'merchant_category_max_spent')
merchant_sum, max_merchant = _max_spent(df, 'merchant_id', 'merchant_max_spent')
state_sum, max_state = _max_spent(df, 'state_id', 'state_max_spent')
subsector_sum, max_subsector = _max_spent(df, 'subsector_id', 'subsector_max_spent')

# Merge the results (card_id is the index of every frame; the column order
# city, merchant_category, state, merchant, subsector is preserved)

merged_df = pd.merge(max_city, max_merchant_category, on='card_id', how='inner')
merged_df = pd.merge(merged_df, max_state, on='card_id', how='inner')
merged_df = pd.merge(merged_df, max_merchant, on='card_id', how='inner')
merged_df = pd.merge(merged_df, max_subsector, on='card_id', how='inner')
merged_df.head(3)

Unnamed: 0_level_0,city_max_spent,merchant_category_max_spent,state_max_spent,merchant_max_spent,subsector_max_spent
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C_ID_00007093c1,244,307,2,M_ID_9400cf2342,19
C_ID_0001238066,314,307,9,M_ID_d17aabd756,19
C_ID_0001506ef0,137,705,19,M_ID_b1fc88154d,33


In [196]:
# Report dtypes and memory usage of the max-spend features;
# memory_usage='deep' also measures the contents of object columns.

merged_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 325540 entries, C_ID_00007093c1 to C_ID_fffffd5772
Data columns (total 5 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   city_max_spent               325540 non-null  int64 
 1   merchant_category_max_spent  325540 non-null  int64 
 2   state_max_spent              325540 non-null  int64 
 3   merchant_max_spent           325540 non-null  object
 4   subsector_max_spent          325540 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 54.6 MB


In [197]:
# Save features to .csv
# merged_df is indexed by card_id, so move the index back into a column before
# writing; with index=False alone the card_id key would be left out of the file
# and its rows could no longer be matched to cards.

merged_df.reset_index().to_csv('./Data/extra_features.csv', index=False)

# Merging into a Single DataFrame

In [198]:
# Combine every feature table into one frame keyed by card_id.
# Outer merges, applied in this order, so no card is dropped.

final_df = bool_features
for feature_table in (num_features, id_features, time_features, merged_df):
    final_df = pd.merge(final_df, feature_table, on='card_id', how='outer')
final_df.shape

(325540, 2836)

In [199]:
# Preview the first rows of the combined feature table

final_df.head()

Unnamed: 0,card_id,new_flag_mean,new_flag_median,new_flag_sum,new_flag_sum0,new_flag_mode,new_flag_std,new_flag_skew,new_flag_changes,auth_flag_mean,...,count_c3_NA_0,latest_transaction_c3_NA_0,earliest_transaction_c3_NA_0,days_active_c3_NA_0,transaction_frequency_c3_NA_0,city_max_spent,merchant_category_max_spent,state_max_spent,merchant_max_spent,subsector_max_spent
0,C_ID_00007093c1,0.013245,0.0,2,149,False,0.114703,8.60116,2,0.768212,...,151,2018-04-09 16:23:59,2017-02-14 14:00:43,419,0.360382,244,307,2,M_ID_9400cf2342,19
1,C_ID_0001238066,0.168919,0.0,25,123,False,0.375953,1.785419,2,0.97973,...,144,2018-04-30 19:57:30,2017-09-28 22:25:14,213,0.676056,314,307,9,M_ID_d17aabd756,19
2,C_ID_0001506ef0,0.014925,0.0,1,66,False,0.122169,8.185352,2,0.940298,...,67,2018-03-22 09:14:30,2017-01-14 16:16:01,431,0.155452,137,705,19,M_ID_b1fc88154d,33
3,C_ID_0001793786,0.125506,0.0,31,216,False,0.331965,2.274647,2,0.890688,...,247,2017-12-31 17:35:56,2017-01-21 10:15:21,344,0.718023,179,705,-1,M_ID_923d57de8d,37
4,C_ID_000183fdda,0.070968,0.0,11,144,False,0.257603,3.374495,2,0.954839,...,150,2018-04-26 13:46:18,2017-08-07 09:49:14,262,0.572519,161,367,3,M_ID_f9cfe0a43b,16


In [200]:
# Report dtypes and memory usage of the combined feature table;
# memory_usage='deep' also measures the contents of object columns.

final_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325540 entries, 0 to 325539
Columns: 2836 entries, card_id to subsector_max_spent
dtypes: bool(15), float32(2247), float64(230), int32(269), int64(4), object(71)
memory usage: 5.0 GB


In [202]:
# Write the combined feature table to CSV (index=False: the row index is not written)

final_df.to_csv('./Data/features.csv', index=False)

# Adding Target from train.csv

In [210]:
# Read in the training and testing datasets and print their shapes

train_df = pd.read_csv('./Data/train.csv')
test_df = pd.read_csv('./Data/test.csv')

print(train_df.shape, test_df.shape)

(201917, 6) (123623, 5)


In [213]:
# Attach the engineered features to both the train and the test card lists.
# how='inner' keeps only card_ids present on both sides of each merge; the
# printed shapes let the row counts be compared with train_df / test_df.

train_final = pd.merge(train_df, final_df, on='card_id', how='inner')
test_final = pd.merge(test_df, final_df, on='card_id', how='inner')

print(train_final.shape, test_final.shape)

(201917, 2841) (123623, 2840)


In [214]:
# Write the final train and test tables to CSV (index=False: the row index is not written)

train_final.to_csv('./Data/train_final.csv', index=False)
test_final.to_csv('./Data/test_final.csv', index=False)