# Customer Loyalty Feature Engineering

In [2]:
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
import time
from datetime import datetime 
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('./Data/encoded_df.csv')
df = df.sort_values(by='purchase_date')
df.head(3)

Unnamed: 0,card_id,city_id,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,...,new_merchant_flag,category_1,category_2_2.0,category_2_3.0,category_2_4.0,category_2_5.0,category_2_NA,category_3_B,category_3_C,category_3_NA
7258152,C_ID_da2090f28e,69,0,623,M_ID_f001319a61,-11,-0.686802,2017-01-01 00:00:08,9,4,...,False,False,False,False,False,False,False,False,False,False
18429662,C_ID_efced389a0,76,1,842,M_ID_18038b5ae7,-12,-0.56659,2017-01-01 00:00:59,2,37,...,False,False,False,True,False,False,False,True,False,False
14876034,C_ID_83561fe74a,233,1,661,M_ID_52d3026407,-13,-0.559227,2017-01-01 00:01:41,9,8,...,False,False,False,False,False,False,False,True,False,False


# Aggregating Boolean Columns

In [5]:
# Shorten column names and isolate boolean features

renaming = {
       'new_merchant_flag': 'new_flag',
       'authorized_flag': 'auth_flag',
       'category_1': 'c1',
       'category_2_2.0': 'c2_2',
       'category_2_3.0': 'c2_3', 
       'category_2_4.0': 'c2_4',
       'category_2_5.0': 'c2_5', 
       'category_2_NA': 'c2_NA', 
       'category_3_B': 'c3_B',
       'category_3_C': 'c3_A', 
       'category_3_NA': 'c3_NA'
}

bool_cols = list(renaming.values())

bool_df = df[['card_id'] + list(renaming.keys())]
bool_df.rename(columns=renaming, inplace=True)
bool_df.head()

Unnamed: 0,card_id,new_flag,auth_flag,c1,c2_2,c2_3,c2_4,c2_5,c2_NA,c3_B,c3_A,c3_NA
7258152,C_ID_da2090f28e,False,True,False,False,False,False,False,False,False,False,False
18429662,C_ID_efced389a0,False,True,False,False,True,False,False,False,True,False,False
14876034,C_ID_83561fe74a,False,True,False,False,False,False,False,False,True,False,False
28523797,C_ID_479fd6392a,False,True,True,False,False,False,False,True,True,False,False
19914410,C_ID_1cf6056088,False,True,False,False,False,False,False,False,False,False,False


In [43]:
# Defining aggregation functions

def sum0(series):
    sum1 = series.sum()
    sum0 = len(series) - sum1
    return sum0

def quantile25(series):
    return series.quantile(0.25)

def quantile75(series):
    return series.quantile(0.75)

def mode(series):
    return series.mode().iloc[0] if not series.empty else np.nan

def changes(series):
    return (series != series.shift()).sum()

In [89]:
# Aggregating by card_id

bool_features = bool_df.groupby('card_id')[bool_cols].agg([
    'mean',
    'median',
    'sum',
    sum0,
    mode,
    'std',
    'skew',
    changes
])

bool_features.columns = [f"{col}_{agg}" for col, agg in bool_features.columns]
bool_features.reset_index()

bool_features = bool_features.astype({
    col: 'float32' if bool_features[col].dtype == 'float64' else 'int32' if bool_features[col].dtype == 'int64' else bool_features[col].dtype
    for col in bool_features.columns
})

bool_features.head()

Unnamed: 0_level_0,new_flag_mean,new_flag_median,new_flag_sum,new_flag_sum0,new_flag_mode,new_flag_std,new_flag_skew,new_flag_changes,auth_flag_mean,auth_flag_median,...,c3_A_skew,c3_A_changes,c3_NA_mean,c3_NA_median,c3_NA_sum,c3_NA_sum0,c3_NA_mode,c3_NA_std,c3_NA_skew,c3_NA_changes
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_00007093c1,0.013245,0.0,2,149,False,0.114703,8.60116,2,0.768212,1.0,...,1.884419,34,0.0,0.0,0,151,False,0.0,0.0,1
C_ID_0001238066,0.168919,0.0,25,123,False,0.375953,1.785419,2,0.97973,1.0,...,1.209177,51,0.027027,0.0,4,144,False,0.162713,5.893231,9
C_ID_0001506ef0,0.014925,0.0,1,66,False,0.122169,8.185352,2,0.940298,1.0,...,0.0,1,0.0,0.0,0,67,False,0.0,0.0,1
C_ID_0001793786,0.125506,0.0,31,216,False,0.331965,2.274647,2,0.890688,1.0,...,0.0,1,0.0,0.0,0,247,False,0.0,0.0,1
C_ID_000183fdda,0.070968,0.0,11,144,False,0.257603,3.374495,2,0.954839,1.0,...,1.078235,56,0.032258,0.0,5,150,False,0.177257,5.346532,8


In [113]:
# Fix one boolean at a time to either 0 or 1 and run statistics on the other booleans

def bool_agg(bool_df, bool_cols):

    merged_features = None
    steps = len(bool_cols)
    start_time = time.time()

    for i, group in enumerate(bool_cols):

        prefix = group.split('_')[0]
        filtered_cols = [value for value in bool_cols if prefix not in value]

        features = bool_df.groupby(['card_id', group])[filtered_cols].agg([
            'mean',
            'median',
            'sum',
            sum0
        ]).reset_index()

        features.columns = ['card_id', group] + [f"{col}_{agg}_{group}" for col, agg in features.columns[2:]]

        features = features.astype({
            col: 'float32' if features[col].dtype == 'float64' 
            else 'int32' if features[col].dtype == 'int64' 
            else features[col].dtype 
            for col in features.columns
        })

        features1 = features[features[group].to_numpy()]
        features0 = features[~features[group].to_numpy()]
        features1.drop(group, axis=1, inplace=True)
        features0.drop(group, axis=1, inplace=True)
        features1.columns = ['card_id'] + [f"{col}_1" for col in features1.columns[1:]]
        features0.columns = ['card_id'] + [f"{col}_0" for col in features0.columns[1:]]

        features = pd.merge(features1, features0, on='card_id', how='outer')

        if merged_features is None:
            merged_features = features
        else:
            merged_features = pd.merge(merged_features, features, on='card_id')

        elapsed_time = time.time() - start_time
        print(f"[{i+1}/{steps}]\tshape:", merged_features.shape, "\ttime:", elapsed_time)

    return merged_features

In [85]:
# Aggregate using above function

merged = bool_agg(bool_df, bool_cols)
merged.head()

[1/11]	shape: (325540, 81) 	time: 165.7909939289093
[2/11]	shape: (325540, 161) 	time: 330.08673906326294
[3/11]	shape: (325540, 241) 	time: 463.56731629371643
[4/11]	shape: (325540, 289) 	time: 531.3272840976715
[5/11]	shape: (325540, 337) 	time: 606.7221009731293
[6/11]	shape: (325540, 385) 	time: 676.7104630470276
[7/11]	shape: (325540, 433) 	time: 754.7787525653839
[8/11]	shape: (325540, 481) 	time: 838.8245973587036
[9/11]	shape: (325540, 545) 	time: 960.0623030662537
[10/11]	shape: (325540, 609) 	time: 1074.687548160553
[11/11]	shape: (325540, 673) 	time: 1175.1983017921448


Unnamed: 0,card_id,auth_flag_mean_new_flag_1,auth_flag_median_new_flag_1,auth_flag_sum_new_flag_1,auth_flag_sum0_new_flag_1,c1_mean_new_flag_1,c1_median_new_flag_1,c1_sum_new_flag_1,c1_sum0_new_flag_1,c2_2_mean_new_flag_1,...,c2_4_sum_c3_NA_0,c2_4_sum0_c3_NA_0,c2_5_mean_c3_NA_0,c2_5_median_c3_NA_0,c2_5_sum_c3_NA_0,c2_5_sum0_c3_NA_0,c2_NA_mean_c3_NA_0,c2_NA_median_c3_NA_0,c2_NA_sum_c3_NA_0,c2_NA_sum0_c3_NA_0
0,C_ID_00007093c1,1.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0,151,0.006623,0.0,1,150,0.18543,0.0,28,123
1,C_ID_0001238066,1.0,1.0,25.0,0.0,0.08,0.0,2.0,23.0,0.0,...,0,144,0.152778,0.0,22,122,0.0625,0.0,9,135
2,C_ID_0001506ef0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,67,0.0,0.0,0,67,0.0,0.0,0,67
3,C_ID_0001793786,1.0,1.0,31.0,0.0,0.0,0.0,0.0,31.0,0.258065,...,0,247,0.004049,0.0,1,246,0.469636,0.0,116,131
4,C_ID_000183fdda,1.0,1.0,11.0,0.0,0.0,0.0,0.0,11.0,0.0,...,0,150,0.006667,0.0,1,149,0.026667,0.0,4,146


In [98]:
# Merge our two types of aggregations and consolidate

bool_features = pd.merge(bool_features, merged, on='card_id', how='outer')

for col in bool_features.columns:

    unique_values = bool_features[col].unique()

    if set(unique_values) == {0.0, 1.0}:
        bool_features[col] = bool_features[col].astype(bool)
    elif bool_features[col].dtype == 'int64':
        bool_features[col] = bool_features[col].astype('int32')
    elif bool_features[col].dtype == 'float64':
        bool_features[col] = bool_features[col].astype('float32')

bool_features.shape

(325540, 761)

In [101]:
# Check memory usage

bool_features.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325540 entries, 0 to 325539
Columns: 761 entries, card_id to c2_NA_sum0_c3_NA_0
dtypes: bool(11), float32(660), int32(89), object(1)
memory usage: 955.9 MB


In [103]:
# Save features to .csv

bool_features.to_csv('./Data/bool_features.csv', index=False)

# Aggregating Numerical Columns

In [105]:
# Isolate relevant features, we will still make use of booleans

num_cols = [
    'installments', 'month_lag', 'purchase_amount', 'year', 'month', 'day', 'hour'
]

num_df = df[['card_id'] + num_cols + list(renaming.keys())]
num_df.rename(columns=renaming, inplace=True)
num_df.head()

Unnamed: 0,card_id,installments,month_lag,purchase_amount,year,month,day,hour,new_flag,auth_flag,c1,c2_2,c2_3,c2_4,c2_5,c2_NA,c3_B,c3_A,c3_NA
7258152,C_ID_da2090f28e,0,-11,-0.686802,2017,1,1,0,False,True,False,False,False,False,False,False,False,False,False
18429662,C_ID_efced389a0,1,-12,-0.56659,2017,1,1,0,False,True,False,False,True,False,False,False,True,False,False
14876034,C_ID_83561fe74a,1,-13,-0.559227,2017,1,1,0,False,True,False,False,False,False,False,False,True,False,False
28523797,C_ID_479fd6392a,1,-1,-0.737892,2017,1,1,0,False,True,True,False,False,False,False,True,True,False,False
19914410,C_ID_1cf6056088,0,-4,0.004418,2017,1,1,0,False,True,False,False,False,False,False,False,False,False,False


In [111]:
# More aggregation functions

def range(series):
    return series.max() - series.min()

In [112]:
# Aggregating by card_id

num_features = num_df.groupby('card_id')[num_cols].agg([
    'mean',
    'median',
    'sum',
    mode,
    'std',
    'skew',
    'min',
    'max',
    range,
    'var'
])

num_features.columns = [f"{col}_{agg}" for col, agg in num_features.columns]
num_features.reset_index()

num_features = num_features.astype({
    col: 'int32' if num_features[col].dtype == 'int64' else num_features[col].dtype
    for col in num_features.columns
})

num_features.head()

Unnamed: 0_level_0,installments_mean,installments_median,installments_sum,installments_mode,installments_std,installments_skew,installments_min,installments_max,installments_range,installments_var,...,hour_mean,hour_median,hour_sum,hour_mode,hour_std,hour_skew,hour_min,hour_max,hour_range,hour_var
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_00007093c1,1.284768,1.0,194,1,0.760504,3.339258,1,6,5,0.578366,...,14.403974,15.0,2175,16,4.209796,-0.513693,0,22,22,17.722384
C_ID_0001238066,1.614865,1.0,239,1,1.601545,3.036936,-1,10,11,2.564948,...,14.844595,16.0,2197,19,5.873627,-0.939797,0,23,23,34.499494
C_ID_0001506ef0,0.014925,0.0,1,0,0.122169,8.185353,0,1,1,0.014925,...,12.552239,12.0,841,12,3.163636,-0.609968,0,21,21,10.008593
C_ID_0001793786,0.020243,0.0,5,0,0.141116,6.854971,0,1,1,0.019914,...,15.080972,15.0,3725,13,5.103156,-1.059279,0,23,23,26.042197
C_ID_000183fdda,1.806452,1.0,280,1,2.0705,2.681747,-1,10,11,4.286971,...,16.393548,18.0,2541,22,5.408623,-1.125058,0,23,23,29.253205


In [128]:
# Fix one boolean at a time to either 0 or 1 and run statistics on all the numerical features

def num_agg(num_df, num_cols, bool_cols):

    merged_features = None
    steps = len(bool_cols)
    start_time = time.time()

    for i, group in enumerate(bool_cols):

        features = num_df.groupby(['card_id', group])[num_cols].agg([
            'mean'
            'median',
            'sum',
            mode,
            'std',
            'skew',
            'min',
            'max',
            range,
            'var'
        ]).reset_index()

        features.columns = ['card_id', group] + [f"{col}_{agg}_{group}" for col, agg in features.columns[2:]]

        features = features.astype({
            col: 'int32' if features[col].dtype == 'int64' else features[col].dtype
            for col in features.columns
        })

        features1 = features[features[group].to_numpy()]
        features0 = features[~features[group].to_numpy()]
        features1.drop(group, axis=1, inplace=True)
        features0.drop(group, axis=1, inplace=True)
        features1.columns = ['card_id'] + [f"{col}_1" for col in features1.columns[1:]]
        features0.columns = ['card_id'] + [f"{col}_0" for col in features0.columns[1:]]

        features = pd.merge(features1, features0, on='card_id', how='outer')

        if merged_features is None:
            merged_features = features
        else:
            merged_features = pd.merge(merged_features, features, on='card_id')

        elapsed_time = time.time() - start_time
        print(f"[{i+1}/{steps}]\tshape:", merged_features.shape, "\ttime:", elapsed_time)

    return merged_features

In [129]:
# Aggregate using above function

num_merged = num_agg(num_df, num_cols, bool_cols)
num_merged.head()

[1/11]	shape: (325540, 15) 	time: 11.341003656387329
[2/11]	shape: (325540, 29) 	time: 23.574164867401123
[3/11]	shape: (325540, 43) 	time: 34.1645348072052
[4/11]	shape: (325540, 57) 	time: 43.942261934280396
[5/11]	shape: (325540, 71) 	time: 54.00503945350647
[6/11]	shape: (325540, 85) 	time: 63.753761291503906
[7/11]	shape: (325540, 99) 	time: 73.84504342079163
[8/11]	shape: (325540, 113) 	time: 84.18587017059326
[9/11]	shape: (325540, 127) 	time: 94.50669312477112
[10/11]	shape: (325540, 141) 	time: 105.05255579948425
[11/11]	shape: (325540, 155) 	time: 115.72994184494019


Unnamed: 0,card_id,installments_mean_new_flag_1,month_lag_mean_new_flag_1,purchase_amount_mean_new_flag_1,year_mean_new_flag_1,month_mean_new_flag_1,day_mean_new_flag_1,hour_mean_new_flag_1,installments_mean_new_flag_0,month_lag_mean_new_flag_0,...,month_mean_c3_NA_1,day_mean_c3_NA_1,hour_mean_c3_NA_1,installments_mean_c3_NA_0,month_lag_mean_c3_NA_0,purchase_amount_mean_c3_NA_0,year_mean_c3_NA_0,month_mean_c3_NA_0,day_mean_c3_NA_0,hour_mean_c3_NA_0
0,C_ID_00007093c1,1.0,2.0,-0.664262,2018.0,4.0,6.0,13.5,1.288591,-5.852349,...,,,,1.284768,-5.748344,-0.517706,2017.15894,6.344371,13.152318,14.403974
1,C_ID_0001238066,1.64,1.36,-0.565989,2018.0,3.36,18.0,15.36,1.609756,-1.813008,...,1.5,15.75,13.5,1.6875,-1.298611,-0.596689,2017.493056,6.784722,16.472222,14.881944
2,C_ID_0001506ef0,0.0,1.0,-0.732001,2018.0,3.0,22.0,9.0,0.015152,-4.833333,...,,,,0.014925,-4.746269,-0.527371,2017.208955,6.746269,12.149254,12.552239
3,C_ID_0001793786,0.0,1.322581,-0.007407,2017.0,11.322581,23.612903,11.419355,0.023148,-3.328704,...,,,,0.020243,-2.744939,-0.149861,2017.0,7.255061,17.05668,15.080972
4,C_ID_000183fdda,1.454545,1.272727,-0.599162,2018.0,3.272727,11.727273,15.454545,1.833333,-2.451389,...,6.0,22.2,12.6,1.9,-2.233333,-0.491502,2017.426667,6.646667,13.593333,16.52
