In [None]:
# Development switch read by the load, mean-encoding and save cells:
#   True  -> work on the small subset of the data (faster development)
#   False -> work on the entire dataset
use_partial_data = True

In [None]:
from IPython.display import display, HTML
# Inject a CSS rule that sets the notebook container width to 90%.
wide_container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(wide_container_css))

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import holidays
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
import time
import os

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# render up to 500 dataframe columns instead of the pandas default
pd.options.display.max_columns = 500

# the project root is taken to be the parent folder of the current working directory
cwd_path = os.path.abspath(os.getcwd())
project_root = os.path.split(cwd_path)[0]


# DATA

In [None]:
# Choose the input file according to the development switch; both files are read the same way.
input_file = ('data/preprocessed_data_small_v001.csv' if use_partial_data
              else 'data/preprocessed_data_v001.csv')
data_path = os.path.join(project_root, input_file)

# the first CSV column holds the row index
data = pd.read_csv(data_path, index_col=0)
# change Datetime from str to datetime
data['Datetime'] = pd.to_datetime(data['Datetime'], yearfirst=True)
display(data)

# FEATURE ENGINEERING

In [None]:
# number of distinct users in the data
n_users = data['User'].nunique()
print(f'Number of unique users: {n_users}')

# average gap between each row's Datetime and the previous row's Datetime
mean_time_between_transactions = data['Datetime'].diff().mean()
print(f'Mean time between consecutive transactions: {mean_time_between_transactions}')

# express the mean gap in days, then invert it to get transactions per day
mean_gap_in_days = mean_time_between_transactions / dt.timedelta(days=1)
mean_number_transactions_per_day = 1 / mean_gap_in_days
print(f'Mean number of transactions per day: {mean_number_transactions_per_day}')

## MEAN ENCODING

This section computes a mean encoding for the feature 'MCC' in such a manner that the mean is updated as time goes by and frauds occur. The easy (and fast) way to implement mean encoding here would be to calculate the mean over the whole dataset in one go (see below), but this would in principle introduce data leakage, i.e., information from the future would be available during training, which is generally not desirable (good test results, bad results once in production).

In [None]:
from scipy.sparse import csr_matrix
import sys

In [None]:
%%time
# narrow working copy: only the category and the label are needed for the encoding
aux = data[['MCC', 'Is Fraud?']].copy()

# one-hot-encode MCC
# OneHotEncoder returns a scipy sparse matrix by default in every scikit-learn release, so the
# `sparse=True` keyword is not needed; it was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, where passing it raises a TypeError.
mcc_values = np.array(aux['MCC']).reshape(-1, 1)
one_hot_encoder = OneHotEncoder().fit(mcc_values)
mcc_ohe_sparse = one_hot_encoder.transform(mcc_values)
display(mcc_ohe_sparse)

In [None]:
# Memory footprint of the sparse one-hot matrix in bytes (divide by 10**9 for GB).
# sys.getsizeof() only accounts for the Python object itself, not the arrays it refers to,
# so the three CSR buffers are summed instead.
mcc_ohe_sparse.data.nbytes + mcc_ohe_sparse.indices.nbytes + mcc_ohe_sparse.indptr.nbytes

In [None]:
%%time
# running count of occurrences per MCC column, accumulated down the rows;
# the sparse one-hot matrix is converted to a dense array first
mcc_cumsum = mcc_ohe_sparse.toarray().cumsum(axis=0)

In [None]:
# size of the dense cumulative-count array, in units of 10**9 bytes
sys.getsizeof(mcc_cumsum) / 1_000_000_000

In [None]:
# number of distinct MCC categories found by the encoder (= number of one-hot columns)
nr_columns = len(one_hot_encoder.categories_[0])
nr_columns

In [None]:
# repeat the fraud label across all MCC columns: one row per transaction, one column per MCC
fraud_label_column = np.asarray(aux['Is Fraud?']).reshape(-1, 1)
is_fraud_sparse = csr_matrix(np.tile(fraud_label_column, (1, nr_columns)))
is_fraud_sparse

In [None]:
# Memory footprint of the sparse fraud-label matrix in bytes (divide by 10**9 for GB).
# sys.getsizeof() only accounts for the Python object itself, not the arrays it refers to,
# so the three CSR buffers are summed instead.
is_fraud_sparse.data.nbytes + is_fraud_sparse.indices.nbytes + is_fraud_sparse.indptr.nbytes

In [None]:
# show both shapes; they must agree for the element-wise product that follows
display(is_fraud_sparse.shape, mcc_ohe_sparse.shape)

In [None]:
# element-wise product of the two indicator matrices: non-zero only where the
# transaction has the given MCC and is a fraud
mcc_is_fraud_sparse = mcc_ohe_sparse.multiply(is_fraud_sparse)
mcc_is_fraud_sparse

In [None]:
# running count of frauds per MCC column, accumulated down the rows (dense array)
mcc_is_fraud_cumsum = mcc_is_fraud_sparse.toarray().cumsum(axis=0)

In [None]:
# size of the dense cumulative-fraud array, in units of 10**9 bytes
sys.getsizeof(mcc_is_fraud_cumsum) / 1_000_000_000

In [None]:
# share of entries in mcc_is_fraud_cumsum that are exactly zero
nr_entries = mcc_is_fraud_cumsum.size
nr_zero_entries = nr_entries - np.count_nonzero(mcc_is_fraud_cumsum)
nr_zero_entries / nr_entries

In [None]:
%%time
# running fraud proportion per MCC: cumulative frauds divided by cumulative occurrences
# (entries for an MCC that has not occurred yet are 0/0 and therefore NaN)
mcc_fraud_proportion = mcc_is_fraud_cumsum / mcc_cumsum

In [None]:
# the two cumulative arrays are no longer needed once the proportion is computed; drop the names
del mcc_is_fraud_cumsum
del mcc_cumsum

In [None]:
# size of the dense fraud-proportion array, in units of 10**9 bytes
sys.getsizeof(mcc_fraud_proportion) / 1_000_000_000

In [None]:
# String label of every one-hot column, in encoder column order.
# `get_feature_names()` was removed in scikit-learn 1.2. It produced 'x0_<category>' names, so
# stringifying `categories_[0]` gives exactly what `col[3:]` used to, on every release.
mcc_columns = [str(category) for category in one_hot_encoder.categories_[0]]

In [None]:
# keep the column labels as a numpy array
mcc_columns = np.asarray(mcc_columns)
mcc_columns

In [None]:
%%time
# For every transaction pick the running fraud proportion of its own MCC, i.e. entry
# (row position, column of that row's MCC) of mcc_fraud_proportion.
# This is one vectorised lookup instead of a Python loop doing a label-based .loc access and a
# string comparison over all MCC labels per row; it also no longer depends on the index labels
# of `aux` being unique.
# NOTE(review): the cumulative sums include the current row, so each value also contains the
# transaction's own 'Is Fraud?' label -- confirm whether that is intended for a model feature.
mcc_column_position = {str(label): position for position, label in enumerate(mcc_columns)}
row_positions = np.arange(len(aux))
column_positions = aux['MCC'].astype(str).map(mcc_column_position).to_numpy()
fraud_proportions = mcc_fraud_proportion[row_positions, column_positions]

In [None]:
# store the encoding on the main dataframe and show the last 50 rows next to the raw MCC
data['MCC_mean_encoding'] = fraud_proportions
display(data.loc[:, ['MCC', 'MCC_mean_encoding']].iloc[-50:])

In [None]:
# sanity check, these should match each MCC's final entry in data['MCC_mean_encoding']
# The target column is selected before aggregating: averaging every column is wasted work and
# raises a TypeError on pandas >= 2.0 when the frame holds non-numeric columns (e.g. 'Time').
display(data.groupby('MCC')['Is Fraud?'].mean().to_dict())

In [None]:
# free memory by clearing variable: the dense proportion array has already been reduced to
# one value per transaction (fraud_proportions), so the full array is not needed any more
del mcc_fraud_proportion

#### Below is an alternative way of calculating the mean encoding, but it is too memory-demanding for use with the full dataset

In [None]:
# THIS CAN BE USED ONLY IF use_partial_data = True
# The cell materialises several dense helper columns per MCC inside a dataframe, so it is
# skipped for the full dataset; `fraud_proportions` from the array-based computation in the
# preceding cells is then left untouched.
if use_partial_data:
    # time different parts of the cell execution
    t0 = time.time()
    aux = data[['MCC', 'Is Fraud?']].copy()

    # one-hot-encode MCC
    # (sparse output is the OneHotEncoder default; the `sparse` keyword and `get_feature_names()`
    #  no longer exist in current scikit-learn, `categories_` is available on every release)
    mcc_values = np.array(aux['MCC']).reshape(-1, 1)
    one_hot_encoder = OneHotEncoder().fit(mcc_values)
    mcc_columns = [str(category) for category in one_hot_encoder.categories_[0]]
    aux[mcc_columns] = one_hot_encoder.transform(mcc_values).toarray()
    t1 = time.time()
    t_block = (t1-t0) / 60
    print(f'MCC columns one-hot-encoding done in {t_block} minutes.')

    # calculate expanding sum of MCC occurrences
    mcc_sum_columns = [col+'_sum' for col in mcc_columns]
    aux[mcc_sum_columns] = aux[mcc_columns].expanding().sum()
    t2 = time.time()
    t_block = (t2-t1) / 60
    print(f'mcc_sum_columns calculated in {t_block} minutes.')

    # calculate where given MCC and fraud co-occur
    mcc_is_fraud_columns = [col+'_is_fraud' for col in mcc_columns]
    is_fraud_tiled = np.tile(aux['Is Fraud?'], (len(mcc_columns), 1)).T
    aux[mcc_is_fraud_columns] = np.multiply(is_fraud_tiled, aux[mcc_columns])
    t3 = time.time()
    t_block = (t3-t2) / 60
    print(f'mcc_is_fraud_columns calculated in {t_block} minutes.')

    # calculate MCC-specific expanding sum of frauds
    mcc_is_fraud_sum_columns = [col+'_is_fraud_sum' for col in mcc_columns]
    aux[mcc_is_fraud_sum_columns] = aux[mcc_is_fraud_columns].expanding().sum()
    t4 = time.time()
    t_block = (t4-t3) / 60
    print(f'mcc_is_fraud_sum_columns calculated {t_block} minutes.')

    # calculate expanding proportion of frauds
    # The two operands have different column labels ('<MCC>_is_fraud_sum' vs '<MCC>_sum').
    # pandas >= 2.0 aligns dataframes inside numpy ufuncs, which would produce only NaN here,
    # so the division is done position-wise on the underlying arrays. 0/0 (MCC not seen yet)
    # stays NaN, hence the silenced numpy warnings.
    mcc_proportion_frauds_columns = [col+'_proportion_frauds' for col in mcc_columns]
    with np.errstate(divide='ignore', invalid='ignore'):
        aux[mcc_proportion_frauds_columns] = (aux[mcc_is_fraud_sum_columns].to_numpy()
                                              / aux[mcc_sum_columns].to_numpy())
    t5 = time.time()
    t_block = (t5-t4) / 60
    print(f'mcc_proportion_frauds_columns calculated {t_block} minutes.')

    # collect correct fraud proportion for each index matching the original data:
    # one vectorised (row position, column of the row's own MCC) lookup instead of a
    # label-based .loc access per row
    proportion_matrix = aux[mcc_proportion_frauds_columns].to_numpy()
    mcc_column_position = {label: position for position, label in enumerate(mcc_columns)}
    column_positions = aux['MCC'].astype(str).map(mcc_column_position).to_numpy()
    fraud_proportions = proportion_matrix[np.arange(len(aux)), column_positions]

    # total time is taken after the lookup so that it covers the whole cell
    t_cell = (time.time()-t0) / 60
    print(f'Cell executed in {t_cell} minutes.')

    display(aux[mcc_proportion_frauds_columns])
else:
    print('Skipping the dataframe-based mean encoding (use_partial_data is False).')

In [None]:
# sanity check, these should match the final row of the '<MCC>_proportion_frauds' columns of aux
# The target column is selected before aggregating: averaging every column is wasted work and
# raises a TypeError on pandas >= 2.0 when the frame holds non-numeric columns (e.g. 'Time').
data.groupby('MCC')['Is Fraud?'].mean().to_dict()

In [None]:
# add mean encoding for MCC to the main dataframe
# (overwrites the 'MCC_mean_encoding' column written earlier with the current contents of
#  fraud_proportions; values are assigned by position)
data['MCC_mean_encoding'] = fraud_proportions
data

# Keep track of fraud proportions by transaction type over different moving time windows

In [None]:
# A transaction counts as card-present when either the chip or the swipe flag is set:
# take the row-wise maximum of the two indicator columns
data['card_present_transaction'] = data[['Chip Transaction', 'Swipe Transaction']].max(axis='columns')
#display(data)

In [None]:
# order the rows by datetime before the time-ordered computations that follow
data = data.sort_values('Datetime')
#display(data)

In [None]:
# auxiliary features: 1 where the transaction is a fraud AND of the given transaction type
fraud_type_sources = {
    'fraud_swipe': 'Swipe Transaction',
    'fraud_chip': 'Chip Transaction',
    'fraud_online': 'Online Transaction',
    'fraud_card_present': 'card_present_transaction',
}
is_fraud_mask = data['Is Fraud?'].eq(1)
for fraud_column, transaction_column in fraud_type_sources.items():
    data[fraud_column] = (is_fraud_mask & data[transaction_column].eq(1)).astype(int)

#display(data)

In [None]:
%%time
# (output column prefix, source indicator column); the order fixes the order of the new columns
fraud_indicator_columns = [
    ('fraud', 'Is Fraud?'),
    ('fraud_swipe', 'fraud_swipe'),
    ('fraud_chip', 'fraud_chip'),
    ('fraud_online', 'fraud_online'),
    ('fraud_card_present', 'fraud_card_present'),
]
# (output column suffix, pandas offset of the look-back window)
rolling_windows = [('30_days', '30d'), ('60_days', '60d'), ('365_days', '365d'), ('2_years', '730d')]

# time-based rolling windows need the datetime as index; the previous index is kept
# in the 'index' column created by reset_index()
data = data.reset_index().set_index('Datetime')

# rolling mean of every fraud indicator over every window; closed='left' leaves the
# current row out of its own window
for prefix, indicator_column in fraud_indicator_columns:
    for suffix, offset in rolling_windows:
        data[f'{prefix}_rolling_mean_{suffix}'] = data[indicator_column].rolling(offset, closed='left').mean()

# reset index back to original
data = data.reset_index().set_index('index')

# add delay in information (typically it would not be immediately known/verified whether any given
# transaction was a fraud or not): the delay in days is converted to a number of rows using the
# mean number of transactions per day, and every rolling column is shifted down by that many rows
delay_days = 7 # how many days information about frauds is delayed
n_rows_to_shift = int(round(delay_days * mean_number_transactions_per_day, 0))

for prefix, _ in fraud_indicator_columns:
    for suffix, _ in rolling_windows:
        rolling_column = f'{prefix}_rolling_mean_{suffix}'
        data[rolling_column] = data[rolling_column].shift(n_rows_to_shift)

#display(data)

In [None]:
%%time
# compare different fraud types' recent averages to longer term averages (find spikes and lows)
fraud_prefixes = ['fraud', 'fraud_swipe', 'fraud_chip', 'fraud_online', 'fraud_card_present']
for prefix in fraud_prefixes:
    for recent in ('30_days', '60_days'):
        for baseline in ('365_days', '2_years'):
            data[f'{prefix}_rolling_{recent}_relative_to_{baseline}'] = (
                data[f'{prefix}_rolling_mean_{recent}'] / data[f'{prefix}_rolling_mean_{baseline}']
            )

# rolling proportions relative all frauds: each type-specific rolling mean divided by the
# overall fraud rolling mean of the same window
for window in ('30_days', '60_days', '365_days', '2_years'):
    for prefix in fraud_prefixes[1:]:
        data[f'{prefix}_rolling_{window}_relative_to_all_frauds'] = (
            data[f'{prefix}_rolling_mean_{window}'] / data[f'fraud_rolling_mean_{window}']
        )

#display(data)

In [None]:
# the helper flags combine the fraud label with the transaction type, so they would
# indicate fraudulent transactions directly; remove them from the data
drop_features = ['fraud_swipe', 'fraud_chip', 'fraud_online', 'fraud_card_present']
data = data.drop(columns=drop_features)
#display(data)

## Time-related features

In [None]:
%%time
# All features below are computed column-wise instead of with row-wise .apply lambdas.

# get the hour component of the time (first two characters of the 'Time' string)
data['hour'] = data['Time'].str[0:2].astype(int)
# get day of week (Monday=0, ..., Sunday=6)
data['day_of_week'] = data['Datetime'].dt.weekday.astype(int)

# create cyclical time features (e.g. hour 23 is closer to 0 than 21)
hour_angle = (data['hour'] / 24) * 2 * np.pi
data['hour_sin'] = np.sin(hour_angle)
data['hour_cos'] = np.cos(hour_angle)

# months are shifted to start at 0 before being mapped onto the circle
month_angle = ((data['Month'] - 1) / 12) * 2 * np.pi
data['month_sin'] = np.sin(month_angle)
data['month_cos'] = np.cos(month_angle)

day_of_week_angle = (data['day_of_week'] / 7) * 2 * np.pi
data['day_of_week_sin'] = np.sin(day_of_week_angle)
data['day_of_week_cos'] = np.cos(day_of_week_angle)

# add indicator for holidays
# the calendar is queried once per distinct date and the result is mapped back onto the rows
us_holidays = holidays.US()
transaction_dates = data['Datetime'].dt.date
holiday_flag_by_date = {day: int(day in us_holidays) for day in transaction_dates.unique()}
data['is_holiday'] = transaction_dates.map(holiday_flag_by_date)

# add indicator for weekends (Saturday=5, Sunday=6)
data['weekend'] = data['day_of_week'].isin([5, 6]).astype(int)

# add indicator for whether the year is 2015 or later (known change in transaction type)
data['is_2015_or_later'] = (data['Year'] >= 2015).astype(int)

In [None]:
# order the rows by datetime again before the per-user computations
data = data.sort_values('Datetime')
#display(data)

In [None]:
#local_vars = list(locals().items())
#for var, obj in local_vars:
#    print(var, sys.getsizeof(obj))

## User-specific features

In [None]:
# Per-user columns that are copied into the main dataframe, in the order they are added.
new_features = ['mean_amount', 'mean_amount_last_year', 'mean_amount_last_30_days', 'mean_amount_last_7_days',
                'mean_amount_last_2_days', 'mean_amount_last_1_days', 'mean_amount_last_7_days_relative_to_last_year',
                'mean_amount_last_2_days_relative_to_last_year', 'mean_amount_last_1_days_relative_to_last_year',
                'mean_amount_last_7_days_relative_to_last_30_days', 'mean_amount_last_2_days_relative_to_last_30_days',
                'mean_amount_last_1_days_relative_to_last_30_days', 'transaction_count', 'days_since_first_transaction',
                'transaction_frequency_all', 'transaction_frequency_last_year', 'transaction_frequency_last_30_days',
                'transaction_frequency_last_7_days', 'transaction_frequency_last_2_days',
                'transaction_frequency_last_1_days', '1_days_transaction_frequency_relative_to_last_30_days',
                '1_days_transaction_frequency_relative_to_last_year', '2_days_transaction_frequency_relative_to_last_30_days',
                '2_days_transaction_frequency_relative_to_last_year', '7_days_transaction_frequency_relative_to_last_30_days',
                '7_days_transaction_frequency_relative_to_last_year']

# Look-back windows as (feature-name suffix, pandas offset, window length in days).
lookback_windows = [('last_year', '365d', 365), ('last_30_days', '30d', 30), ('last_7_days', '7d', 7),
                    ('last_2_days', '2d', 2), ('last_1_days', '1d', 1)]

# calculate user-specific features over time
for user in tqdm(data['User'].unique()):

    # transaction history for a given user; the datetime becomes the index for the time-based
    # rolling windows, the original row label is kept in the 'index' column
    aux = data.loc[data['User']==user, ['Datetime', 'Amount']].copy()
    aux = aux.reset_index().set_index('Datetime')
    # column of ones whose rolling sum is the number of transactions inside a window
    aux['transaction_count_auxiliary'] = 1

    ###
    # SPENDING
    ###

    # mean 'Amount' over all earlier transactions: expanding mean moved one row down so that
    # the current transaction is not part of its own mean (0 for the first transaction)
    aux['mean_amount'] = aux['Amount'].expanding().mean().shift(1, fill_value=0)
    # mean transaction amount for various time windows; closed='left' leaves the current
    # transaction out, windows without any earlier transaction give 0
    for suffix, offset, _ in lookback_windows:
        aux[f'mean_amount_{suffix}'] = aux['Amount'].rolling(offset, closed='left').mean().fillna(0)
    # mean amount spent over short time periods relative to longer time periods
    for recent in ('last_7_days', 'last_2_days', 'last_1_days'):
        for baseline in ('last_year', 'last_30_days'):
            aux[f'mean_amount_{recent}_relative_to_{baseline}'] = (
                aux[f'mean_amount_{recent}'] / aux[f'mean_amount_{baseline}']
            )

    ###
    # TRANSACTION FREQUENCY
    ###

    # datetime of the first row of this user's history
    first_transaction_datetime = aux.index[0]
    # increasing count of total transactions for given customer
    aux['transaction_count'] = np.arange(len(aux)) + 1
    # time since the first transaction, in days
    aux['days_since_first_transaction'] = (aux.index - first_transaction_datetime) / dt.timedelta(days=1)
    days_active = aux['days_since_first_transaction']
    # all time transaction frequency not including current transaction (0 while no time has passed)
    lifetime_rate = (aux['transaction_count'] - 1) / days_active
    aux['transaction_frequency_all'] = lifetime_rate.where(days_active > 0, 0)

    # transactions per day inside each window, not including the current transaction; the
    # divisor is the window length capped at the time elapsed since the first transaction
    for suffix, offset, window_days in lookback_windows:
        transactions_in_window = aux['transaction_count_auxiliary'].rolling(offset, closed='left').sum()
        observed_days = days_active.clip(upper=window_days)
        aux[f'transaction_frequency_{suffix}'] = (transactions_in_window / observed_days).fillna(0)

    # short-window frequencies relative to the longer windows
    for n_days in (1, 2, 7):
        for baseline in ('last_30_days', 'last_year'):
            aux[f'{n_days}_days_transaction_frequency_relative_to_{baseline}'] = (
                aux[f'transaction_frequency_last_{n_days}_days'] / aux[f'transaction_frequency_{baseline}']
            )

    # reset index to original index for adding features to main dataframe
    aux = aux.reset_index().set_index('index')
    # add features to main dataframe (rows are matched by index label)
    data.loc[aux.index, new_features] = aux[new_features]

#display(data)

In [None]:
# Write the engineered dataset next to the input data; the file name depends on the
# development switch so the small and full outputs do not overwrite each other.
output_file = ('data/preprocessed_data_with_feature_engineering_small.csv' if use_partial_data
               else 'data/preprocessed_data_with_feature_engineering.csv')
data_path = os.path.join(project_root, output_file)
data.to_csv(data_path)