In [16]:
# Import necessary ibraries

import pandas as pd
import numpy as np

In [17]:
# Loading the data files
df = pd.read_csv("mnt/data/reference.csv")
# df_2 = pd.read_csv("mnt/data/analysis.csv")
#
# # merging the analysis and reference files
# df = pd.concat([df_1, df_2])

# printing the no.of rows and no.of columns
print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

# converting the timestamp into date and adding the date column
df['date'] = pd.to_datetime(df['timestamp']).dt.date

print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])
df.head()

Number of rows:  50207
Number of columns:  7
Number of rows:  50207
Number of columns:  8


Unnamed: 0,timestamp,time_since_login_min,transaction_amount,transaction_type,is_first_transaction,user_tenure_months,is_fraud,date
0,2018-01-01 00:00:00.000,1.56175,3981.1,PAYMENT,False,0.31898,1.0,2018-01-01
1,2018-01-01 00:08:43.152,1.658074,1267.9,PAYMENT,False,7.391323,0.0,2018-01-01
2,2018-01-01 00:17:26.304,2.454287,1984.7,CASH-IN,False,0.781225,1.0,2018-01-01
3,2018-01-01 00:26:09.456,2.392085,2265.2,CASH-OUT,False,0.680473,1.0,2018-01-01
4,2018-01-01 00:34:52.608,2.189806,2126.8,CASH-IN,False,8.542895,1.0,2018-01-01


In [18]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Creation of time-based features
df['transaction_hour'] = df.timestamp.dt.hour
df['transaction_day'] = df.timestamp.dt.dayofweek

# Creation of Log based features
df['amount_log'] = np.log1p(df['transaction_amount'])

# Creation binary flags
df['high_amount_flag'] = (df['transaction_amount'] > df['transaction_amount'].median()).astype(int)


# Interaction Feature creation
df['amount_x_time'] = df['transaction_amount'] * df['time_since_login_min']

# Replacing null values of transaction_type with its mode
df['transaction_type'].fillna(df['transaction_type'].mode()[0], inplace=True)

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




In [12]:
df_2 = df.copy()

In [19]:
# Bucketed Features Creation
# Goal: Make your model see behavioural patterns instead of only raw inputs


# Creating tenure_bucket
df_2['tenure_bucket'] = pd.cut(
    df['user_tenure_months'],
    bins=[-1, 1, 3, 6, 12, float('inf')],
    labels=['new', 'recent', 'established', 'loyal', 'veteran']
)

# Creation amount_bucket feature
df_2['amount_bucket'] = pd.cut(
    df['transaction_amount'],
    bins=[-1, 1000, 3000, 6000, float('inf')],
    labels=['small', 'medium', 'large', 'very_large']
)

# Creation time_since_login_bucket feature
df_2['time_since_login_bucket'] = pd.cut(
    df['time_since_login_min'],
    bins=[-1, 1.5, 2.5, float('inf')],
    labels=['low', 'medium', 'high']
)

In [20]:
df_2['transaction_type'] = df_2['transaction_type'].map({'PAYMENT':1, 'CASH-OUT':2, 'CASH-IN':3, 'TRANSFER':4})
df_2['is_first_transaction'] = df_2['is_first_transaction'].map({False:0, True:1})
df_2['tenure_bucket'] = df_2['tenure_bucket'].map({'new':1,'recent':2,'established':3,'loyal':4})
df_2['amount_bucket'] = df_2['amount_bucket'].map({'large':1, 'medium':2, 'very_large':3})
df_2['time_since_login_bucket'] = df_2['time_since_login_bucket'].map({'low':1, 'medium':2, 'high':3})

In [21]:
df_2.head()

Unnamed: 0,timestamp,time_since_login_min,transaction_amount,transaction_type,is_first_transaction,user_tenure_months,is_fraud,date,transaction_hour,transaction_day,amount_log,high_amount_flag,amount_x_time,tenure_bucket,amount_bucket,time_since_login_bucket
0,2018-01-01 00:00:00.000,1.56175,3981.1,,,0.31898,1.0,2018-01-01,0,0,8.289565,1,6217.481774,1.0,1.0,2
1,2018-01-01 00:08:43.152,1.658074,1267.9,,,7.391323,0.0,2018-01-01,0,0,7.145906,0,2102.272115,4.0,2.0,2
2,2018-01-01 00:17:26.304,2.454287,1984.7,,,0.781225,1.0,2018-01-01,0,0,7.593727,0,4871.023843,1.0,2.0,2
3,2018-01-01 00:26:09.456,2.392085,2265.2,,,0.680473,1.0,2018-01-01,0,0,7.72586,1,5418.551255,1.0,2.0,2
4,2018-01-01 00:34:52.608,2.189806,2126.8,,,8.542895,1.0,2018-01-01,0,0,7.662844,0,4657.279051,4.0,2.0,2


In [22]:
# df_2['amount_bucket'] = df_2['amount_bucket'].cat.codes
df_2['time_since_login_bucket'] = df_2['time_since_login_bucket'].cat.codes


In [23]:
df_2.time_since_login_bucket.unique()

array([1, 0, 2], dtype=int8)

### Time Series Based Splitting

In [24]:
# Sorting the rows by timestamp
df_2 = df_2.sort_values(by='timestamp')

n = len(df_2)
train_end = int(0.6 * n)
val_end = int(0.8 * n)

train = df_2.loc[:train_end].copy()
val = df_2.loc[train_end:val_end].copy()
test = df_2.loc[val_end:].copy()

cols_to_drop = ['is_fraud', 'timestamp', 'date']

X_train = train.drop(cols_to_drop, axis=1)
y_train = train['is_fraud']

X_val   = val.drop(cols_to_drop, axis=1)
y_val   = val['is_fraud']

X_test  = test.drop(cols_to_drop, axis=1)
y_test  = test['is_fraud']


In [25]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=3)
model_xgb.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.65      0.85      0.74      4999
         1.0       0.78      0.55      0.64      5043

    accuracy                           0.70     10042
   macro avg       0.72      0.70      0.69     10042
weighted avg       0.72      0.70      0.69     10042



### SHAP Explainability - SHapley Additive exPlanations

In [26]:
# import shap
#
# # initialize javascript for plots (Jupyter only)
# shap.initjs()
#
# # create SHAP explainer
# explainer = shap.TreeExplainer(model_xgb)


In [31]:
import numpy
import xgboost as xgb

# ensure X_test is numeric numpy array and columns order stored
feature_names = X_train.columns.tolist()
dtest = xgb.DMatrix(X_test.values, feature_names=feature_names)

# get contributions; last column is the bias (base value)
contribs = model_xgb.get_booster().predict(dtest, pred_contribs=True)

# example: contributions for row i
i = 10
row_contrib = contribs[i, :-1]   # drop last bias
bias = contribs[i, -1]
pred_proba = model_xgb.predict_proba(X_test.iloc[[i]])[:,1][0]


# pair feature names with contributions
pairs = list(zip(feature_names, row_contrib))
pairs_sorted = sorted(pairs, key=lambda x: abs(x[1]), reverse=True)
print("Bias:", bias, "Pred:", pred_proba)
for f, val in pairs_sorted[:10]:
    print(f, val)

# global importance by mean absolute contribution
glob_imp = np.mean(np.abs(contribs[:, :-1]), axis=0)
imp_df = sorted(zip(feature_names, glob_imp), key=lambda x: x[1], reverse=True)
print("Top global contributors:")
for f, v in imp_df[:10]:
    print(f, v)


Bias: 0.0010492797 Pred: 0.37605098
user_tenure_months -0.43330094
transaction_amount -0.20630871
time_since_login_min 0.06723786
transaction_day 0.06230552
transaction_hour 0.044779778
amount_x_time -0.04164356
tenure_bucket -0.00069235486
amount_bucket 0.0002293566
transaction_type 0.0
is_first_transaction 0.0
Top global contributors:
user_tenure_months 1.0125749
transaction_amount 0.18486726
amount_x_time 0.09427822
time_since_login_min 0.08456531
transaction_hour 0.03983888
transaction_day 0.028235832
tenure_bucket 0.008967541
amount_bucket 0.0020700986
transaction_type 0.0
is_first_transaction 0.0


In [32]:
for f in ['user_tenure_months','transaction_amount','time_since_login_min']:
    display(df.groupby('is_fraud')[f].describe())


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,25116.0,8.426377,2.121946,0.004626,7.572658,9.205215,9.985034,10.495597
1.0,25091.0,5.370237,3.831768,0.00338,1.286827,5.661621,9.32294,10.495751


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,25116.0,2901.765886,1928.465996,1002.1,1474.5,2181.1,3720.4,10800.9
1.0,25091.0,3027.785573,2142.779034,1001.1,1425.1,2201.6,3975.5,10956.4


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,25116.0,1.925881,0.471839,0.1,1.722185,1.980105,2.219699,3.495902
1.0,25091.0,1.927941,0.472252,0.1,1.722768,1.977707,2.223935,3.315152
