# Feature Engineering for Fraud Detection

This notebook is used to:
- Understand the dataset
- Visually validate feature engineering logic
- Prototype behavioral & velocity-based features
- Ensure no data leakage before modeling

Final feature logic will live in `src/feature_engineering/`


In [1]:
import pandas as pd
import numpy as np

# Notebook display settings: never truncate columns, and widen the text repr.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 120),
):
    pd.set_option(option_name, option_value)


In [2]:
# Load the raw transactions and parse the timestamp column into datetimes.
RAW_TRANSACTIONS_CSV = "../data/raw/transactions_raw.csv"

df = pd.read_csv(RAW_TRANSACTIONS_CSV).assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

print(df.shape)
df.head()


(100000, 15)


Unnamed: 0,transaction_id,customer_id,card_number,timestamp,amount,merchant_id,merchant_category,merchant_lat,merchant_long,distance_from_home,hour,day_of_week,month,is_fraud,fraud_type
0,TXN_00005946,CUST_00000,73dd02d325d8ba6c,2025-03-08 09:48:00.000000,1044.99,MERCHANT_01868,jewelry,19.073305,72.87268,5.62,9,5,3,0,none
1,TXN_00006401,CUST_00000,73dd02d325d8ba6c,2025-03-08 11:15:27.407530,1205.51,MERCHANT_00792,jewelry,19.055841,72.885882,4.45,11,5,3,0,none
2,TXN_00009276,CUST_00000,73dd02d325d8ba6c,2025-03-08 22:15:19.130894,127.99,MERCHANT_01036,luxury_goods,19.073716,72.885006,4.34,22,5,3,0,none
3,TXN_00012697,CUST_00000,73dd02d325d8ba6c,2025-03-09 07:26:13.375615,423.8,MERCHANT_01430,luxury_goods,19.08425,72.864179,6.72,7,6,3,0,none
4,TXN_00019183,CUST_00000,73dd02d325d8ba6c,2025-03-09 11:11:41.342503,644.5,MERCHANT_01181,jewelry,19.089023,72.890843,4.32,11,6,3,0,none


In [3]:
# Inspect the column labels available in the transactions frame.
df.columns


Index(['transaction_id', 'customer_id', 'card_number', 'timestamp', 'amount', 'merchant_id', 'merchant_category',
       'merchant_lat', 'merchant_long', 'distance_from_home', 'hour', 'day_of_week', 'month', 'is_fraud',
       'fraud_type'],
      dtype='object')

In [6]:
# Order each customer's transactions chronologically and renumber rows from 0.
# The per-customer shift/rolling features computed later rely on this row order.
sort_keys = ["customer_id", "timestamp"]
df = df.sort_values(by=sort_keys, ignore_index=True)
df.head(10)


Unnamed: 0,transaction_id,customer_id,card_number,timestamp,amount,merchant_id,merchant_category,merchant_lat,merchant_long,distance_from_home,hour,day_of_week,month,is_fraud,fraud_type
0,TXN_00005946,CUST_00000,73dd02d325d8ba6c,2025-03-08 09:48:00.000000,1044.99,MERCHANT_01868,jewelry,19.073305,72.87268,5.62,9,5,3,0,none
1,TXN_00006401,CUST_00000,73dd02d325d8ba6c,2025-03-08 11:15:27.407530,1205.51,MERCHANT_00792,jewelry,19.055841,72.885882,4.45,11,5,3,0,none
2,TXN_00009276,CUST_00000,73dd02d325d8ba6c,2025-03-08 22:15:19.130894,127.99,MERCHANT_01036,luxury_goods,19.073716,72.885006,4.34,22,5,3,0,none
3,TXN_00012697,CUST_00000,73dd02d325d8ba6c,2025-03-09 07:26:13.375615,423.8,MERCHANT_01430,luxury_goods,19.08425,72.864179,6.72,7,6,3,0,none
4,TXN_00019183,CUST_00000,73dd02d325d8ba6c,2025-03-09 11:11:41.342503,644.5,MERCHANT_01181,jewelry,19.089023,72.890843,4.32,11,6,3,0,none
5,TXN_00025902,CUST_00000,73dd02d325d8ba6c,2025-03-09 22:47:51.166047,572.5,MERCHANT_00313,jewelry,19.083504,72.875069,5.59,22,6,3,0,none
6,TXN_00036187,CUST_00000,73dd02d325d8ba6c,2025-03-09 23:16:19.896549,883.36,MERCHANT_01497,grocery,19.094257,72.889192,4.79,23,6,3,0,none
7,TXN_00044321,CUST_00000,73dd02d325d8ba6c,2025-03-10 00:50:32.557008,735.52,MERCHANT_01855,jewelry,19.064466,72.884365,4.4,0,0,3,0,none
8,TXN_00045931,CUST_00000,73dd02d325d8ba6c,2025-03-10 14:12:03.517317,1378.04,MERCHANT_00495,luxury_goods,19.051227,72.872674,5.93,14,0,3,0,none
9,TXN_00055491,CUST_00000,73dd02d325d8ba6c,2025-03-10 21:02:11.504790,1028.46,MERCHANT_00977,restaurant,19.08476,72.878279,5.32,21,0,3,0,none


In [7]:
# Timestamp of the same customer's previous transaction (NaT for their first one).
timestamps_by_customer = df.groupby("customer_id")["timestamp"]
df["prev_txn_time"] = timestamps_by_customer.shift(1)

# Gap to that previous transaction in seconds (NaN where there is none).
gap_since_previous = df["timestamp"] - df["prev_txn_time"]
df["time_since_last_txn_sec"] = gap_since_previous.dt.total_seconds()

preview_columns = [
    "customer_id",
    "timestamp",
    "prev_txn_time",
    "time_since_last_txn_sec",
]
df[preview_columns].head(10)

# Fraud often happens in bursts:
# very small gaps between transactions indicate suspicious velocity.


Unnamed: 0,customer_id,timestamp,prev_txn_time,time_since_last_txn_sec
0,CUST_00000,2025-03-08 09:48:00.000000,NaT,
1,CUST_00000,2025-03-08 11:15:27.407530,2025-03-08 09:48:00.000000,5247.40753
2,CUST_00000,2025-03-08 22:15:19.130894,2025-03-08 11:15:27.407530,39591.723364
3,CUST_00000,2025-03-09 07:26:13.375615,2025-03-08 22:15:19.130894,33054.244721
4,CUST_00000,2025-03-09 11:11:41.342503,2025-03-09 07:26:13.375615,13527.966888
5,CUST_00000,2025-03-09 22:47:51.166047,2025-03-09 11:11:41.342503,41769.823544
6,CUST_00000,2025-03-09 23:16:19.896549,2025-03-09 22:47:51.166047,1708.730502
7,CUST_00000,2025-03-10 00:50:32.557008,2025-03-09 23:16:19.896549,5652.660459
8,CUST_00000,2025-03-10 14:12:03.517317,2025-03-10 00:50:32.557008,48090.960309
9,CUST_00000,2025-03-10 21:02:11.504790,2025-03-10 14:12:03.517317,24607.987473


In [8]:
# Number of the customer's transactions in the trailing 1-hour window ending at
# the current transaction (the current one is counted, so the minimum is 1).
hourly_window = df.groupby("customer_id").rolling("1h", on="timestamp")
txn_count_1h = hourly_window["transaction_id"].count().reset_index(drop=True)

# Positional assignment: this lines up only because df is already ordered by
# customer_id and then timestamp, matching the grouped result's order.
df["txn_count_1h"] = txn_count_1h.to_numpy()

preview_columns = ["customer_id", "timestamp", "txn_count_1h"]
df[preview_columns].head(15)

# Normal users: 1–2 txns/hour
# Fraud: rapid spikes

Unnamed: 0,customer_id,timestamp,txn_count_1h
0,CUST_00000,2025-03-08 09:48:00.000000,1.0
1,CUST_00000,2025-03-08 11:15:27.407530,1.0
2,CUST_00000,2025-03-08 22:15:19.130894,1.0
3,CUST_00000,2025-03-09 07:26:13.375615,1.0
4,CUST_00000,2025-03-09 11:11:41.342503,1.0
5,CUST_00000,2025-03-09 22:47:51.166047,1.0
6,CUST_00000,2025-03-09 23:16:19.896549,2.0
7,CUST_00000,2025-03-10 00:50:32.557008,1.0
8,CUST_00000,2025-03-10 14:12:03.517317,1.0
9,CUST_00000,2025-03-10 21:02:11.504790,1.0


In [9]:
# Same rolling count as the 1-hour feature, over a trailing 24-hour window.
customer_groups = df.groupby("customer_id")
daily_txn_ids = customer_groups.rolling("24h", on="timestamp")["transaction_id"]
txn_count_24h = daily_txn_ids.count().reset_index(drop=True)

# Assigned by position; requires df to be ordered by customer_id, then timestamp.
df["txn_count_24h"] = txn_count_24h.to_numpy()

preview_columns = ["customer_id", "timestamp", "txn_count_24h"]
df[preview_columns].head(15)


# Captures daily spending intensity
# Helps separate normal heavy users from bursts

Unnamed: 0,customer_id,timestamp,txn_count_24h
0,CUST_00000,2025-03-08 09:48:00.000000,1.0
1,CUST_00000,2025-03-08 11:15:27.407530,2.0
2,CUST_00000,2025-03-08 22:15:19.130894,3.0
3,CUST_00000,2025-03-09 07:26:13.375615,4.0
4,CUST_00000,2025-03-09 11:11:41.342503,4.0
5,CUST_00000,2025-03-09 22:47:51.166047,3.0
6,CUST_00000,2025-03-09 23:16:19.896549,4.0
7,CUST_00000,2025-03-10 00:50:32.557008,5.0
8,CUST_00000,2025-03-10 14:12:03.517317,4.0
9,CUST_00000,2025-03-10 21:02:11.504790,5.0


In [14]:
# Mean transaction amount per customer over the trailing 24-hour window.
# NOTE(review): the window includes the current transaction's own amount, so a
# customer's first transaction in a window always averages to itself — confirm
# this is intended before using it as a baseline.
rolling_amounts_24h = df.groupby("customer_id").rolling("24h", on="timestamp")["amount"]
avg_amount_24h = rolling_amounts_24h.mean().reset_index(drop=True)

# Assigned by position; requires df to be ordered by customer_id, then timestamp.
df["avg_amount_24h"] = avg_amount_24h.to_numpy()

# Date-only column, used purely to keep the preview readable.
df["timestamp_date"] = df["timestamp"].dt.date

preview_columns = ["customer_id", "amount", "timestamp_date", "avg_amount_24h"]
df[preview_columns].head(15)


# Fraud is relative, not absolute.
# The same amount (e.g. ₹5,000) can be:
#   - normal for some users
#   - extreme for others

Unnamed: 0,customer_id,amount,timestamp_date,avg_amount_24h
0,CUST_00000,1044.99,2025-03-08,1044.99
1,CUST_00000,1205.51,2025-03-08,1125.25
2,CUST_00000,127.99,2025-03-08,792.83
3,CUST_00000,423.8,2025-03-09,700.5725
4,CUST_00000,644.5,2025-03-09,600.45
5,CUST_00000,572.5,2025-03-09,546.933333
6,CUST_00000,883.36,2025-03-09,631.04
7,CUST_00000,735.52,2025-03-10,651.936
8,CUST_00000,1378.04,2025-03-10,892.355
9,CUST_00000,1028.46,2025-03-10,919.576


In [None]:
# How far the current amount sits from the customer's trailing 24h average.
df["amount_deviation"] = df["amount"].sub(df["avg_amount_24h"])

preview_columns = ["amount", "avg_amount_24h", "amount_deviation"]
df[preview_columns].head(15)

# Positive deviation = purchase larger than the customer's recent average
# Negative deviation = purchase smaller than the recent average (less important)

Unnamed: 0,amount,avg_amount_24h,amount_deviation
0,1044.99,1044.99,0.0
1,1205.51,1125.25,80.26
2,127.99,792.83,-664.84
3,423.8,700.5725,-276.7725
4,644.5,600.45,44.05
5,572.5,546.933333,25.566667
6,883.36,631.04,252.32
7,735.52,651.936,83.584
8,1378.04,892.355,485.685
9,1028.46,919.576,108.884


In [16]:
# Side-by-side look at the engineered features next to the fraud label.
engineered_preview_columns = [
    "amount",
    "distance_from_home",
    "txn_count_1h",
    "txn_count_24h",
    "time_since_last_txn_sec",
    "amount_deviation",
    "is_fraud",
]
df[engineered_preview_columns].head(20)


Unnamed: 0,amount,distance_from_home,txn_count_1h,txn_count_24h,time_since_last_txn_sec,amount_deviation,is_fraud
0,1044.99,5.62,1.0,1.0,,0.0,0
1,1205.51,4.45,1.0,2.0,5247.408,80.26,0
2,127.99,4.34,1.0,3.0,39591.72,-664.84,0
3,423.8,6.72,1.0,4.0,33054.24,-276.7725,0
4,644.5,4.32,1.0,4.0,13527.97,44.05,0
5,572.5,5.59,1.0,3.0,41769.82,25.566667,0
6,883.36,4.79,2.0,4.0,1708.731,252.32,0
7,735.52,4.4,1.0,5.0,5652.66,83.584,0
8,1378.04,5.93,1.0,4.0,48090.96,485.685,0
9,1028.46,5.32,1.0,5.0,24607.99,108.884,0


In [18]:
# Drop identifiers, the raw timestamp, and the fraud-type label.
# NOTE(review): helper columns such as prev_txn_time and timestamp_date are NOT
# dropped here and remain in features_df — confirm whether that is intended.
# errors="ignore" also means a misspelled column name is silently skipped.
non_feature_columns = ["transaction_id", "card_number", "timestamp", "fraud_type"]
features_df = df.drop(columns=non_feature_columns, errors="ignore")

features_df.head()


Unnamed: 0,customer_id,amount,merchant_id,merchant_category,merchant_lat,merchant_long,distance_from_home,hour,day_of_week,month,is_fraud,prev_txn_time,time_since_last_txn_sec,txn_count_1h,txn_count_24h,avg_amount_24h,timestamp_date,amount_deviation
0,CUST_00000,1044.99,MERCHANT_01868,jewelry,19.073305,72.87268,5.62,9,5,3,0,NaT,,1.0,1.0,1044.99,2025-03-08,0.0
1,CUST_00000,1205.51,MERCHANT_00792,jewelry,19.055841,72.885882,4.45,11,5,3,0,2025-03-08 09:48:00.000000,5247.40753,1.0,2.0,1125.25,2025-03-08,80.26
2,CUST_00000,127.99,MERCHANT_01036,luxury_goods,19.073716,72.885006,4.34,22,5,3,0,2025-03-08 11:15:27.407530,39591.723364,1.0,3.0,792.83,2025-03-08,-664.84
3,CUST_00000,423.8,MERCHANT_01430,luxury_goods,19.08425,72.864179,6.72,7,6,3,0,2025-03-08 22:15:19.130894,33054.244721,1.0,4.0,700.5725,2025-03-09,-276.7725
4,CUST_00000,644.5,MERCHANT_01181,jewelry,19.089023,72.890843,4.32,11,6,3,0,2025-03-09 07:26:13.375615,13527.966888,1.0,4.0,600.45,2025-03-09,44.05


In [19]:
# Summary statistics, one row per column.
features_df.describe().transpose()


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
amount,100000.0,803.72667,10.0,391.57,640.08,1019.8,11167.71,628.964248
merchant_lat,100000.0,17.2718,10.993044,12.964056,17.383423,19.088059,28.638727,5.393832
merchant_long,100000.0,76.62344,72.849556,75.847137,76.953742,77.602789,80.297143,2.142109
distance_from_home,100000.0,69.835448,2.15,6.38,7.83,9.4,1968.31,258.86262
hour,100000.0,11.53719,0.0,6.0,12.0,18.0,23.0,6.91004
day_of_week,100000.0,3.02965,0.0,1.0,3.0,5.0,6.0,2.015959
month,100000.0,2.05771,1.0,1.0,2.0,3.0,3.0,0.818747
is_fraud,100000.0,0.01038,0.0,0.0,0.0,0.0,1.0,0.101353
prev_txn_time,95000.0,2025-02-16 22:45:41.477806592,2025-01-01 00:22:00,2025-01-26 17:38:06.158126592,2025-02-17 13:04:13.270507008,2025-03-10 02:43:51.572147456,2025-03-31 23:31:56.642329,
time_since_last_txn_sec,95000.0,50564.118898,0.113564,8353.0986,21263.832761,46315.85315,7437693.815186,260683.382479


## Feature engineering — round 2

In [20]:
# distance between two points (purchases)
from math import radians, sin, cos, sqrt, atan2
import numpy as np

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float or array-like
        Distance in km. Scalar inputs give a scalar; array inputs are processed
        element-wise, so whole columns can be passed at once instead of calling
        the function row by row. NaN coordinates yield NaN.
    """
    R = 6371  # Earth radius in km

    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding error can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt(1 - a) invalid; clamp it. NaN stays NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c


In [21]:
# Previous transaction's merchant location and time for the same customer
# (missing for each customer's first transaction).
previous_txn = df.groupby("customer_id")[
    ["merchant_lat", "merchant_long", "timestamp"]
].shift(1)

df["prev_lat"] = previous_txn["merchant_lat"]
df["prev_long"] = previous_txn["merchant_long"]
df["prev_time"] = previous_txn["timestamp"]


In [22]:
def _distance_from_previous_txn(row):
    """Km between this merchant and the customer's previous one; 0 if no previous txn."""
    if pd.isna(row["prev_lat"]):
        return 0
    return haversine_km(
        row["prev_lat"], row["prev_long"],
        row["merchant_lat"], row["merchant_long"]
    )


# Row-wise because haversine_km is called once per transaction.
df["travel_distance_km"] = df.apply(_distance_from_previous_txn, axis=1)


In [23]:
# Hours elapsed since the customer's previous transaction (NaN for the first one).
SECONDS_PER_HOUR = 3600
elapsed_since_previous = df["timestamp"] - df["prev_time"]
df["time_diff_hours"] = elapsed_since_previous.dt.total_seconds().div(SECONDS_PER_HOUR)


In [24]:
# Implied speed between consecutive merchants. This is NaN when the time gap is
# missing and inf when the gap is exactly zero but the distance is not.
df["travel_speed_kmh"] = df["travel_distance_km"].div(df["time_diff_hours"])


In [25]:
# Clean up the implied travel speed.
#
# Order matters: cap BEFORE filling. A zero time gap with a non-zero distance
# yields +inf, which is the most suspicious case ("impossible travel"). Turning
# inf into NaN and then filling with 0 would make it look like the least
# suspicious value, so +inf is clipped to the cap instead.
MAX_TRAVEL_SPEED_KMH = 20000  # cap for extreme GPS noise / near-zero time gaps

# -inf has no physical meaning here (distance is never negative); treat it as missing.
travel_speed = df["travel_speed_kmh"].replace(-np.inf, np.nan)

# clip() leaves NaN untouched and maps +inf to the cap.
travel_speed = travel_speed.clip(upper=MAX_TRAVEL_SPEED_KMH)

# Remaining NaNs are first transactions (no previous point) or 0 km / 0 h: no travel.
df["travel_speed_kmh"] = travel_speed.fillna(0)


In [26]:
# Summary statistics of the travel speed feature (km/h).
df["travel_speed_kmh"].describe()

count    100000.000000
mean         81.041161
std         760.629263
min           0.000000
25%           0.144031
50%           0.428247
75%           1.831808
max       20000.000000
Name: travel_speed_kmh, dtype: float64

In [27]:
# Cyclical hour encoding: map the hour onto a circle so that 23:00 and 00:00
# end up next to each other instead of 23 units apart.
hour_angle = 2 * np.pi * df["hour"] / 24

df["hour_sin"] = np.sin(hour_angle)
df["hour_cos"] = np.cos(hour_angle)

encoded_hour_columns = ["hour", "hour_sin", "hour_cos"]
df[encoded_hour_columns].head(10)

Unnamed: 0,hour,hour_sin,hour_cos
0,9,0.707107,-0.707107
1,11,0.258819,-0.965926
2,22,-0.5,0.866025
3,7,0.965926,-0.258819
4,11,0.258819,-0.965926
5,22,-0.5,0.866025
6,23,-0.258819,0.965926
7,0,0.0,1.0
8,14,-0.5,-0.866025
9,21,-0.707107,0.707107


In [28]:
# Spot-check the encoding on random rows. The seed is fixed so the displayed
# sample is identical after Restart Kernel -> Run All.
df[["hour", "hour_sin", "hour_cos"]].sample(20, random_state=42)


Unnamed: 0,hour,hour_sin,hour_cos
63778,23,-0.258819,0.965926
59906,15,-0.7071068,-0.707107
54293,5,0.9659258,0.258819
811,14,-0.5,-0.866025
28725,5,0.9659258,0.258819
53866,9,0.7071068,-0.707107
80575,23,-0.258819,0.965926
26719,4,0.8660254,0.5
48238,23,-0.258819,0.965926
32445,10,0.5,-0.866025
