This notebook creates hypothesis-driven features capturing abnormal behavior over time and across entities, inspired by fraud domain intuition and winning Kaggle strategies, while ensuring strict temporal correctness.

In [1]:
import pandas as pd
import numpy as np

trans = pd.read_csv("../data/train_transaction.csv")
ident = pd.read_csv("../data/train_identity.csv")

trans = trans.sort_values("TransactionDT").reset_index(drop=True)


In [2]:
trans["uid"] = (
    trans["card1"].astype(str) + "_" +
    trans["addr1"].astype(str)
)


In [3]:
trans["uid_txn_count"] = (
    trans.groupby("uid").cumcount()
)


In [4]:
uid_mean_amt = (
    trans.groupby("uid")["TransactionAmt"]
    .expanding()
    .mean()
    .shift()
    .reset_index(level=0, drop=True)
)

trans["uid_avg_amt"] = uid_mean_amt


In [5]:
trans["amt_vs_uid_avg"] = (
    trans["TransactionAmt"] / trans["uid_avg_amt"]
)


In [6]:
trans["uid_prev_dt"] = (
    trans.groupby("uid")["TransactionDT"].shift()
)

trans["uid_time_since_last"] = (
    trans["TransactionDT"] - trans["uid_prev_dt"]
)


In [7]:
m_cols = [c for c in trans.columns if c.startswith("M")]

for c in m_cols:
    trans[c] = trans[c].fillna("Unknown")


In [8]:
trans[
    ["uid_avg_amt", "amt_vs_uid_avg", "uid_time_since_last"]
].isna().mean()


uid_avg_amt            0.000002
amt_vs_uid_avg         0.000002
uid_time_since_last    0.067691
dtype: float64

In [9]:
trans.groupby("isFraud")[
    ["uid_txn_count", "amt_vs_uid_avg", "uid_time_since_last"]
].mean()


Unnamed: 0_level_0,uid_txn_count,amt_vs_uid_avg,uid_time_since_last
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,387.453942,1.194328,374290.048653
1,519.001984,1.327442,245562.544911
