# Data Loading & Enrichment Pipeline

This notebook:
- Loads the base Kaggle credit card fraud dataset
- Preserves original transaction distribution
- Adds domain-consistent synthetic features for interpretability
- Produces an enriched dataset for downstream anomaly detection


In [1]:
import pandas as pd
import numpy as np

# Notebook-wide pandas display settings: never truncate the column list and
# allow wide rows before wrapping.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 120),
):
    pd.set_option(option_name, option_value)


In [2]:
# Relative path to the raw credit card fraud CSV.
DATA_PATH = "../data/raw/creditcard.csv"

# Read the whole file into `df`; every later cell extends this frame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Report (rows, columns) and preview the first five records.
raw_shape = df.shape
print("Shape:", raw_shape)
df.head(5)


Shape: (284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Share of each label in `Class` (fractions rather than raw counts).
class_labels = df["Class"]
class_labels.value_counts(normalize=True)


Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

## Customer Universe Design

We simulate a fixed population of 5,000 customers.
Each transaction is probabilistically assigned to a customer to reflect
realistic usage patterns while preserving the original fraud distribution.


In [4]:
# Size of the simulated cardholder population.
N_CUSTOMERS = 5000

# Fix the global NumPy RNG state before any sampling takes place.
np.random.seed(42)

# Zero-padded identifiers: CUST_00000 ... CUST_04999.
customer_ids = ["CUST_" + str(idx).zfill(5) for idx in range(N_CUSTOMERS)]


In [5]:
# Draw one positive weight per customer from an exponential distribution and
# normalise the weights into probabilities, so some customers transact far
# more often than others.
raw_activity = np.random.exponential(scale=1.0, size=N_CUSTOMERS)
activity_weights = raw_activity / raw_activity.sum()

# Sample (with replacement) the owning customer of every transaction
# according to those probabilities.
n_transactions = len(df)
df["customer_id"] = np.random.choice(customer_ids, size=n_transactions, p=activity_weights)


In [6]:
# Sanity check: number of distinct customers that received at least one
# transaction, and the five most active customers.
assigned_customers = df["customer_id"]
assigned_customers.nunique(), assigned_customers.value_counts().head()


(4911,
 customer_id
 CUST_00531    450
 CUST_01464    428
 CUST_03402    413
 CUST_03649    398
 CUST_01054    386
 Name: count, dtype: int64)

## Customer Home Location Assignment

Each customer is assigned a fixed home location.
Locations are sampled from a bounded geographic region to simulate
realistic cardholder residence clusters.


In [7]:
# Bounding box for sampled coordinates (example: Europe-like region).
LAT_MIN, LAT_MAX = 35.0, 60.0
LON_MIN, LON_MAX = -10.0, 30.0

# Only customers that actually appear in the transactions get a home.
seen_customers = df["customer_id"].unique()
n_homes = len(seen_customers)

# Reseed, then draw all latitudes before all longitudes (the order of the two
# draws determines the sampled values).
np.random.seed(42)
home_lats = np.random.uniform(LAT_MIN, LAT_MAX, size=n_homes)
home_lons = np.random.uniform(LON_MIN, LON_MAX, size=n_homes)

customer_home = pd.DataFrame(
    {
        "customer_id": seen_customers,
        "home_lat": home_lats,
        "home_lon": home_lons,
    }
)

customer_home.head()


Unnamed: 0,customer_id,home_lat,home_lon
0,CUST_01935,44.363503,12.15973
1,CUST_02343,58.767858,11.382369
2,CUST_04269,53.299849,-2.18168
3,CUST_01671,49.966462,-3.350826
4,CUST_04343,38.900466,26.659566


In [8]:
# Attach each customer's home coordinates to their transactions.
#
# Any home_lat / home_lon columns left by a previous run of this cell are
# dropped first, so re-running it does not create suffixed duplicates
# (home_lat_x / home_lat_y). On a fresh run the drop is a no-op.
# validate="many_to_one" makes pandas raise if customer_home ever contains a
# customer_id more than once, which would silently duplicate transactions.
df = df.drop(columns=["home_lat", "home_lon"], errors="ignore").merge(
    customer_home,
    on="customer_id",
    how="left",
    validate="many_to_one",
)

df[["customer_id", "home_lat", "home_lon"]].head()


Unnamed: 0,customer_id,home_lat,home_lon
0,CUST_01935,44.363503,12.15973
1,CUST_02343,58.767858,11.382369
2,CUST_04269,53.299849,-2.18168
3,CUST_01671,49.966462,-3.350826
4,CUST_04343,38.900466,26.659566


In [9]:
# Sanity check: summary statistics of the merged home coordinates.
home_coord_cols = ["home_lat", "home_lon"]
df.loc[:, home_coord_cols].describe()


Unnamed: 0,home_lat,home_lon
count,284807.0,284807.0
mean,47.571796,9.8457
std,7.285607,11.549467
min,35.000291,-9.997887
25%,41.193275,-0.040924
50%,47.735553,9.774832
75%,53.95658,19.734116
max,59.992942,29.978427


## Merchant Universe Design

We simulate a fixed set of merchants with:
- Stable merchant IDs
- Business categories (MCC-like)
- Fixed geographic locations

This allows us to model:
- Category-dependent risk
- Location-based anomalies
- Repeated merchant interaction patterns


In [10]:
# ----------------------------------------------------
# Merchant business categories (MCC-like groupings).
# Intended for later use in:
# - risk differentiation
# - explanation in the prediction API
# ----------------------------------------------------
merchant_categories = [
    "grocery", "fuel", "restaurant", "retail",
    "electronics", "jewelry", "luxury_goods",
]

In [11]:
# ----------------------------------------------------
# Build the merchant universe
# ----------------------------------------------------
# A fixed population of merchants, each with:
# - a unique ID
# - a business category
# - a fixed geographic location
# ----------------------------------------------------

N_MERCHANTS = 1000
np.random.seed(42)

merchant_ids = ["MERCH_{:05d}".format(idx) for idx in range(N_MERCHANTS)]

# The three draws below must stay in this order (category, latitude,
# longitude): they consume the seeded RNG stream sequentially.

# Categories are assigned uniformly for now
# (risk bias will be introduced later).
sampled_categories = np.random.choice(merchant_categories, size=N_MERCHANTS)

# Merchant locations fall in the same region as the customer homes.
sampled_lats = np.random.uniform(LAT_MIN, LAT_MAX, size=N_MERCHANTS)
sampled_lons = np.random.uniform(LON_MIN, LON_MAX, size=N_MERCHANTS)

merchant_df = pd.DataFrame(
    {
        "merchant_id": merchant_ids,
        "merchant_category": sampled_categories,
        "merchant_lat": sampled_lats,
        "merchant_lon": sampled_lons,
    }
)

merchant_df.head()


Unnamed: 0,merchant_id,merchant_category,merchant_lat,merchant_lon
0,MERCH_00000,luxury_goods,50.552168,24.511883
1,MERCH_00001,retail,56.559093,-1.330408
2,MERCH_00002,electronics,58.738016,-6.171418
3,MERCH_00003,luxury_goods,38.676837,-9.054457
4,MERCH_00004,restaurant,58.164691,15.67886


In [12]:
# ----------------------------------------------------
# Assign merchants to transactions
# ----------------------------------------------------
# Each transaction is linked to one merchant drawn
# uniformly at random. We assume:
# - merchants serve many customers
# - transaction volume varies naturally
#
# NOTE: the RNG is not reseeded here; the draw continues
# from the current global NumPy RNG state, so the result
# depends on the preceding cells having run in order.
# ----------------------------------------------------

df["merchant_id"] = np.random.choice(
    merchant_df["merchant_id"],
    size=len(df)
)

# Merge merchant attributes into transaction data.
# Attribute columns left by a previous run of this cell are dropped first so
# that re-running it does not create suffixed duplicates (e.g.
# merchant_lat_x / merchant_lat_y); on a fresh run the drop is a no-op.
# validate="many_to_one" makes pandas raise if merchant_df ever holds a
# merchant_id more than once, which would silently duplicate transactions.
merchant_attr_cols = ["merchant_category", "merchant_lat", "merchant_lon"]
df = df.drop(columns=merchant_attr_cols, errors="ignore").merge(
    merchant_df,
    on="merchant_id",
    how="left",
    validate="many_to_one",
)

df[["merchant_id", "merchant_category", "merchant_lat", "merchant_lon"]].head()


Unnamed: 0,merchant_id,merchant_category,merchant_lat,merchant_lon
0,MERCH_00564,fuel,47.378663,19.25986
1,MERCH_00806,grocery,49.823771,21.935214
2,MERCH_00320,jewelry,44.929588,-1.838575
3,MERCH_00912,grocery,44.161538,29.239495
4,MERCH_00232,jewelry,53.411106,20.986141


In [13]:
# ----------------------------------------------------
# Sanity checks
# ----------------------------------------------------
# Ensure:
# - All transactions have merchant info
# - Categories are well distributed
# ----------------------------------------------------

# merchant_id is assigned directly and cannot be missing; the columns that can
# actually end up null are the attributes brought in by the left merge, so the
# check covers all of them and fails loudly instead of relying on inspection.
merchant_info_cols = ["merchant_id", "merchant_category", "merchant_lat", "merchant_lon"]
assert df[merchant_info_cols].notna().all().all(), (
    "some transactions are missing merchant attributes"
)

df["merchant_id"].isna().sum(), df["merchant_category"].value_counts()


(np.int64(0),
 merchant_category
 retail          44288
 grocery         44185
 electronics     42395
 luxury_goods    39331
 fuel            39281
 jewelry         38355
 restaurant      36972
 Name: count, dtype: int64)

## Distance From Home Calculation

We compute the geographic distance between:
- the customer's home location
- the merchant's transaction location

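This feature is intended to capture abnormal spatial behavior, which is a strong
indicator of potential fraud in card transactions. Note that in this notebook both
home and merchant locations are sampled at random, independently of `Class`, so the
synthetic distance does not by itself carry any fraud signal.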


In [14]:
# ----------------------------------------------------
#Compute distance from customer home
# ----------------------------------------------------
# We use the Haversine formula to calculate the
# great-circle distance between two latitude-longitude
# points on Earth.
#
# Output distance is in kilometers.
# ----------------------------------------------------

from math import radians, sin, cos, sqrt, atan2

def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Compute the Haversine (great-circle) distance between two geo points.

    Works on scalars as well as array-likes (lists, NumPy arrays, pandas
    Series); array inputs are broadcast against each other, so entire
    coordinate columns can be processed in a single call.

    Parameters:
    - lat1, lon1: Customer home coordinates, in degrees
    - lat2, lon2: Merchant location coordinates, in degrees

    Returns:
    - Distance in kilometers: a NumPy float for scalar inputs, a NumPy
      array for array-like inputs
    """
    R = 6371  # Earth radius in km

    # Convert every coordinate from degrees to radians.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) invalid.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c


In [15]:
# ----------------------------------------------------
# Compute distance_from_home for every transaction
# ----------------------------------------------------
# This is intentionally done AFTER customer and merchant
# locations are fixed.
#
# The four coordinate columns are iterated in lockstep
# rather than using DataFrame.apply(axis=1): apply builds
# a Series object for every row, which is needlessly slow
# on a frame of this size. The values are identical.
# ----------------------------------------------------

df["distance_from_home"] = [
    haversine_distance(home_lat, home_lon, merch_lat, merch_lon)
    for home_lat, home_lon, merch_lat, merch_lon in zip(
        df["home_lat"], df["home_lon"],
        df["merchant_lat"], df["merchant_lon"],
    )
]

df["distance_from_home"].describe()


count    284807.000000
mean       1502.111205
std         719.027247
min           1.510401
25%         944.335524
50%        1469.798045
75%        2024.211677
max        3808.909187
Name: distance_from_home, dtype: float64

In [17]:
# ----------------------------------------------------
# Sanity check distance values
# ----------------------------------------------------
# We ensure:
# - No negative distances
# - Reasonable geographic scale
# ----------------------------------------------------

dist_min = df["distance_from_home"].min()
dist_max = df["distance_from_home"].max()

# Enforce the two expectations instead of only displaying the extremes.
assert dist_min >= 0, "negative distance found"
# A great-circle distance cannot exceed half the circumference of the sphere
# (pi * R, with R = 6371 km as used by haversine_distance).
assert dist_max <= np.pi * 6371 + 1e-6, "distance exceeds half of Earth's circumference"

dist_min, dist_max


(np.float64(1.5104008316725828), np.float64(3808.9091869030935))

## Time-Based Feature Engineering

We derive temporal features from the original `Time` column
to capture daily and weekly transaction patterns.

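These features are deterministic given the chosen start date. Note that the start
date is arbitrary and the data spans only about two days, so `day_of_week` takes
just two values and `month` is constant; only `hour` reflects a real pattern in
the original `Time` column.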


In [18]:
# ----------------------------------------------------
# Convert Time (seconds) to datetime
# ----------------------------------------------------
# `Time` holds seconds elapsed since the first transaction
# in the dataset. Adding it to a reference timestamp gives
# a pseudo-datetime from which hour, day_of_week and month
# can be extracted.
# ----------------------------------------------------

# Arbitrary reference point; calendar fields derived from
# transaction_time are relative to this choice.
START_DATE = pd.Timestamp("2013-09-01")

elapsed_since_start = pd.to_timedelta(df["Time"], unit="s")
df["transaction_time"] = START_DATE + elapsed_since_start

df.loc[:, ["Time", "transaction_time"]].head()


Unnamed: 0,Time,transaction_time
0,0.0,2013-09-01 00:00:00
1,0.0,2013-09-01 00:00:00
2,1.0,2013-09-01 00:00:01
3,1.0,2013-09-01 00:00:01
4,2.0,2013-09-01 00:00:02


In [19]:
# ----------------------------------------------------
# Extract temporal features
# ----------------------------------------------------
# Calendar fields of transaction_time, meant to help
# capture abnormal transaction timing behavior.
# ----------------------------------------------------

timestamp_parts = df["transaction_time"].dt

df["hour"] = timestamp_parts.hour
df["day_of_week"] = timestamp_parts.dayofweek  # Monday=0 ... Sunday=6
df["month"] = timestamp_parts.month

temporal_cols = ["hour", "day_of_week", "month"]
df[temporal_cols].describe()


Unnamed: 0,hour,day_of_week,month
count,284807.0,284807.0,284807.0
mean,14.04647,3.050192,9.0
std,5.835854,2.999585,0.0
min,0.0,0.0,9.0
25%,10.0,0.0,9.0
50%,15.0,6.0,9.0
75%,19.0,6.0,9.0
max,23.0,6.0,9.0


In [20]:
# ----------------------------------------------------
# Sanity checks on temporal features
# ----------------------------------------------------
# Displays: earliest hour, latest hour, number of distinct
# weekdays, and the distinct months present.

hour_col = df["hour"]
hour_col.min(), hour_col.max(), df["day_of_week"].nunique(), df["month"].unique()


(np.int32(0), np.int32(23), 2, array([9], dtype=int32))

## Fraud Type Annotation (Explanatory)

We annotate fraud transactions with high-level fraud types
based on observable behavioral signatures.

This annotation is:
- NOT used for model training
- ONLY used for interpretability and API explanations


In [21]:
# ----------------------------------------------------
# Fraud type vocabulary
# ----------------------------------------------------
# High-level labels for common fraud mechanisms. They are
# attached ONLY to known fraud transactions (Class == 1)
# and serve explanatory purposes.
# ----------------------------------------------------

fraud_types = ["card_cloning", "account_takeover", "merchant_collusion"]


In [22]:
# ----------------------------------------------------
#Assign fraud types (rule-based)
# ----------------------------------------------------
# IMPORTANT:
# - This uses observable features
# - It does NOT feed into model training
# - It is strictly for explanation and reporting
# ----------------------------------------------------

def assign_fraud_type(
    row,
    distance_threshold_km=1000,
    early_hour_cutoff=6,
    late_hour_cutoff=22,
):
    """
    Label a single transaction with a high-level, rule-based fraud type.

    The rules are checked in order and the first match wins:
    1. Class == 0                              -> "none"
    2. distance_from_home > threshold          -> "card_cloning"
    3. hour < early cutoff or > late cutoff    -> "account_takeover"
    4. anything else                           -> "merchant_collusion"

    Parameters:
    - row: mapping-like record (e.g. a DataFrame row) providing the keys
      "Class", "distance_from_home" and "hour"
    - distance_threshold_km: distances strictly above this value count as
      far from home
    - early_hour_cutoff: hours strictly below this value count as odd hours
    - late_hour_cutoff: hours strictly above this value count as odd hours

    Returns:
    - One of "none", "card_cloning", "account_takeover",
      "merchant_collusion"
    """
    if row["Class"] == 0:
        return "none"

    # Card cloning: large distance from home. Only the distance is checked;
    # no merchant-based criterion is applied.
    if row["distance_from_home"] > distance_threshold_km:
        return "card_cloning"

    # Account takeover: odd hour activity
    if row["hour"] < early_hour_cutoff or row["hour"] > late_hour_cutoff:
        return "account_takeover"

    # Merchant collusion: fallback category
    return "merchant_collusion"

# Label every transaction row by row, then show how many rows received
# each label.
fraud_type_labels = df.apply(assign_fraud_type, axis=1)
df["fraud_type"] = fraud_type_labels

df["fraud_type"].value_counts()


fraud_type
none                  284315
card_cloning             338
merchant_collusion       104
account_takeover          50
Name: count, dtype: int64

In [23]:
# ----------------------------------------------------
# Validate fraud type assignment
# ----------------------------------------------------
# Cross-tabulate the assigned label (rows) against the
# original Class label (columns).

pd.crosstab(index=df["fraud_type"], columns=df["Class"])


Class,0,1
fraud_type,Unnamed: 1_level_1,Unnamed: 2_level_1
account_takeover,0,50
card_cloning,0,338
merchant_collusion,0,104
none,284315,0


## Dataset Freeze

At this stage, the dataset contains:
- Original anonymized transaction features
- Domain-consistent synthetic attributes
- Explanatory fraud annotations

From this point onward:
- No further enrichment is performed
- All modeling uses this frozen dataset


In [24]:
# ----------------------------------------------------
# Drop intermediate helper columns
# ----------------------------------------------------
# transaction_time was only a helper for temporal
# extraction and is not required for downstream modeling
# or the API, so it is left out of the final frame.
# ----------------------------------------------------

df_final = df.drop("transaction_time", axis=1)


In [26]:
# ----------------------------------------------------
# Save the enriched dataset
# ----------------------------------------------------
# The final dataset goes into a dedicated processed-data
# folder, kept apart from the raw data.
# ----------------------------------------------------

import os

OUTPUT_PATH = "../data/processed/transactions_enriched.csv"

# Create the target folder if it is not there yet.
output_dir = os.path.dirname(OUTPUT_PATH)
os.makedirs(output_dir, exist_ok=True)

# Write without the index so the CSV holds only real columns.
df_final.to_csv(OUTPUT_PATH, index=False)

print(f"Enriched dataset saved to: {OUTPUT_PATH}")


Enriched dataset saved to: ../data/processed/transactions_enriched.csv


In [27]:
# ----------------------------------------------------
# Reload and verify saved dataset
# ----------------------------------------------------

df_check = pd.read_csv(OUTPUT_PATH)

# Actually verify the round trip: the file read back must have the same
# dimensions and the same column names, in the same order, as the frame
# that was written.
assert df_check.shape == df_final.shape, "saved file has a different shape than df_final"
assert list(df_check.columns) == list(df_final.columns), "saved file has different columns than df_final"

df_check.shape, df_check.columns


((284807, 43),
 Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16',
        'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class',
        'customer_id', 'home_lat', 'home_lon', 'merchant_id', 'merchant_category', 'merchant_lat', 'merchant_lon',
        'distance_from_home', 'hour', 'day_of_week', 'month', 'fraud_type'],
       dtype='object'))