# Graph Neural Network - Preprocess

**Import from Kaggle**

In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"phuongkhanh21","key":"0fd6eb70e1509aad441adeb76f0dfff2"}'}

In [2]:
from google.colab import drive
drive.mount('/content/drive')
drive_path = '/content/drive/MyDrive/Colab Notebooks/'

Mounted at /content/drive


In [3]:
import os
os.makedirs('/root/.kaggle', exist_ok=True)
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

!kaggle datasets download -d berkanoztas/synthetic-transaction-monitoring-dataset-aml

import zipfile

with zipfile.ZipFile("synthetic-transaction-monitoring-dataset-aml.zip", 'r') as zip_ref:
    zip_ref.extractall("synthetic_transaction_data")

Dataset URL: https://www.kaggle.com/datasets/berkanoztas/synthetic-transaction-monitoring-dataset-aml
License(s): CC-BY-NC-SA-4.0
Downloading synthetic-transaction-monitoring-dataset-aml.zip to /content
 70% 135M/193M [00:00<00:00, 1.40GB/s]
100% 193M/193M [00:00<00:00, 1.09GB/s]


**Import libraries**

In [4]:
!pip install rustworkx

Collecting rustworkx
  Downloading rustworkx-0.17.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading rustworkx-0.17.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m1.3/2.2 MB[0m [31m38.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rustworkx
Successfully installed rustworkx-0.17.1


In [5]:
import pandas as pd
import polars as pl
from datetime import timedelta
import numpy as np
import rustworkx as rx
from typing import List, Dict

**Featuring functions**

In [6]:
def custom_split_polars(df: pl.DataFrame, validation_dt: int = 70, test_dt: int = 35):
    """
    Split a Polars DataFrame into train/validation/test by calendar-day cutoffs
    measured backwards from the dataset max Date.

    Parameters
    - df: polars.DataFrame with a datetime column named "Date" (string or datetime OK)
    - validation_dt: int days for validation window (e.g., 70)
    - test_dt: int days for test window (e.g., 35)

    Returns
    - train_df, validation_df, test_df  (all eager polars.DataFrame)
    """

    # ensure Date is a datetime type: try to get max, otherwise parse strings to Datetime
    try:
        max_date = df.select(pl.col("Date").max()).to_series()[0]
    except Exception:
        df = df.with_column(pl.col("Date").str.strptime(pl.Datetime, fmt=None).alias("Date"))
        max_date = df.select(pl.col("Date").max()).to_series()[0]

    test_cutoff = max_date - timedelta(days=test_dt)
    validation_cutoff = max_date - timedelta(days=validation_dt)

    test_set = df.filter(pl.col("Date") >= pl.lit(test_cutoff))
    validation_set = df.filter(
        (pl.col("Date") >= pl.lit(validation_cutoff)) & (pl.col("Date") < pl.lit(test_cutoff))
    )
    train_set = df.filter(pl.col("Date") < pl.lit(validation_cutoff))

    return train_set, validation_set, test_set

In [7]:
def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
    """
    Build and return engineered features from a Polars DataFrame.
    """

    # Dropping column Laundering_type
    df = df.drop("Laundering_type")

    # Temporal features
    def temporal_features(df):

        return df.with_columns([
            df["Date"].dt.year().alias("year"),
            df["Date"].dt.month().alias("month"),
            df["Date"].dt.day().alias("day_of_month"),
            df["Date"].dt.weekday().alias("day_of_week"),
            df["Date"].dt.ordinal_day().alias("day_of_year"),
            df["Time"].dt.hour().alias("hour"),
            df["Time"].dt.minute().alias("minute"),
            df["Time"].dt.second().alias("second"),
        ])

    # Risk features
    high_risk_countries = ['Mexico', 'Turkey', 'Morocco', 'UAE']

    def risk_features(df):

        return df.with_columns([
            (df["Payment_currency"] != df["Received_currency"]).cast(pl.Int8).alias("currency_mismatch"),
            (df["Payment_type"] == "Cross-border").cast(pl.Int8).alias("cross_border"),
            df["Sender_bank_location"].is_in(high_risk_countries).cast(pl.Int8).alias("high_risk_sender"),
            df["Receiver_bank_location"].is_in(high_risk_countries).cast(pl.Int8).alias("high_risk_receiver")])

    def build_window_features_lazy(
        df,
        specs,
        date_col="Date",
        sender_col="Sender_account",
        receiver_col="Receiver_account",
        amount_col="Amount",
        index_name="__row_idx",
        label_choice="left",
    ):
        lf = df.lazy() if isinstance(df, pl.DataFrame) else df
        lf = lf.with_columns(pl.arange(0, pl.len()).over(pl.lit(True)).alias(index_name))
        out_lf = lf

        for spec in specs:
            kind = spec.get("kind", "rolling")

            if kind == "rolling":
                # existing rolling logic (no change)
                name = spec["name"]
                direction = spec["type"]  # "fanin" or "fanout"
                period_days = int(spec["period_days"])
                every = spec.get("every", "1d")

                if direction == "fanin":
                    group_by = receiver_col
                    agg_on = sender_col
                else:
                    group_by = sender_col
                    agg_on = receiver_col

                win_label = label_choice
                strategy = "forward" if win_label == "left" else "backward"

                right = (
                    lf
                    .sort([group_by, date_col])
                    .group_by_dynamic(
                        index_column=date_col,
                        every=every,
                        period=f"{period_days}d",
                        group_by=group_by,
                        closed="both",
                        label=win_label
                    )
                    .agg(pl.col(agg_on).n_unique().alias(name))
                    .sort([group_by, date_col])
                )

                left = out_lf.sort([group_by, date_col])

                out_lf = left.join_asof(
                    right,
                    left_on=date_col,
                    right_on=date_col,
                    by=group_by,
                    strategy=strategy,
                )

            elif kind == "monthly":
                # existing monthly logic (no change)
                name = spec["name"]
                side = spec.get("side", "receive")
                group_col = receiver_col if side == "receive" else sender_col

                monthly_agg = (
                    lf
                    .with_columns(pl.col(date_col).dt.truncate("1mo").alias("__month"))
                    .group_by([group_col, "__month"])
                    .agg(pl.col(amount_col).sum().alias(name))
                )

                out_lf = (
                    out_lf
                    .with_columns(pl.col(date_col).dt.truncate("1mo").alias("__month"))
                    .join(monthly_agg, on=[group_col, "__month"], how="left")
                    .drop("__month")
                )

            elif kind == "daily_pair_count":
                # NEW: back_and_forth_transfers (exact-match on day + pair)
                name = spec["name"]  # e.g., "back_and_forth_transfers"
                # day key = calendar day (truncate to 1 day)
                day_key = "__day"
                # compute counts per sender/receiver/day using lf (lazy)
                pair_daily_agg = (
                    lf
                    .with_columns(pl.col(date_col).dt.truncate("1d").alias(day_key))
                    .group_by([sender_col, receiver_col, day_key])
                    .agg(pl.len().alias(name))  # .len() counts rows in group
                )

                # attach day key to working frame and join exact on pair + day
                out_lf = (
                    out_lf
                    .with_columns(pl.col(date_col).dt.truncate("1d").alias(day_key))
                    .join(pair_daily_agg, on=[sender_col, receiver_col, day_key], how="left")
                    .fill_null(0)       # optional: replace nulls with 0
                    .with_columns(pl.col(name).cast(pl.Int64))  # ensure integer type
                    .drop(day_key)
                )

            else:
                raise ValueError("spec kind must be 'rolling', 'monthly', or 'daily_pair_count'")

        return out_lf


    def compute_derived_features_lazy(
        lf: pl.LazyFrame,
        *,
        fanin_col: str = "fanin_30d",
        fanout_col: str = "fanout_30d",
        daily_receive_col: str = "daily_receive",
        monthly_receive_col: str = "monthly_receive",
        monthly_send_col: str = "monthly_send",
        amount_col: str = "Amount",
        sender_col: str = "Sender_account",
        receiver_col: str = "Receiver_account",
        index_name: str = "__row_idx",
    ) -> pl.LazyFrame:
        """
        Take a LazyFrame and return a LazyFrame with derived features:
          - fan_in_out_ratio (safe division, 0 when denom missing or zero)
          - fanin_intensity_ratio (fanin_30d / daily_receive, 0 when denom missing or zero)
          - amount_dispersion_std (per-sender std of Amount, filled 0 when null)
          - sent_to_received_ratio_monthly (monthly_receive / monthly_send, 0 when denom missing or zero)

        If `daily_receive` is not present in lf.schema(), it is computed lazily as the
        per-receiver unique-senders per calendar day (dt.truncate("1d")) and joined back.
        The function is fully lazy; call .collect(...) when ready.
        """
        # ensure lazy input
        lf = lf if isinstance(lf, pl.LazyFrame) else lf.lazy()

        # Attempt to read schema; if unavailable assume missing and compute
        try:
            schema = lf.schema()
            has_daily = daily_receive_col in schema
        except Exception:
            has_daily = False

        # If daily_receive missing, compute it lazily (exact day bucket of unique senders per receiver)
        if not has_daily:
            day_key = "__day_for_daily_receive"
            daily_receive_agg = (
                lf
                .with_columns(pl.col("Date").dt.truncate("1d").alias(day_key))
                .group_by([receiver_col, day_key])
                .agg(pl.col(sender_col).n_unique().alias(daily_receive_col))
            )
            lf = (
                lf
                .with_columns(pl.col("Date").dt.truncate("1d").alias(day_key))
                .join(daily_receive_agg, on=[receiver_col, day_key], how="left")
                .drop(day_key)
            )

        # safe division helper expression
        def safe_div_expr(num: str, den: str, out_name: str):
            return (
                pl.when(pl.col(den).is_null() | (pl.col(den) == 0))
                  .then(0.0)
                  .otherwise(pl.col(num).cast(pl.Float64) / pl.col(den).cast(pl.Float64))
                  .alias(out_name)
            )

        fan_in_out_expr = safe_div_expr(fanin_col, fanout_col, "fan_in_out_ratio")
        fanin_intensity_expr = safe_div_expr(fanin_col, daily_receive_col, "fanin_intensity_ratio")
        sent_to_received_monthly_expr = safe_div_expr(monthly_receive_col, monthly_send_col, "sent_to_received_ratio_monthly")

        # per-sender std aggregation (lazy) and join back
        sender_std_agg = (
            lf
            .select([sender_col, amount_col])
            .group_by(sender_col)
            .agg(pl.col(amount_col).std().alias("__amount_std"))
        )

        out = (
            lf
            .join(sender_std_agg, on=sender_col, how="left")
            .with_columns(
                pl.col("__amount_std").cast(pl.Float64).fill_null(0.0).alias("amount_dispersion_std")
            )
            .drop("__amount_std")
            .with_columns([
                fan_in_out_expr,
                fanin_intensity_expr,
                sent_to_received_monthly_expr
            ])
        )

        return out
    # Temporal features
    df = temporal_features(df)

    # Risk features
    df = risk_features(df)

    # Rolling window computing
    specs = [
    {"name":"fanin_30d", "kind":"rolling", "type":"fanin", "period_days":30, "every":"1d"},
    {"name":"fanout_30d", "kind":"rolling", "type":"fanout", "period_days":30, "every":"1d"},
    {"name":"daily_recieve", "kind":"rolling", "type":"fanin", "period_days":1, "every":"1d"},
    {"name":"monthly_receive", "kind":"monthly", "side":"receive"},
    {"name":"monthly_send",    "kind":"monthly", "side":"send"},
    {"name":"back_and_forth_transfers", "kind":"daily_pair_count"},
    ]
    lazy_with_features = build_window_features_lazy(df, specs, amount_col="Amount", label_choice="left")
    plan = (
    lazy_with_features
    .sort(["Sender_account", "Date"])
    .with_columns([pl.col("Sender_account").set_sorted(), pl.col("Date").set_sorted()])
    )
    df_streamed = plan.collect(engine="streaming")
    df = df_streamed.sort("__row_idx").drop("__row_idx")

    # More computation
    lazy_with_derived = compute_derived_features_lazy(lazy_with_features)

    # Before streaming collect: pick a primary grouping ordering that matches your rolling computations.
    # If most rolling features used Receiver_account then Date, use that; otherwise use the grouping you chose.
    plan_derived = (
        lazy_with_derived
        .sort(["Sender_account", "Date"])
        .with_columns([pl.col("Sender_account").set_sorted(), pl.col("Date").set_sorted()])
    )

    df_streamed = plan_derived.collect(engine="streaming")
    df = df_streamed.sort("__row_idx").drop("__row_idx")

    return df

In [8]:
"""
We recast the integer‑based columns following the logic rules outlined in the paper.
"Explainable Feature Engineering for Multi-class Money Laundering Classification"
This recasting is performed to optimize storage efficiency and reduce overall memory consumption."
Excluding "Sender_account" and "Receiver_account" variables.
"""

def recast(df):
    exclude = ['Sender_account', 'Receiver_account']

    for col in df.columns:
        if col not in exclude:
            dtype = df[col].dtype
            if dtype in (pl.Int64, pl.Int32):
              maxval = df[col].max()
              if maxval:
                  if maxval < 127:
                      df = df.with_columns(df[col].cast(pl.Int8).alias(col))
                  elif maxval < 32767:
                      df = df.with_columns(df[col].cast(pl.Int16).alias(col))
                  elif maxval < 2147483647:
                      df = df.with_columns(df[col].cast(pl.Int32).alias(col))

    return df

In [9]:
"""
Compute circular_transaction_count using a
calendar‑month sliding window, ensuring rustworkx is installed

"""

def circular_transaction_feature(df:pl.DataFrame):
    # Iterate over monthly groups
    results = []
    for (year, month), group in df.group_by(["year", "month"]):
        res = circular_count_monthly(group, year, month)
        if res.height > 0:
            results.append(res)

    # Combine all results
    out_rx = pl.concat(results, how="vertical") if results else pl.DataFrame()

    # Join back to original df
    df_result = (
        df.join(out_rx, on=["Sender_account", "year", "month"], how="left")
        .with_columns(
            pl.col("circular_transaction_count").fill_null(0)
        )
    )
    return df_result

def circular_count_monthly(pdf, year, month):
    edges = list(zip(pdf["Sender_account"], pdf["Receiver_account"]))
    if not edges:
        return empty_month_frame()

    G = rx.PyDiGraph()
    node_idx = {}
    for u, v in edges:
        if u not in node_idx:
            node_idx[u] = G.add_node(u)
        if v not in node_idx:
            node_idx[v] = G.add_node(v)
        G.add_edge(node_idx[u], node_idx[v], None)

    cycles = rx.simple_cycles(G)

    counter = {}
    for cyc in cycles:
        cyc_nodes = [G[node] for node in cyc]
        for node in cyc_nodes:
            counter[node] = counter.get(node, 0) + 1

    return pl.DataFrame({
        "Sender_account": list(counter.keys()),
        "circular_transaction_count": list(counter.values()),
        "year": [year] * len(counter),
        "month": [month] * len(counter)
    })

**Prepare data**

In [10]:
os.listdir("synthetic_transaction_data")
df = pl.read_csv("synthetic_transaction_data/SAML-D.csv")
df.head(5)

Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
str,str,i64,i64,f64,str,str,str,str,str,i64,str
"""10:35:19""","""2022-10-07""",8724731955,2769355426,1459.15,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Cash Deposit""",0,"""Normal_Cash_Deposits"""
"""10:35:20""","""2022-10-07""",1491989064,8401255335,6019.64,"""UK pounds""","""Dirham""","""UK""","""UAE""","""Cross-border""",0,"""Normal_Fan_Out"""
"""10:35:20""","""2022-10-07""",287305149,4404767002,14328.44,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Cheque""",0,"""Normal_Small_Fan_Out"""
"""10:35:21""","""2022-10-07""",5376652437,9600420220,11895.0,"""UK pounds""","""UK pounds""","""UK""","""UK""","""ACH""",0,"""Normal_Fan_In"""
"""10:35:21""","""2022-10-07""",9614186178,3803336972,115.25,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Cash Deposit""",0,"""Normal_Cash_Deposits"""


In [11]:
df = df.with_columns(
    pl.col("Amount").log().alias("Amount")
)
    # Convert datetime
df = df.with_columns(
    pl.col("Date").str.strptime(pl.Date, "%Y-%m-%d").alias("Date"),
    pl.col("Time").str.strptime(pl.Time, "%H:%M:%S").alias("Time"))

In [12]:
df_train, df_val, df_test = custom_split_polars(df)

In [13]:
df_train.head()

Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
time,date,i64,i64,f64,str,str,str,str,str,i64,str
10:35:19,2022-10-07,8724731955,2769355426,7.285609,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Cash Deposit""",0,"""Normal_Cash_Deposits"""
10:35:20,2022-10-07,1491989064,8401255335,8.702783,"""UK pounds""","""Dirham""","""UK""","""UAE""","""Cross-border""",0,"""Normal_Fan_Out"""
10:35:20,2022-10-07,287305149,4404767002,9.570002,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Cheque""",0,"""Normal_Small_Fan_Out"""
10:35:21,2022-10-07,5376652437,9600420220,9.383873,"""UK pounds""","""UK pounds""","""UK""","""UK""","""ACH""",0,"""Normal_Fan_In"""
10:35:21,2022-10-07,9614186178,3803336972,4.747104,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Cash Deposit""",0,"""Normal_Cash_Deposits"""


In [14]:
df_train = feature_engineering(df_train)
df_train = circular_transaction_feature(df_train)
df_train = recast(df_train)

df_val = feature_engineering(df_val)
df_val = circular_transaction_feature(df_val)
df_val = recast(df_val)

df_test = feature_engineering(df_test)
df_test = circular_transaction_feature(df_test)
df_test = recast(df_test)

  df_streamed = plan.collect(engine="streaming")
  schema = lf.schema()
  df_streamed = plan_derived.collect(engine="streaming")


**GCN preparation**

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

**Encoding accounts**

In [16]:
# Mapping from train only
accounts = pl.concat([df_train["Sender_account"], df_train["Receiver_account"]]).unique()
mapping_df = pl.DataFrame({"account": accounts, "id": list(range(len(accounts)))})

In [17]:
def map_accounts(df):
    # join for sender
    df = df.join(mapping_df, left_on="Sender_account", right_on="account", how="left") \
                   .with_columns(pl.col("id").alias("src")) \
                   .drop("id")
    # join for receiver
    df = df.join(mapping_df, left_on="Receiver_account", right_on="account", how="left") \
                   .with_columns(pl.col("id").alias("dst")) \
                   .drop("id")
    return df

In [18]:
df_train = map_accounts(df_train)
df_val = map_accounts(df_val)
df_test = map_accounts(df_test)

In [19]:
# Then fill_null with -1
df_train = df_train.with_columns([
    pl.col("src").fill_null(-1).cast(pl.Int64),
    pl.col("dst").fill_null(-1).cast(pl.Int64),
])
df_val = df_val.with_columns([
    pl.col("src").fill_null(-1).cast(pl.Int64),
    pl.col("dst").fill_null(-1).cast(pl.Int64),
])
df_test = df_test.with_columns([
    pl.col("src").fill_null(-1).cast(pl.Int64),
    pl.col("dst").fill_null(-1).cast(pl.Int64),
])

**Build edge features**

In [20]:
edge_numeric = [
    "Amount",
    "fanin_30d",
    "fanin_intensity_ratio",
    "amount_dispersion_std",
    "sent_to_received_ratio_monthly",
    "back_and_forth_transfers",
    "circular_transaction_count"
]
edge_flags = ["currency_mismatch", "high_risk_sender", "high_risk_receiver"]
temporal_cols = ["year", "day_of_month", "day_of_week", "hour", "minute", "second"]
label_col = "Is_laundering"

In [21]:
def build_edge_attr_from_polars(
    df: pl.DataFrame,
    edge_numeric: list,
    edge_flags: list,
    temporal_cols: list,
    edge_num_mean: np.ndarray = None,
    edge_num_std: np.ndarray = None,
    log_amount_col: str = "Amount",
    return_scalers: bool = False,
):
    """
    Build edge feature tensor (edge_attr) for a Polars DataFrame of transactions.
    Designed to be identical across train/val/test splits.

    Parameters
    ----------
    df : pl.DataFrame
        Must contain columns: src, dst, edge_numeric, edge_flags, temporal_cols.
    edge_numeric : list[str]
        Names of numeric transaction-level columns (e.g., ["Amount", "fanin_30d", ...])
    edge_flags : list[str]
        Names of binary/categorical indicator columns (0/1 flags).
    temporal_cols : list[str]
        Temporal features to encode cyclically: ["year", "day_of_month", "day_of_week", "hour", "minute", "second"]
    edge_num_mean, edge_num_std : np.ndarray or None
        Normalization statistics from the training set. If None, they will be computed.
    log_amount_col : str
        Name of numeric column to log-transform (e.g., "Amount"). If not in edge_numeric, no effect.
    return_scalers : bool
        If True, returns (edge_attr, mean, std). Otherwise just edge_attr.

    Returns
    -------
    edge_attr : torch.FloatTensor [E, F_edge]
    edge_num_mean, edge_num_std : np.ndarray
        Returned only if return_scalers=True
    """

    # Cast to correct types
    df = df.with_columns(
        [pl.col(c).cast(pl.Float64) for c in edge_numeric] +
        [pl.col(c).cast(pl.Int32) for c in edge_flags] +
        [pl.col(c).cast(pl.Int32) for c in temporal_cols]
    )

    # Extract numeric + flag columns
    num_flags_np = df.select(edge_numeric + edge_flags).to_numpy().astype(np.float32)
    num_count = len(edge_numeric)
    flag_count = len(edge_flags)
    num_np = num_flags_np[:, :num_count]
    flags_np = num_flags_np[:, num_count:]

    # Optional log transform for amount
    if log_amount_col in edge_numeric:
        amt_idx = edge_numeric.index(log_amount_col)
        num_np[:, amt_idx] = np.log1p(np.clip(num_np[:, amt_idx], a_min=0, a_max=None))

    # Compute normalization scalers if needed
    if edge_num_mean is None or edge_num_std is None:
        edge_num_mean = num_np.mean(axis=0)
        edge_num_std = num_np.std(axis=0) + 1e-9

    # Apply normalization
    num_np_norm = (num_np - edge_num_mean[None, :]) / edge_num_std[None, :]

    # Temporal cyclic encodings
    tmp = df.select(temporal_cols).to_numpy().astype(np.float32)
    years = tmp[:, 0]
    dom   = tmp[:, 1]
    dow   = tmp[:, 2]
    hour  = tmp[:, 3]
    minute= tmp[:, 4]
    second= tmp[:, 5]

    def cyc_np(x, period):
        ang = 2 * np.pi * x / period
        return np.sin(ang).astype(np.float32), np.cos(ang).astype(np.float32)

    dom_sin, dom_cos   = cyc_np(dom, 31)
    dow_sin, dow_cos   = cyc_np(dow, 7)
    hour_sin, hour_cos = cyc_np(hour, 24)
    min_sin, min_cos   = cyc_np(minute, 60)
    sec_sin, sec_cos   = cyc_np(second, 60)

    temporal_np = np.column_stack([
        years, dom_sin, dom_cos,
        dow_sin, dow_cos,
        hour_sin, hour_cos,
        min_sin, min_cos,
        sec_sin, sec_cos
    ]).astype(np.float32)

    # Combine numeric, flags, temporal
    edge_attr_np = np.hstack([num_np_norm, flags_np, temporal_np]).astype(np.float32)

    edge_attr = torch.as_tensor(edge_attr_np, dtype=torch.float32)

    # Return results
    if return_scalers:
        return edge_attr, edge_num_mean, edge_num_std
    else:
        return edge_attr

In [22]:
edge_attr_train, edge_num_mean, edge_num_std = build_edge_attr_from_polars(
                                              df_train,
                                              edge_numeric=edge_numeric,
                                              edge_flags=edge_flags,
                                              temporal_cols=temporal_cols,
                                              return_scalers=True
                                          )

In [23]:
# Compute using the *same normalization and encoding*
edge_attr_val = build_edge_attr_from_polars(
                                          df_val,
                                          edge_numeric=edge_numeric,
                                          edge_flags=edge_flags,
                                          temporal_cols=temporal_cols,
                                          edge_num_mean=edge_num_mean,
                                          edge_num_std=edge_num_std
                                      )
edge_attr_test = build_edge_attr_from_polars(
                                          df_test,
                                          edge_numeric=edge_numeric,
                                          edge_flags=edge_flags,
                                          temporal_cols=temporal_cols,
                                          edge_num_mean=edge_num_mean,
                                          edge_num_std=edge_num_std
                                      )

In [24]:
def build_edge_index_and_labels(df: pl.DataFrame, label_col: str = "Is_laundering"):
    """
    Convert Polars DataFrame into edge_index and edge_labels tensors.
    Each row = one edge (transaction).
    """
    # Extract edge_index (2 x E)
    edge_index_np = df.select(["src", "dst"]).to_numpy().T     # shape [2, E]
    edge_index = torch.as_tensor(edge_index_np, dtype=torch.long)

    # Extract edge_labels (E,)
    edge_labels_np = df.select(label_col).to_numpy().ravel().astype("int64")
    edge_labels = torch.as_tensor(edge_labels_np, dtype=torch.long)

    return edge_index, edge_labels

In [25]:
edge_index_train, edge_labels_train = build_edge_index_and_labels(df_train)
edge_index_val, edge_labels_val = build_edge_index_and_labels(df_val)
edge_index_test, edge_labels_test = build_edge_index_and_labels(df_test)

In [26]:
edge_data = {
    'train': {'attr': edge_attr_train, 'index': edge_index_train, 'labels': edge_labels_train},
    'val': {'attr': edge_attr_val, 'index': edge_index_val, 'labels': edge_labels_val},
    'test': {'attr': edge_attr_test, 'index': edge_index_test, 'labels': edge_labels_test},
}

for s in ['train', 'val', 'test']:
    torch.save(edge_data[s]['attr'], os.path.join(drive_path, f"edge_attr_{s}.pt"))
    torch.save(edge_data[s]['index'], os.path.join(drive_path, f"edge_index_{s}.pt"))
    torch.save(edge_data[s]['labels'], os.path.join(drive_path, f"edge_labels_{s}.pt"))

**Build node features**

In [27]:
def build_node_features_from_train(df_train: pl.DataFrame, n_nodes: int,
                                   edge_numeric: list, edge_flags: list,
                                   node_numeric_agg = ["mean","sum","std"], fillna=0.0):
    """
    Produces node_features_df (one row per node id 0..n_nodes-1) and X (torch tensor).
    Aggregation uses df_train only (temporal-safe).
    """
    # numeric aggregations: produce mean/sum/std for each numeric col
    num_aggs = []
    for c in edge_numeric:
        num_aggs.append(pl.col(c).mean().alias(f"{c}_out_mean"))
        num_aggs.append(pl.col(c).sum().alias(f"{c}_out_sum"))
        num_aggs.append(pl.col(c).std().alias(f"{c}_out_std"))
    # count of outgoing
    num_aggs.append(pl.len().alias("out_tx_count"))

    # flag aggregations
    flag_aggs = []
    for c in edge_flags:
        flag_aggs.append(pl.col(c).mean().alias(f"{c}_out_frac"))
        flag_aggs.append(pl.col(c).max().alias(f"{c}_out_any"))

    agg = df_train.group_by("src").agg(num_aggs + flag_aggs)

    # ensure every node included
    nodes_df = pl.DataFrame({"node": list(range(n_nodes))})
    agg = agg.rename({"src": "node"})
    node_features_df = nodes_df.join(agg, on="node", how="left").fill_null(fillna)

    # pick feature columns (exclude 'node'), convert to numpy and torch
    node_feature_cols = [c for c in node_features_df.columns if c != "node"]
    X_np = node_features_df.select(node_feature_cols).to_numpy().astype(np.float32)
    # compute node scalers for these columns (train-based)
    X_mean = X_np.mean(axis=0)
    X_std  = X_np.std(axis=0) + 1e-9
    # standardize (optional) - I recommend standardizing node features
    X_np = (X_np - X_mean[None, :]) / X_std[None, :]
    X = torch.as_tensor(X_np, dtype=torch.float32)
    return node_features_df, X, node_feature_cols, X_mean, X_std

In [28]:
n_nodes = len(mapping_df)
node_features_df, X, node_feature_cols, X_mean, X_std = build_node_features_from_train(
                                                      df_train, n_nodes, edge_numeric, edge_flags)

In [29]:
import pickle

In [30]:
# Save Polars DataFrame as Parquet (efficient + preserves schema)
node_features_df.write_parquet(os.path.join(drive_path, "node_features_df.parquet"))

# Save PyTorch tensor
torch.save(X, os.path.join(drive_path, "X.pt"))

# Save list and NumPy arrays using pickle
with open(os.path.join(drive_path, "node_feature_cols.pkl"), "wb") as f:
    pickle.dump(node_feature_cols, f)

with open(os.path.join(drive_path, "X_mean.npy"), "wb") as f:
    np.save(f, X_mean)

with open(os.path.join(drive_path, "X_std.npy"), "wb") as f:
    np.save(f, X_std)
