## 1. Environment Setup and Imports
Make sure the notebook is running on the correct kernel, then add the `src/` directory to the import path and load the project utilities. This notebook also imports the feature-engineering pipeline.

In [1]:
import sys, os
# Put the sibling `src/` directory on the module search path so the
# project-local imports below can be resolved; keep this before those imports.
sys.path.append(os.path.abspath("../src"))

# Torch & GNN
import torch
# NOTE(review): GATConv is not referenced by any cell visible here — confirm it is still needed.
from torch_geometric.nn import GATConv

# Custom modules
from load_elliptic_data import load_and_preprocess_elliptic_data
from data_utils_randomsplit import random_split
from model_gat import GATNet
from train_utils import set_seed, train_full, save_feature_experiment
from config import SEEDS

# Feature pipeline
from features.feature_config import FEATURE_CONFIGS
from features.feature_runner import apply_engineered_features
from features.feature_utils import prepare_graph_and_timestamps
from normalization import normalize_base_features_only

# Reaching this line means every import above succeeded.
print("✓ All modules imported successfully.")


✓ All modules imported successfully.


## 2. Load and Preprocess the Elliptic Dataset + Graph for Features + Time Stamps
This loads the PyG-formatted graph and base features. We also construct a NetworkX transaction graph for topology-based features. Per-node timestamp sequences are extracted for temporal burst analysis.


In [5]:
from normalization import normalize_base_features_only
from features.feature_pipeline import generate_all_features
import os
import torch

# Engineered columns whose raw (pre-normalization) statistics are printed for
# monitoring. Defined once here because it does not change between iterations.
TEMPORAL_FEATURE_NAMES = {"temporal_lag", "component_size"}

# 1. Load graph (un-normalized; normalization is applied per experiment below)
data = load_and_preprocess_elliptic_data("../elliptic_bitcoin_dataset", normalize=False)
print(f"✓ PyG Data object loaded: {data}")

# 2. Prepare NetworkX and timestamps
G_nx, node_timestamps = prepare_graph_and_timestamps(
    edgelist_path="../elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv",
    features_path="../elliptic_bitcoin_dataset/elliptic_txs_features.csv",
    num_nodes=data.num_nodes
)

# 3. Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ABLATORS = dict(FEATURE_CONFIGS)  # shallow copy of every configured ablation
FEATURED_DATA = {}  # (ablation_name, seed) -> bundle consumed by the training cell

# 4. Loop through ablations and seeds
for ablation_name, feature_flags in ABLATORS.items():
    print(f"\n== Config: {ablation_name} ==")
    for seed in SEEDS:
        print(f"→ Seed {seed}")
        set_seed(seed)
        train_idx, val_idx = random_split(data, random_state=seed)

        # Clone so the shared base graph is never modified, then build features.
        # NOTE(review): the logged temporal statistics are identical for every
        # seed; if generate_all_features is deterministic it could be computed
        # once per config instead of once per seed — confirm before hoisting.
        data_exp = data.clone()
        X, feature_names = generate_all_features(data_exp, G_nx, node_timestamps, **feature_flags)
        data_exp.x = torch.tensor(X, dtype=torch.float)

        # Optional monitoring of temporal features (raw values, before normalization)
        temporal_indices = [i for i, name in enumerate(feature_names) if name in TEMPORAL_FEATURE_NAMES]

        if temporal_indices:
            temporal_cols = data_exp.x[:, temporal_indices]
            print("→ Temporal features (raw)")
            print("  Means:", temporal_cols.mean(0))
            print("  Stds :", temporal_cols.std(0))

        # Normalize base features only
        data_exp.x = normalize_base_features_only(data_exp.x, feature_names)

        # Save to disk BEFORE moving to `device`, so the files do not carry the
        # training device tag and can be loaded on a CPU-only machine without
        # map_location. (Assumes the loader returns CPU tensors — TODO confirm.)
        save_dir = f"../data/GAT_randomsplit/{ablation_name}/seed_{seed}"
        os.makedirs(save_dir, exist_ok=True)
        torch.save(data_exp, os.path.join(save_dir, "data_exp.pt"))
        torch.save(train_idx, os.path.join(save_dir, "train_idx.pt"))
        torch.save(val_idx, os.path.join(save_dir, "val_idx.pt"))
        print(f"✓ Saved: {save_dir}")

        # Move to device for in-memory training
        data_exp = data_exp.to(device)

        # Store in memory
        FEATURED_DATA[(ablation_name, seed)] = {
            "data": data_exp,
            "train_idx": train_idx,
            "val_idx": val_idx,
            "feature_flags": feature_flags,
        }
        print(f"✓ Final shape: {data_exp.x.shape}")


✓ PyG Data object loaded: Data(x=[203769, 166], edge_index=[2, 234355], y=[203769])
Loaded directed graph with 203769 nodes and 234355 edges.
Graph type: <class 'networkx.classes.digraph.DiGraph'>
Features DataFrame shape: (203769, 167)
Number of features (excluding txId and time_step): 165
Timestamps found for 203769 out of 203769 nodes.
✓ Time step range: 1.0 to 49.0

== Config: base ==
→ Seed 42
✓ Saved: ../data/GAT_randomsplit/base/seed_42
✓ Final shape: torch.Size([203769, 166])
→ Seed 123
✓ Saved: ../data/GAT_randomsplit/base/seed_123
✓ Final shape: torch.Size([203769, 166])
→ Seed 777
✓ Saved: ../data/GAT_randomsplit/base/seed_777
✓ Final shape: torch.Size([203769, 166])
→ Seed 2023
✓ Saved: ../data/GAT_randomsplit/base/seed_2023
✓ Final shape: torch.Size([203769, 166])
→ Seed 31415
✓ Saved: ../data/GAT_randomsplit/base/seed_31415
✓ Final shape: torch.Size([203769, 166])

== Config: base+structural ==
→ Seed 42
✓ Saved: ../data/GAT_randomsplit/base+structural/seed_42
✓ Final sha

Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 406551.65it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal/seed_42
✓ Final shape: torch.Size([203769, 168])
→ Seed 123


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 504853.87it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal/seed_123
✓ Final shape: torch.Size([203769, 168])
→ Seed 777


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 532219.95it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal/seed_777
✓ Final shape: torch.Size([203769, 168])
→ Seed 2023


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 517017.76it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal/seed_2023
✓ Final shape: torch.Size([203769, 168])
→ Seed 31415


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 460482.14it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal/seed_31415
✓ Final shape: torch.Size([203769, 168])

== Config: base+basic_temporal+typology ==
→ Seed 42


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 536434.64it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal+typology/seed_42
✓ Final shape: torch.Size([203769, 170])
→ Seed 123


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 532304.14it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal+typology/seed_123
✓ Final shape: torch.Size([203769, 170])
→ Seed 777


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 534371.94it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal+typology/seed_777
✓ Final shape: torch.Size([203769, 170])
→ Seed 2023


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 563913.59it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal+typology/seed_2023
✓ Final shape: torch.Size([203769, 170])
→ Seed 31415


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 546734.91it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/base+basic_temporal+typology/seed_31415
✓ Final shape: torch.Size([203769, 170])

== Config: all ==
→ Seed 42


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 557969.08it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/all/seed_42
✓ Final shape: torch.Size([203769, 174])
→ Seed 123


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 547144.42it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/all/seed_123
✓ Final shape: torch.Size([203769, 174])
→ Seed 777


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 554280.99it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/all/seed_777
✓ Final shape: torch.Size([203769, 174])
→ Seed 2023


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 564875.92it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/all/seed_2023
✓ Final shape: torch.Size([203769, 174])
→ Seed 31415


Computing temporal features: 100%|██████████| 203769/203769 [00:00<00:00, 566555.72it/s]


→ Temporal features (raw)
  Means: tensor([25.1560])
  Stds : tensor([15.1722])
✓ Saved: ../data/GAT_randomsplit/all/seed_31415
✓ Final shape: torch.Size([203769, 174])


In [3]:
import torch
import os

ROOT_DIR = "../data/GAT_randomsplit"
# Width of the base feature matrix (the loaded graph reports x=[203769, 166]);
# columns at or beyond this index are treated as engineered features.
BASE_DIM = 166
# Cell-local seed list so the `SEEDS` imported from `config` is not overwritten.
# Presumably mirrors config.SEEDS — verify if that list changes.
INSPECT_SEEDS = [42, 123, 777, 2023, 31415]

print("\n📊 ENGINEERED FEATURE INSPECTION (from disk)")

# Loop over ablation folders
for ablation_name in sorted(os.listdir(ROOT_DIR)):
    ablation_path = os.path.join(ROOT_DIR, ablation_name)
    if not os.path.isdir(ablation_path):
        continue

    for seed in INSPECT_SEEDS:
        data_path = os.path.join(ablation_path, f"seed_{seed}", "data_exp.pt")
        if not os.path.exists(data_path):
            print(f"⛔ Missing file: {data_path}")
            continue

        # Loaded under a cell-local name so the notebook-level `data` (the raw
        # graph used by the feature-generation cell) is not clobbered.
        # weights_only=False unpickles a full object: only use on files this
        # notebook produced.
        saved_graph = torch.load(data_path, map_location="cpu", weights_only=False)
        x = saved_graph.x

        print(f"\n {ablation_name} | Seed: {seed}")
        if x.shape[1] <= BASE_DIM:
            print(" No engineered features found.")
            continue

        engineered = x[:, BASE_DIM:]
        print(f" Engineered feature count: {engineered.shape[1]}")

        # Per-column statistics as plain Python floats.
        means = engineered.mean(dim=0).tolist()
        stds = engineered.std(dim=0).tolist()

        for offset, (m, s) in enumerate(zip(means, stds)):
            sample_vals = engineered[:5, offset].numpy()  # first five rows as a quick sanity sample
            print(f"Feature {BASE_DIM + offset:>3} | Mean: {m:8.4f} | Std: {s:8.4f} | Sample: {sample_vals}")



📊 ENGINEERED FEATURE INSPECTION (from disk)

 all | Seed: 42
 Engineered feature count: 8
Feature 166 | Mean:   1.1501 | Std:   3.9111 | Sample: [1. 0. 1. 1. 0.]
Feature 167 | Mean:   1.1501 | Std:   1.8947 | Sample: [  1.   3. 112.   0.  50.]
Feature 168 | Mean:   0.0000 | Std:   4.3634 | Sample: [   0.   -3. -111.    1.  -50.]
Feature 169 | Mean:   0.0138 | Std:   0.0973 | Sample: [0.         0.         0.00063211 0.         0.        ]
Feature 170 | Mean:   4.2315 | Std:   6.0100 | Sample: [ 0.6931467 14.914124   4.727387   0.        17.727533 ]
Feature 171 | Mean:   0.4457 | Std:   0.3437 | Sample: [0.49999976 0.         0.00884956 0.999999   0.        ]
Feature 172 | Mean:  25.1560 | Std:  15.1722 | Sample: [ 1. 43. 15. 48.  1.]
Feature 173 | Mean:   8.4067 | Std:   0.3653 | Sample: [7.991254  8.373092  7.8188324 8.97221   7.991254 ]

 all | Seed: 123
 Engineered feature count: 8
Feature 166 | Mean:   1.1501 | Std:   3.9111 | Sample: [1. 0. 1. 1. 0.]
Feature 167 | Mean:   1.1501 

## 3. Initialize the GAT Model and Training
Set hyperparameters and move model/data to GPU/CPU if available.

In [6]:
from model_gat import GATNet
from train_utils import train_full, save_feature_experiment
import os
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Ablations listed here are skipped (treated as already trained).
SKIP_GAT_TEMPORAL = {"base+structural", "base+typology", "base+structural+typology", "all", "base+basic_temporal", "base+basic_temporal+typology"}

# Hyperparameters are defined once and reused for both the run and the saved
# config, so the logged values cannot drift from the values actually used.
HIDDEN_CHANNELS = 8
OUT_CHANNELS = 2
HEADS = 8
DROPOUT = 0.1
LEARNING_RATE = 0.001
WEIGHT_DECAY = 5e-4
NUM_EPOCHS = 300
PATIENCE = 30

for (ablation_name, seed), bundle in FEATURED_DATA.items():
    if ablation_name in SKIP_GAT_TEMPORAL:
        print(f"Skipping already trained config: {ablation_name}, seed: {seed}")
        continue

    print(f"\n=== Training GAT (Temporal): {ablation_name} | Seed: {seed} ===")

    data_exp = bundle["data"]
    train_idx = bundle["train_idx"]
    val_idx = bundle["val_idx"]
    feature_flags = bundle["feature_flags"]
    feature_dim = data_exp.x.shape[1]  # input width varies with the ablation

    # Initialize model
    model = GATNet(
        in_channels=feature_dim,
        hidden_channels=HIDDEN_CHANNELS,
        out_channels=OUT_CHANNELS,
        heads=HEADS,
        dropout=DROPOUT
    ).to(device)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # Train model; everything returned after the model is forwarded to the saver unchanged
    model, *results = train_full(
        model=model,
        data=data_exp,
        train_idx=train_idx,
        val_idx=val_idx,
        optimizer=optimizer,
        num_epochs=NUM_EPOCHS,
        patience=PATIENCE
    )

    # Save logs and model
    output_dir = f"../model_features/randomsplit/GAT_ablation/{ablation_name}/seed_{seed}"
    os.makedirs(output_dir, exist_ok=True)

    save_feature_experiment(
        output_dir=output_dir,
        model=model,
        results=results,
        seed=seed,
        config={
            "model": "GAT",
            "ablation_name": ablation_name,
            "feature_flags": feature_flags,
            "feature_dim": feature_dim,
            "dropout": DROPOUT,
            "num_epochs": NUM_EPOCHS,
            "patience": PATIENCE,
            "lr": LEARNING_RATE,
            "weight_decay": WEIGHT_DECAY,
            # Architecture fields are recorded at save time so config.json is
            # complete for rebuilding the model during evaluation.
            # (Assumes save_feature_experiment accepts extra config keys — TODO confirm.)
            "hidden_channels": HIDDEN_CHANNELS,
            "heads": HEADS,
            "out_channels": OUT_CHANNELS
        },
        val_idx=val_idx
    )



=== Training GAT (Temporal): base | Seed: 42 ===
Epoch 001 | Loss: 0.9464 | Val Acc: 0.6367 | Val F1: 0.4907 | F1 Illicit: 0.7634
Epoch 002 | Loss: 0.8481 | Val Acc: 0.6947 | Val F1: 0.5340 | F1 Illicit: 0.8076
Epoch 003 | Loss: 0.7000 | Val Acc: 0.7294 | Val F1: 0.5633 | F1 Illicit: 0.8326
Epoch 004 | Loss: 0.6189 | Val Acc: 0.7585 | Val F1: 0.5887 | F1 Illicit: 0.8530
Epoch 005 | Loss: 0.5792 | Val Acc: 0.7786 | Val F1: 0.6050 | F1 Illicit: 0.8669
Epoch 006 | Loss: 0.5319 | Val Acc: 0.7931 | Val F1: 0.6154 | F1 Illicit: 0.8768
Epoch 007 | Loss: 0.5092 | Val Acc: 0.8039 | Val F1: 0.6229 | F1 Illicit: 0.8842
Epoch 008 | Loss: 0.4734 | Val Acc: 0.8157 | Val F1: 0.6296 | F1 Illicit: 0.8922
Epoch 009 | Loss: 0.4759 | Val Acc: 0.8296 | Val F1: 0.6392 | F1 Illicit: 0.9013
Epoch 010 | Loss: 0.4294 | Val Acc: 0.8410 | Val F1: 0.6463 | F1 Illicit: 0.9087
Epoch 011 | Loss: 0.4096 | Val Acc: 0.8530 | Val F1: 0.6534 | F1 Illicit: 0.9165
Epoch 012 | Loss: 0.3933 | Val Acc: 0.8626 | Val F1: 0.6572

## 4. Evaluation

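Patch for errors caused by incomplete saved configs: the cell below fills in any missing or stale `config.json` fields (`feature_dim`, and for GAT models `hidden_channels`, `heads` and `out_channels`) using the saved data and model checkpoint.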

In [2]:
import os
import json
import torch

base_dir = "../model_features/randomsplit/GAT_ablation"
data_dir = "../data/GAT_randomsplit"
seeds = [42, 123, 777, 2023, 31415]

for ablation in sorted(os.listdir(base_dir)):
    ablation_model_path = os.path.join(base_dir, ablation)
    ablation_data_path = os.path.join(data_dir, ablation)

    if not os.path.isdir(ablation_model_path):
        continue

    print(f"\n→ Processing ablation: {ablation}")

    for seed in seeds:
        seed_dir = f"seed_{seed}"
        config_path = os.path.join(ablation_model_path, seed_dir, "config.json")
        model_path = os.path.join(ablation_model_path, seed_dir, "model.pth")
        data_path = os.path.join(ablation_data_path, seed_dir, "data_exp.pt")

        # All three artifacts are required; incomplete runs are skipped silently.
        if not all(os.path.exists(p) for p in [config_path, model_path, data_path]):
            continue

        # Load existing config
        with open(config_path, "r") as f:
            config = json.load(f)

        updated = False
        gat_fields_recovered = True

        # Load data to recover feature_dim. A cell-local name is used so the
        # notebook-level `data` is not overwritten. weights_only=False unpickles
        # a full object: only use on files this notebook produced.
        featured_graph = torch.load(data_path, map_location="cpu", weights_only=False)
        feature_dim = featured_graph.x.shape[1]
        if config.get("feature_dim") != feature_dim:
            config["feature_dim"] = feature_dim
            updated = True

        # Patch GAT-only fields
        if config.get("model") == "GAT":
            state_dict = torch.load(model_path, map_location="cpu")

            if "gat1.lin.weight" in state_dict:
                total_hidden_dim = state_dict["gat1.lin.weight"].shape[0]  # = heads * hidden_channels
                # If the config has no `heads`, 8 is assumed (the value the
                # training cell in this notebook uses).
                heads = config.get("heads", 8)
                hidden_channels = total_hidden_dim // heads

                if config.get("hidden_channels") != hidden_channels:
                    config["hidden_channels"] = hidden_channels
                    updated = True
                if config.get("heads") != heads:
                    config["heads"] = heads
                    updated = True
                if config.get("out_channels") != 2:
                    config["out_channels"] = 2
                    updated = True
            else:
                # Architecture fields cannot be recovered, but a feature_dim
                # correction made above must still be written. Previously a
                # `continue` here silently discarded it.
                print(f"[!] Warning: gat1.lin.weight not found in {model_path}")
                gat_fields_recovered = False

        if updated:
            with open(config_path, "w") as f:
                json.dump(config, f, indent=4)
            print(f"✓ Patched config for {ablation} / seed {seed}")
        elif gat_fields_recovered:
            # Only report "complete" when nothing was missing; after the warning
            # above the config may still lack the architecture fields.
            print(f"✓ Already complete: {ablation} / seed {seed}")



→ Processing ablation: all
✓ Already complete: all / seed 42
✓ Already complete: all / seed 123
✓ Already complete: all / seed 777
✓ Already complete: all / seed 2023
✓ Already complete: all / seed 31415

→ Processing ablation: base
✓ Already complete: base / seed 42
✓ Already complete: base / seed 123
✓ Already complete: base / seed 777
✓ Already complete: base / seed 2023
✓ Already complete: base / seed 31415

→ Processing ablation: base+basic_temporal
✓ Already complete: base+basic_temporal / seed 42
✓ Already complete: base+basic_temporal / seed 123
✓ Already complete: base+basic_temporal / seed 777
✓ Already complete: base+basic_temporal / seed 2023
✓ Already complete: base+basic_temporal / seed 31415

→ Processing ablation: base+basic_temporal+typology
✓ Already complete: base+basic_temporal+typology / seed 42
✓ Already complete: base+basic_temporal+typology / seed 123
✓ Already complete: base+basic_temporal+typology / seed 777
✓ Already complete: base+basic_temporal+typology / 

In [None]:
import os
# NOTE(review): the recorded output of this cell is an ImportError for
# `evaluate_model_from_checkpoint` from `features.feature_utils`. That name is
# not imported here, so it is presumably requested inside one of the project
# modules imported below — confirm and fix in that module.
from features.evaluate_ablation_model import evaluate_ablation_model
from evaluation_pipeline import run_inference_all_seeds
from analysis_utils import load_metrics_across_seeds, log_metrics_to_csv
from features.feature_utils import make_model_class_from_config

# === Config ===
seeds = [42, 123, 777, 2023, 31415]
base_dir = "../model_features/randomsplit/GAT_ablation"
data_exp_root = "../data/GAT_randomsplit"
node_id_csv_path = "../elliptic_bitcoin_dataset/elliptic_txs_features.csv"

model_tag = "GAT-Random"
split = "randomsplit"

# === Evaluate all ablations ===
for ablation in sorted(os.listdir(base_dir)):
    model_dir = os.path.join(base_dir, ablation)
    if not os.path.isdir(model_dir):
        continue

    print(f"\n→ Evaluating ablation: {ablation}")
    model_name = f"{model_tag}: {ablation}"
    ablation_data_dir = os.path.join(data_exp_root, ablation)

    # The model class is built from the config of the first configured seed
    # (previously hard-coded to seed_42), so editing `seeds` cannot leave this
    # pointing at a run that is not evaluated. Assumes all seeds of an ablation
    # share the same architecture — TODO confirm.
    reference_seed = seeds[0]
    config_path = os.path.join(model_dir, f"seed_{reference_seed}", "config.json")
    model_class = make_model_class_from_config(config_path)

    # Run full evaluation
    evaluate_ablation_model(
        model_dir=model_dir,
        model_class=model_class,
        model_name=model_name,
        seeds=seeds,
        node_id_csv_path=node_id_csv_path,
        data_dir=ablation_data_dir
    )

    # Run inference (to extract structured metrics for CSV logging).
    # Only `seed_metrics` is used below; the other returned values are kept
    # under their original names in case later cells read them.
    y_true_all, y_pred_all, y_proba_all, seed_metrics = run_inference_all_seeds(
        model_dir=model_dir,
        model_class=model_class,
        data_dir=ablation_data_dir,
        seeds=seeds
    )
    val_acc_list = load_metrics_across_seeds(model_dir, ["val_acc"])["val_acc"]

    # Save to CSV
    log_metrics_to_csv(
        model_name=model_tag,
        split_name=split,
        ablation=ablation,
        seeds=seeds,
        val_acc_list=val_acc_list,
        seed_metrics=seed_metrics,
        is_feature=True  # ← optional flag to append _feature to filename
    )


ImportError: cannot import name 'evaluate_model_from_checkpoint' from 'features.feature_utils' (c:\Users\ruoho\Documents\BSP\src\features\feature_utils.py)