In [1]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.datasets import EllipticBitcoinDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

In [2]:
try:
    dataset = EllipticBitcoinDataset(root='./data/EllipticBitcoinDataset')
    data = dataset[0]
    print("✅ Dataset loaded successfully!")
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    print("Please check your internet connection or library installation.")
    exit()

Downloading https://data.pyg.org/datasets/elliptic/elliptic_txs_features.csv.zip
Downloading https://data.pyg.org/datasets/elliptic/elliptic_txs_edgelist.csv.zip
Downloading https://data.pyg.org/datasets/elliptic/elliptic_txs_classes.csv.zip
Processing...


✅ Dataset loaded successfully!


Done!


In [3]:
# Initial Data Exploration (EDA)
print("\n--- Graph Statistics ---")
print(f"Graph Data Object: {data}")
print(f"Number of nodes (transactions): {data.num_nodes}")
print(f"Number of edges (flows): {data.num_edges}")
print(f"Number of node features: {data.num_node_features}")
print(f"Number of classes: {dataset.num_classes}")


--- Graph Statistics ---
Graph Data Object: Data(x=[203769, 165], edge_index=[2, 234355], y=[203769], train_mask=[203769], test_mask=[203769])
Number of nodes (transactions): 203769
Number of edges (flows): 234355
Number of node features: 165
Number of classes: 2


In [4]:
# Explore Node Labels (Classes)
# In the Elliptic dataset: y=1 -> illicit, y=0 -> licit
# The 'train_mask' identifies nodes with known labels.
labeled_nodes_mask = data.train_mask
labels = data.y[labeled_nodes_mask]

num_labeled_nodes = labels.size(0)
num_illicit = (labels == 1).sum().item()
num_licit = (labels == 0).sum().item()

print("\n--- Node Label Distribution (on Labeled Data) ---")
print(f"Total labeled nodes: {num_labeled_nodes}")
print(f"Number of illicit (fraudulent) nodes: {num_illicit}")
print(f"Number of licit (normal) nodes: {num_licit}")
if num_labeled_nodes > 0:
    print(f"Proportion of illicit nodes: {num_illicit / num_labeled_nodes:.4f}")


--- Node Label Distribution (on Labeled Data) ---
Total labeled nodes: 29894
Number of illicit (fraudulent) nodes: 3462
Number of licit (normal) nodes: 26432
Proportion of illicit nodes: 0.1158


In [5]:
# ---  Preprocessing & Baseline Model ---

# Extract features and labels for labeled nodes
# Note: We use the original, unscaled features for the baseline model here.
node_features = data.x.numpy()[labeled_nodes_mask.numpy()]
node_labels = data.y.numpy()[labeled_nodes_mask.numpy()]

# 2. Split data into training and testing sets
# We use stratify to maintain the same class distribution in train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    node_features, node_labels, test_size=0.3, random_state=42, stratify=node_labels
)
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Training set size: 20925
Testing set size: 8969


In [6]:
# Scale features
# It's important to scale features for many ML models
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(" Features scaled successfully.")


 Features scaled successfully.


In [8]:
# Train Isolation Forest baseline model
# We only train it on the 'normal' data points from the training set.
iso_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
print("Training Isolation Forest on normal data...")
# Filter the training data to include only licit (0) nodes for training
X_train_normal = X_train_scaled[y_train == 0]
iso_forest.fit(X_train_normal)
print(" Baseline model trained.")

Training Isolation Forest on normal data...
 Baseline model trained.


In [9]:
# Evaluate the baseline model
# The model outputs -1 for anomalies and 1 for normal instances.
# We need to convert this to an anomaly score.
anomaly_scores = iso_forest.decision_function(X_test_scaled)
# We invert the scores because roc_auc_score expects higher values for the positive class (anomalies)
inverted_scores = -anomaly_scores

auc_score = roc_auc_score(y_test, inverted_scores)

print("\n--- Baseline Model Performance ---")
print(f"Isolation Forest AUC-ROC Score: {auc_score:.4f}")


--- Baseline Model Performance ---
Isolation Forest AUC-ROC Score: 0.1213
