# Data Exploration: Elliptic Dataset
# Augmenting network features
https://www.kaggle.com/datasets/ellipticco/elliptic-data-set/data

[1] Elliptic, www.elliptic.co.

[2] M. Weber, G. Domeniconi, J. Chen, D. K. I. Weidele, C. Bellei, T. Robinson, C. E. Leiserson, "Anti-Money Laundering in Bitcoin: Experimenting with Graph Convolutional Networks for Financial Forensics", KDD ’19 Workshop on Anomaly Detection in Finance, August 2019, Anchorage, AK, USA.

Description: The Elliptic Data Set maps Bitcoin transactions to real entities belonging to licit categories (exchanges, wallet providers, miners, licit services, etc.) versus illicit ones (scams, malware, terrorist organizations, ransomware, Ponzi schemes, etc.). The task on the dataset is to classify the illicit and licit nodes in the graph.


In [2]:
import networkx as nx
import pandas as pd
from pathlib import Path
import plotly
import numpy as np
import plotly.graph_objects as go

from src.data.load_data import load_elliptic_dataset
from src.visualization.graphs import plot_transaction_graph
from src.data.preprocess import corr_with_binary_labels
from scipy import stats
from scipy.stats import chi2_contingency
pd.options.plotting.backend = 'plotly'
from typing import List, Dict

In [3]:
import os
import pathlib
import random
from typing import Dict, List
from tqdm import tqdm

import numpy as np
import pandas as pd
import networkx as nx
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GCNConv

In [4]:
# Directory handed to the project loader; relative to the notebook's working directory.
DATA_PATH = 'data/raw/'
nodes_df, edges_df = load_elliptic_dataset(DATA_PATH)

# Swap the dataset's class codes for readable names ('unknown' is kept as-is).
nodes_df['class_label'] = nodes_df['class_label'].replace(
    {'1': 'illicit', '2': 'licit', 'unknown': 'unknown'}
)

In [6]:
# Seed every RNG the notebook relies on (Python, NumPy, PyTorch) from one named
# constant so the value only has to be changed in a single place.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x16985d4d0>

In [7]:
# Short aliases for the edge and node tables.
edges = edges_df
df = nodes_df
# Every column other than the identifier, the time step and the label is a model feature.
feature_cols = nodes_df.columns.drop(['txId', 'time_step', 'class_label']).tolist()


In [19]:
# Attach class labels to the gw_* features and show the illicit transactions
# whose gw_avg_step equals 1.
(
    gw_features
    .merge(nodes_df[['txId', 'class_label']], on='txId')
    .query('class_label == "illicit" & gw_avg_step == 1')
)

Unnamed: 0,txId,gw_hit_ratio,gw_min_step,gw_avg_step,gw_unique_illicit,class_label
907,232629023,1.0,1,1.0,1,illicit
1361,230389796,1.0,1,1.0,1,illicit
2718,17387772,1.0,1,1.0,1,illicit
2815,232947878,1.0,1,1.0,1,illicit
3423,16754007,1.0,1,1.0,1,illicit
...,...,...,...,...,...,...
203685,159043651,1.0,1,1.0,1,illicit
203708,158360779,1.0,1,1.0,1,illicit
203736,159028476,1.0,1,1.0,1,illicit
203759,158375075,1.0,1,1.0,1,illicit


In [20]:

##############################################################################
#                 Graph Convolutional Network Embeddings                      #
##############################################################################

class GCN(nn.Module):
    """Two-layer graph convolutional network: conv -> ReLU -> dropout -> conv."""

    def __init__(self, in_feats, hidden=64, out_feats=32, dropout=0.3):
        """Build the layers.

        Args:
            in_feats: Size of each input node feature vector.
            hidden: Output size of the first convolution.
            out_feats: Output size of the second convolution.
            dropout: Dropout probability applied after the activation.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels=in_feats, out_channels=hidden)
        self.conv2 = GCNConv(in_channels=hidden, out_channels=out_feats)
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()

    def forward(self, x, edge_index):
        """Return the second convolution's output for every node."""
        hidden_act = self.conv1(x, edge_index)
        hidden_act = self.dropout(self.relu(hidden_act))
        return self.conv2(hidden_act, edge_index)


def train_gcn_get_embeddings(df, time_step_graphs, feature_cols, epochs=30, lr=1e-3, device='cpu'):
    """Train one semi-supervised GCN per time step and return its node embeddings.

    Args:
        df: Node table with 'txId', 'time_step', 'class_label' and the columns
            listed in ``feature_cols``.
        time_step_graphs: Mapping of time step -> NetworkX graph whose nodes are txIds.
        feature_cols: Names of the columns used as node features.
        epochs: Number of full-batch optimisation steps per time step.
        lr: Adam learning rate.
        device: Torch device the data and the model are moved to.

    Returns:
        DataFrame with a 'txId' column and 'gcn_0' ... 'gcn_63' columns, one row per
        node that appears both in a time step's graph and in ``df``. Time steps with
        fewer than 10 graph nodes or table rows, no edge between known nodes, or no
        labelled node are skipped.
    """
    label_map = {'illicit': 1, 'licit': 0, 'unknown': -1}
    hidden_dim = 128
    embedding_dim = 64
    all_embeddings = []

    for time_step, G in tqdm(time_step_graphs.items(), desc="GCN Embeddings"):
        if G.number_of_nodes() < 10:  # Skip very small graphs
            continue

        sub_df = df[df['time_step'] == time_step]
        if len(sub_df) < 10:  # Skip if not enough data
            continue

        # Keep only the rows whose transaction is a node of this graph.
        node_df = sub_df[sub_df['txId'].isin(set(G.nodes()))]
        if node_df.empty:
            continue

        feat_mat = node_df[feature_cols].fillna(0).values

        # Row position of every node in the feature matrix. Edge indices MUST use
        # this mapping (not the graph's own node order) so that edge_index, x, y and
        # the returned txId column all refer to the same rows.
        node_to_feat_row = {node: row for row, node in enumerate(node_df['txId'])}

        # Keep an edge only when BOTH endpoints have features; filtering the two
        # ends independently would pair sources with the wrong destinations.
        edge_pairs = [
            (node_to_feat_row[src], node_to_feat_row[dst])
            for src, dst in G.edges()
            if src in node_to_feat_row and dst in node_to_feat_row
        ]
        if not edge_pairs:
            continue

        # Shape (2, num_edges), as expected by PyG.
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()

        x = torch.tensor(feat_mat, dtype=torch.float32)

        # Map labels: illicit=1, licit=0, unknown (or unmapped) = -1
        y = torch.tensor(
            node_df['class_label'].map(label_map).fillna(-1).astype(int).values,
            dtype=torch.long,
        )

        # Only labelled nodes contribute to the loss; the comparison already
        # yields a boolean tensor.
        train_mask = y >= 0
        if not train_mask.any():  # Skip if no labeled nodes
            continue

        data = Data(x=x, edge_index=edge_index, y=y)
        data.train_mask = train_mask
        data = data.to(device)

        # The embedding vector is fed directly to CrossEntropyLoss as class logits
        # (targets are 0/1), so the output layer doubles as the classifier head.
        model = GCN(in_feats=len(feature_cols), hidden=hidden_dim, out_feats=embedding_dim).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
        loss_fn = nn.CrossEntropyLoss()

        # Full-batch training loop
        model.train()
        for epoch in range(epochs):
            optimizer.zero_grad()
            out = model(data.x, data.edge_index)
            loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if epoch % 10 == 0:
                print(f"Time step {time_step}, Epoch {epoch}: Loss {loss.item():.4f}")

        # Embeddings are taken in eval mode so dropout is disabled.
        model.eval()
        with torch.no_grad():
            emb = model(data.x, data.edge_index).cpu().numpy()

        emb_df = pd.DataFrame(emb, columns=[f'gcn_{i}' for i in range(emb.shape[1])])
        emb_df['txId'] = node_df['txId'].values

        all_embeddings.append(emb_df)

    if all_embeddings:
        return pd.concat(all_embeddings, ignore_index=True)

    # Nothing was trained: return an empty frame that still has the expected columns.
    return pd.DataFrame(columns=['txId'] + [f'gcn_{i}' for i in range(embedding_dim)])



In [None]:

# 5. Train GCN and get embeddings
print("Training GCN and getting embeddings for each time step...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Keyword arguments: the function's positional order is (df, time_step_graphs, ...),
# which is easy to swap by accident.
gcn_embeddings = train_gcn_get_embeddings(
    df=df,
    time_step_graphs=time_step_graphs,
    feature_cols=feature_cols,
    device=device,
)

# 6. Merge all features
print("Merging features...")
final_df = df.merge(gw_features, on='txId', how='left')
final_df = final_df.merge(gcn_embeddings, on='txId', how='left')

# 7. Train classifier (time-based split as in the paper)
print("Training classifier...")
# Rows labelled 'unknown' are neither licit nor illicit, so they are excluded from
# both training and evaluation instead of being silently counted as licit.
is_labelled = (final_df['class_label'] != 'unknown').values
# Split by timestep (34 is the cutoff mentioned in the paper)
train_mask = is_labelled & (final_df['time_step'] <= 34).values
test_mask = is_labelled & (final_df['time_step'] > 34).values

# Prepare features
X_cols = feature_cols + [c for c in final_df.columns if c.startswith('gw_') or c.startswith('gcn_')]
X = final_df[X_cols].fillna(0).values  # Fill NAs for any missing embeddings
# Class 'illicit' is the target
y = (final_df['class_label'] == 'illicit').astype(int).values

X_train = X[train_mask]
y_train = y[train_mask]
X_test = X[test_mask]
y_test = y[test_mask]

# Train Random Forest
clf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Feature importance, most important first
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

print("\nTop 20 most important features:")
feature_names = X_cols
for rank, feat_idx in enumerate(indices[:20], start=1):
    print(f"{rank}. {feature_names[feat_idx]}: {importances[feat_idx]:.4f}")

# Save results next to the input data (DATA_PATH is set in the loading cell)
output_path = os.path.join(DATA_PATH, 'elliptic_processed.csv')
final_df.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")

In [13]:
feature_cols = nodes_df.drop(columns=['txId', 'time_step', 'class_label']).columns.to_list()
# For Mac: use the Metal (MPS) backend when it is available, otherwise fall back to
# the CPU so the notebook still runs on other machines.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [11]:

def build_nx_graph(edges):
    """Return a directed NetworkX graph with one edge per row of ``edges``.

    Each row contributes an edge from its 'src' value to its 'dst' value.
    """
    graph = nx.DiGraph()  # Directed graph as per the paper
    edge_pairs = zip(edges['src'], edges['dst'])
    graph.add_edges_from(edge_pairs)
    return graph

In [14]:
G = build_nx_graph(edges_df)

# train_gcn_get_embeddings expects a mapping of time step -> graph, so slice the
# full graph into one subgraph per time step using the node table.
time_step_graphs = {
    time_step: G.subgraph(group['txId'].tolist())
    for time_step, group in nodes_df.groupby('time_step')
}

# Keyword arguments: the function's positional order is (df, time_step_graphs, ...).
gcn_embeddings = train_gcn_get_embeddings(
    df=nodes_df,
    time_step_graphs=time_step_graphs,
    feature_cols=feature_cols,
    device=mps_device,
)
    

KeyboardInterrupt: 

In [None]:

    # NOTE(review): this cell is an indented fragment with no enclosing `def` in view;
    # it looks like the body of a function whose header was removed. Steps 5-7 below
    # repeat the un-indented pipeline cell earlier in the notebook.

    # 4. Compute GuiltyWalker features
    # NOTE(review): `guilty_walker_features` and `labels` are not defined in the
    # visible part of the notebook - confirm where they come from.
    print("Computing GuiltyWalker features...")
    gw_features = guilty_walker_features(G, labels)
    
    # 5. Train GCN and get embeddings
    print("Training GCN and getting embeddings...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    # NOTE(review): train_gcn_get_embeddings is declared as
    # (df, time_step_graphs, feature_cols, ...), so here G is bound to `df` and df to
    # `time_step_graphs` - confirm the intended argument order.
    gcn_embeddings = train_gcn_get_embeddings(G, df, feature_cols, device=device)
    
    # 6. Merge all features
    print("Merging features...")
    final_df = df.merge(gw_features, on='txId', how='left')
    final_df = final_df.merge(gcn_embeddings, on='txId', how='left')
    
    # 7. Train classifier (time-based split as in the paper)
    print("Training classifier...")
    # Split by timestep (34 is the cutoff mentioned in the paper)
    train_mask = final_df['time_step'] <= 34
    test_mask = final_df['time_step'] > 34
    
    # Prepare features
    X_cols = feature_cols + [c for c in final_df.columns if c.startswith('gw_') or c.startswith('gcn_')]
    # Unlike the earlier pipeline cell, missing values are not filled here.
    X = final_df[X_cols].values
    # Class '2' is illicit
    # NOTE(review): the loading cell replaces '1'/'2' with 'illicit'/'licit' (and maps
    # '2' to 'licit'), so this comparison presumably never matches - confirm and
    # compare against 'illicit' instead.
    y = (final_df['class_label'] == '2').astype(int).values
    
    X_train = X[train_mask]
    y_train = y[train_mask]
    X_test = X[test_mask]
    y_test = y[test_mask]
    
    # Train Random Forest
    clf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
    clf.fit(X_train, y_train)
    
    # Evaluate on the held-out time steps
    y_pred = clf.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    