## Imports

In [None]:
import os
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv
from torch_geometric.utils import from_networkx

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## EDA

**A Labeled Transactions-Based Dataset on the Ethereum Network**

Link: https://github.com/salam-ammari/Labeled-Transactions-based-Dataset-of-Ethereum-Network/tree/master

In [None]:
# load the labeled Ethereum transactions dataset from the local CSV file
eth = pd.read_csv("data/ammari/dataset.csv")
# show the last two rows of the loaded table
eth.tail(2)

In [None]:
# list the column names of the dataset
eth.columns

In [None]:
# print the per-column summary (dtypes and non-null counts) of the dataset
eth.info()

## Graph Construction

In [None]:
def _collect_address_labels(df, side, address_info):
    """
    Fold the labels of one transaction side into address_info (updated in place)

    side is 'from' or 'to' and selects the '<side>_address', '<side>_scam'
    and '<side>_category' columns of df.
    """
    columns = [f'{side}_address', f'{side}_scam', f'{side}_category']
    labels = df[columns].drop_duplicates()
    for addr, scam, category in labels.itertuples(index=False, name=None):
        info = address_info.setdefault(addr, {'is_scam': 0, 'category': "Unknown"})

        # an address counts as scam as soon as one of its rows is flagged
        if scam == 1:
            info['is_scam'] = 1

        # the last non-missing category seen for an address wins
        if not pd.isna(category):
            info['category'] = category


def _float_or_zero(value):
    """Convert value to float, mapping missing values to 0.0"""
    return 0.0 if pd.isna(value) else float(value)


def build_transaction_graph(df):
    """
    Build a graph from Ethereum transactions
    Nodes are addresses, edges are transactions

    Parameters:
        df: DataFrame with the columns from_address, to_address, hash, value,
            gas, gas_price, block_timestamp, from_scam, to_scam,
            from_category and to_category

    Returns:
        nx.DiGraph whose nodes carry 'is_scam' and 'category' and whose edges
        carry 'tx_hash', 'value', 'gas', 'gas_price', 'timestamp' (seconds
        since the epoch) and 'is_fraud'

    NOTE: a DiGraph stores a single edge per (from, to) pair, so when several
    transactions share the same sender and receiver only the attributes of the
    last one in df are kept.
    """
    G = nx.DiGraph()

    # gather labels, senders first so node order follows first appearance
    address_info = {}
    _collect_address_labels(df, 'from', address_info)
    _collect_address_labels(df, 'to', address_info)

    # add nodes
    for addr, info in tqdm(address_info.items(), desc="Adding nodes"):
        G.add_node(addr, is_scam=info['is_scam'], category=info['category'])

    # prepare transaction data
    tx_columns = ['from_address', 'to_address', 'hash', 'value', 'gas',
                  'gas_price', 'block_timestamp', 'from_scam', 'to_scam']
    rows = df[tx_columns].itertuples(index=False, name=None)

    edge_data = []
    # total is given explicitly: the row iterator has no length, so tqdm
    # could not show progress otherwise
    for (from_addr, to_addr, tx_hash, value, gas, gas_price,
         block_timestamp, from_scam, to_scam) in tqdm(rows, total=len(df),
                                                      desc="Processing transactions"):
        # fraudulent if either sender or receiver is involved in scam
        is_fraud = (from_scam == 1) or (to_scam == 1)

        edge_data.append((
            from_addr,
            to_addr,
            {
                'tx_hash': tx_hash,
                'value': _float_or_zero(value),
                'gas': _float_or_zero(gas),
                'gas_price': _float_or_zero(gas_price),
                'timestamp': pd.to_datetime(block_timestamp).timestamp(),
                'is_fraud': int(is_fraud)
            }
        ))

    # add edges
    G.add_edges_from(edge_data)

    return G

In [None]:
# build the address/transaction graph from the loaded dataset and report its size
transaction_graph = build_transaction_graph(eth)
print(f"Graph built with {transaction_graph.number_of_nodes()} nodes and {transaction_graph.number_of_edges()} edges")

## Feature Engineering

In [None]:
def engineer_node_features(G):
    """
    Compute per-address features from the transactions touching each node

    Parameters:
        G: directed graph whose edges carry 'value' and 'timestamp'

    Returns:
        dict mapping each node to a dict of features: transaction counts,
        total/average/maximum value and average time gap for the incoming and
        outgoing side, in/out degree and net value flow (in minus out)
    """
    def mean_gap(timestamps):
        # average spacing between consecutive timestamps, 0 with fewer than two
        gaps = np.diff(sorted(timestamps)) if len(timestamps) > 1 else [0]
        return np.mean(gaps)

    features_by_node = {}

    for address in G.nodes():
        # attribute dicts of the transactions received / sent by this address
        received = [attrs for _, _, attrs in G.in_edges(address, data=True)]
        sent = [attrs for _, _, attrs in G.out_edges(address, data=True)]

        values_in = [attrs['value'] for attrs in received]
        values_out = [attrs['value'] for attrs in sent]

        # sum() of an empty list is already 0
        sum_in = sum(values_in)
        sum_out = sum(values_out)

        features_by_node[address] = {
            'num_in_tx': len(received),
            'num_out_tx': len(sent),
            'total_in_value': sum_in,
            'total_out_value': sum_out,
            'avg_in_value': np.mean(values_in) if values_in else 0,
            'avg_out_value': np.mean(values_out) if values_out else 0,
            'max_in_value': max(values_in) if values_in else 0,
            'max_out_value': max(values_out) if values_out else 0,
            'avg_in_interval': mean_gap([attrs['timestamp'] for attrs in received]),
            'avg_out_interval': mean_gap([attrs['timestamp'] for attrs in sent]),
            'in_degree': G.in_degree(address),
            'out_degree': G.out_degree(address),
            'net_flow': sum_in - sum_out
        }

    return features_by_node

def engineer_edge_features(G):
    """
    Engineer edge features for transactions in the graph

    Parameters:
        G: directed graph whose edges carry 'value', 'gas', 'gas_price',
           'timestamp' and 'is_fraud'

    Returns:
        dict mapping (u, v) to a dict with the raw edge attributes, the share
        of the sender's outgoing / receiver's incoming value this transaction
        represents, the gas cost relative to the value, and the 'is_fraud' label
    """
    # per-node totals are computed once and reused for every edge of that
    # node instead of being re-summed per edge
    sender_totals = {}
    receiver_totals = {}

    edge_features = {}
    for u, v, data in G.edges(data=True):
        if u not in sender_totals:
            sender_totals[u] = sum(attrs['value'] for _, _, attrs in G.out_edges(u, data=True))
        if v not in receiver_totals:
            receiver_totals[v] = sum(attrs['value'] for _, _, attrs in G.in_edges(v, data=True))

        value = data['value']
        sender_total_out = sender_totals[u]
        receiver_total_in = receiver_totals[v]

        # store edge features; ratios fall back to 0 when the denominator is not positive
        edge_features[(u, v)] = {
            'value': value,
            'gas': data['gas'],
            'gas_price': data['gas_price'],
            'timestamp': data['timestamp'],
            # pct of sender's outgoing value this transaction represents
            'pct_of_sender_outgoing': value / sender_total_out if sender_total_out > 0 else 0,
            # pct of receiver's incoming value this transaction represents
            'pct_of_receiver_incoming': value / receiver_total_in if receiver_total_in > 0 else 0,
            'gas_to_value_ratio': data['gas'] * data['gas_price'] / value if value > 0 else 0,
            'is_fraud': data['is_fraud']
        }

    return edge_features

In [None]:
# feature engineering
node_features = engineer_node_features(transaction_graph)
edge_features = engineer_edge_features(transaction_graph)

# one row per node / per edge, one column per engineered feature
node_df = pd.DataFrame.from_dict(node_features, orient='index')
edge_df = pd.DataFrame.from_dict(edge_features, orient='index')

# standardize the features; 'is_fraud' is the edge label and is kept out of the scaling
# NOTE(review): the scalers are fit on every node/edge, i.e. before train_model splits the edges
node_scaler = StandardScaler()
edge_scaler = StandardScaler()
edge_feature_columns = edge_df.columns.drop('is_fraud')

node_features_scaled = node_scaler.fit_transform(node_df.fillna(0))
edge_features_scaled = edge_scaler.fit_transform(edge_df[edge_feature_columns].fillna(0))

# write the scaled values back onto the graph (edge attributes of the same name are overwritten)
for node, scaled_row in zip(node_df.index, node_features_scaled):
    transaction_graph.nodes[node].update(zip(node_df.columns, scaled_row))

for (u, v), scaled_row in zip(edge_df.index, edge_features_scaled):
    transaction_graph.edges[u, v].update(zip(edge_feature_columns, scaled_row))

print("Feature engineering completed")

In [None]:
# prepare data for GNN

def prepare_pytorch_geometric_data(G):
    """
    Turn the attributed networkx graph into a PyTorch Geometric Data object

    Every node/edge attribute except the label-like ones ('is_scam' and
    'category' on nodes, 'is_fraud' and 'tx_hash' on edges) becomes a feature
    column; columns are ordered by sorted attribute name and missing
    attributes are left at 0.

    Returns:
        (data, node_mapping): data holds x, edge_index, edge_attr, y_node
        (is_scam), y_edge (is_fraud) plus the feature name lists;
        node_mapping maps each node to its row index in x
    """
    # NODES

    nodes_with_attrs = list(G.nodes(data=True))
    node_mapping = {node: row for row, (node, _) in enumerate(nodes_with_attrs)}

    # union of the attribute names over all nodes, minus the labels
    non_feature_node_keys = {'is_scam', 'category'}
    node_feature_keys = set()
    for _, attrs in nodes_with_attrs:
        node_feature_keys |= attrs.keys() - non_feature_node_keys

    # sorted so the column order is reproducible
    node_feature_keys = sorted(node_feature_keys)
    num_node_features = len(node_feature_keys)

    num_nodes = len(nodes_with_attrs)
    x = torch.zeros((num_nodes, num_node_features))
    y_nodes = torch.zeros(num_nodes, dtype=torch.long)

    for row, (_, attrs) in enumerate(nodes_with_attrs):
        # a missing attribute contributes 0.0, which matches the zero initialisation
        x[row] = torch.tensor([float(attrs.get(key, 0.0)) for key in node_feature_keys])
        y_nodes[row] = attrs.get('is_scam', 0)

    # EDGES

    edges_with_attrs = list(G.edges(data=True))

    non_feature_edge_keys = {'is_fraud', 'tx_hash'}
    edge_feature_keys = set()
    for _, _, attrs in edges_with_attrs:
        edge_feature_keys |= attrs.keys() - non_feature_edge_keys

    edge_feature_keys = sorted(edge_feature_keys)
    num_edge_features = len(edge_feature_keys)

    num_edges = len(edges_with_attrs)
    edge_index = torch.zeros((2, num_edges), dtype=torch.long)
    edge_attr = torch.zeros((num_edges, num_edge_features))
    edge_labels = torch.zeros(num_edges, dtype=torch.long)

    for col, (u, v, attrs) in enumerate(edges_with_attrs):
        # column `col` of edge_index describes the same edge as row `col` of edge_attr
        edge_index[0, col] = node_mapping[u]
        edge_index[1, col] = node_mapping[v]
        edge_attr[col] = torch.tensor([float(attrs.get(key, 0.0)) for key in edge_feature_keys])
        edge_labels[col] = attrs.get('is_fraud', 0)

    # create PyG
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr,
                y_node=y_nodes, y_edge=edge_labels)

    # keep the column names next to the tensors for interpretability
    data.node_feature_names = node_feature_keys
    data.edge_feature_names = edge_feature_keys

    print(f"Node features: {num_node_features}, Edge features: {num_edge_features}")

    return data, node_mapping

In [None]:
# convert the feature-annotated graph into a PyG Data object; node_mapping maps each address to its row index
pyg_data, node_mapping = prepare_pytorch_geometric_data(transaction_graph)

## Model

In [None]:
class GNNFraudDetector(torch.nn.Module):
    """
    Edge classifier: three graph convolutions embed the nodes, then an MLP
    scores every edge from its two endpoint embeddings and its own features
    """

    def __init__(self, num_node_features, num_edge_features, hidden_channels):
        super().__init__()

        # node encoder: two GCN layers followed by one GraphSAGE layer
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)

        # edge head input = source embedding + destination embedding + edge features
        nn = torch.nn
        head_in = 2 * hidden_channels + num_edge_features
        head_mid = hidden_channels // 2
        self.edge_mlp = nn.Sequential(
            nn.Linear(head_in, hidden_channels),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_channels),
            nn.Dropout(0.2),
            nn.Linear(hidden_channels, head_mid),
            nn.ReLU(),
            nn.BatchNorm1d(head_mid),
            nn.Dropout(0.2),
            nn.Linear(head_mid, 2)  # two outputs: one score per class
        )

    def forward(self, x, edge_index, edge_attr):
        """Return one row of two class scores per column of edge_index"""
        # node embedding (no activation after the last convolution)
        h = torch.relu(self.conv1(x, edge_index))
        h = torch.relu(self.conv2(h, edge_index))
        h = self.conv3(h, edge_index)

        # row 0 of edge_index holds the source nodes, row 1 the destination nodes
        src, dst = edge_index
        edge_inputs = torch.cat([h[src], h[dst], edge_attr], dim=1)

        return self.edge_mlp(edge_inputs)

# parameters: input sizes are read from the column counts of the prepared feature matrices
num_node_features = pyg_data.x.shape[1]
num_edge_features = pyg_data.edge_attr.shape[1]
hidden_channels = 64  # width of the node embeddings and of the first edge-MLP layer

# initialize model
model = GNNFraudDetector(num_node_features, num_edge_features, hidden_channels)
print(f"Model created with {num_node_features} node features and {num_edge_features} edge features")

## Train

In [None]:
def train_model(model, data, epochs=100, lr=0.001):
    """
    Train the edge classifier and report validation and test metrics

    The edges are split 70% / 15% / 15% into train / validation / test with a
    fixed random_state, so the split is reproducible. Every epoch runs the
    model on the whole graph and computes the loss on the training edges only.

    Parameters:
        model: module called as model(x, edge_index, edge_attr) that returns
               one row of two class scores per edge
        data: object providing x, edge_index, edge_attr and y_edge
        epochs: number of full-graph training steps
        lr: learning rate of the Adam optimizer

    Returns:
        (test_indices, predicted): the list of test edge indices and a tensor
        with the predicted class of each of them, in the same order
    """
    # split edges into train/val/test
    num_edges = data.edge_attr.shape[0]
    indices = list(range(num_edges))

    # 70% train, 15% val, 15% test
    train_indices, temp_indices = train_test_split(indices, test_size=0.3, random_state=42)
    val_indices, test_indices = train_test_split(temp_indices, test_size=0.5, random_state=42)

    # convert the index lists once; indexing with tensors on the label's
    # device avoids rebuilding them on every epoch
    device = data.y_edge.device
    train_idx = torch.tensor(train_indices, dtype=torch.long, device=device)
    val_idx = torch.tensor(val_indices, dtype=torch.long, device=device)
    test_idx = torch.tensor(test_indices, dtype=torch.long, device=device)

    train_labels = data.y_edge[train_idx]
    val_labels = data.y_edge[val_idx]
    test_labels = data.y_edge[test_idx]

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

    # class weights to handle imbalanced data: each class is weighted by the
    # frequency of the other one, measured on the training edges only so the
    # held-out labels do not influence training
    num_fraud = (train_labels == 1).sum().item()
    num_non_fraud = (train_labels == 0).sum().item()

    if num_fraud > 0 and num_non_fraud > 0:
        total = num_fraud + num_non_fraud
        class_weights = torch.tensor([num_fraud / total, num_non_fraud / total], device=device)
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
    else:
        # with a single class present its weight would be 0 and the weighted
        # mean loss would become 0/0, so fall back to the unweighted loss
        criterion = torch.nn.CrossEntropyLoss()

    # training loop
    model.train()
    for epoch in range(epochs):

        # forward pass
        optimizer.zero_grad()
        edge_pred = model(data.x, data.edge_index, data.edge_attr)

        # compute loss on training edges
        loss = criterion(edge_pred[train_idx], train_labels)

        # backward pass and optimization
        loss.backward()
        optimizer.step()

        # validation every 5 epochs, in eval mode and without gradients
        if epoch % 5 == 0:
            model.eval()
            with torch.no_grad():
                val_pred = model(data.x, data.edge_index, data.edge_attr)[val_idx]
                val_loss = criterion(val_pred, val_labels)

                # accuracy
                _, predicted = torch.max(val_pred, 1)
                val_acc = (predicted == val_labels).sum().item() / len(val_indices)

            print(f'Epoch {epoch}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Val Acc: {val_acc:.4f}')
            model.train()

    # test evaluation
    model.eval()
    with torch.no_grad():
        test_pred = model(data.x, data.edge_index, data.edge_attr)[test_idx]
        test_loss = criterion(test_pred, test_labels).item()

        _, predicted = torch.max(test_pred, 1)
        test_acc = (predicted == test_labels).sum().item() / len(test_indices)

        # precision, recall and F1 of the fraud class, counted on tensors so
        # this also works when the data does not live on the CPU
        true_positives = ((predicted == 1) & (test_labels == 1)).sum().item()
        false_positives = ((predicted == 1) & (test_labels == 0)).sum().item()
        false_negatives = ((predicted == 0) & (test_labels == 1)).sum().item()

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

    return test_indices, predicted

In [None]:
# train for 50 epochs; keep the test edge indices and the predictions made for them
test_indices, predictions = train_model(model, pyg_data, epochs=50)

## Analysis

In [None]:
def _predicted_test_edges(G, test_indices, predictions):
    """Return (u, v, predicted_label) for every test index that maps onto an edge of G"""
    edge_list = list(G.edges())
    return [
        (*edge_list[idx], predictions[i].item())
        for i, idx in enumerate(test_indices)
        if idx < len(edge_list)
    ]


def _print_fraud_statistics(G, fraud_edges):
    """Print mean/median/std/min/max of every edge attribute over the given edges"""
    if not fraud_edges:
        print("No fraud transactions predicted in the test set.")
        return

    # collect the values per attribute, skipping the identifier and the label
    fraud_features = {}
    for u, v in fraud_edges:
        for feature, value in G.edges[u, v].items():
            if feature not in ['tx_hash', 'is_fraud']:
                fraud_features.setdefault(feature, []).append(value)

    print("\nFraud Transaction Statistics:")
    for feature, values in fraud_features.items():
        stats = {
            'mean': np.mean(values),
            'median': np.median(values),
            'std': np.std(values),
            'min': np.min(values),
            'max': np.max(values)
        }
        print(f"{feature}:")
        for stat_name, stat_value in stats.items():
            print(f"  {stat_name}: {stat_value}")
        print()


def visualize_fraud_network(G, node_mapping, test_indices, predictions):
    """
    Visualize the transaction graph highlighting detected fraud transactions

    Draws the subgraph made of the test edges (scam addresses and predicted
    fraud edges in red), saves it to 'fraud_network_visualization.png', shows
    it, and prints summary statistics of the attributes stored on the edges
    predicted as fraud.

    Parameters:
        G: transaction graph; test indices refer to positions in G.edges()
        node_mapping: not used by this function, kept for interface compatibility
        test_indices: edge indices of the test split
        predictions: tensor of predicted classes, aligned with test_indices
    """
    labelled_edges = _predicted_test_edges(G, test_indices, predictions)

    # subgraph of test transactions
    test_graph = nx.DiGraph()
    test_graph.add_edges_from(
        (u, v, {'predicted_fraud': label == 1}) for u, v, label in labelled_edges
    )

    # color nodes based on scam label
    node_colors = [
        'red' if node in G and G.nodes[node].get('is_scam', 0) == 1 else 'blue'
        for node in test_graph.nodes()
    ]

    # color edges based on fraud prediction
    edge_colors = [
        'red' if attr.get('predicted_fraud', False) else 'gray'
        for _, _, attr in test_graph.edges(data=True)
    ]

    # plot the graph
    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(test_graph, seed=42)
    nx.draw_networkx_nodes(test_graph, pos, node_color=node_colors, node_size=50, alpha=0.8)
    nx.draw_networkx_edges(test_graph, pos, edge_color=edge_colors, width=1, alpha=0.6)

    plt.title('Ethereum Transaction Graph with Fraud Predictions')
    red_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=10, label='Scam Address')
    blue_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='blue', markersize=10, label='Normal Address')
    red_edge = plt.Line2D([0], [0], color='red', lw=2, label='Predicted Fraud Transaction')
    gray_edge = plt.Line2D([0], [0], color='gray', lw=2, label='Normal Transaction')

    plt.legend(handles=[red_patch, blue_patch, red_edge, gray_edge], loc='upper right')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('fraud_network_visualization.png')
    plt.show()

    # analyze top fraud patterns
    print("\nAnalyzing fraud patterns...")

    predicted_fraud_edges = [(u, v) for u, v, label in labelled_edges if label == 1]
    _print_fraud_statistics(G, predicted_fraud_edges)

In [None]:
# plot the test edges with their fraud predictions and print statistics of the flagged ones
visualize_fraud_network(transaction_graph, node_mapping, test_indices, predictions)

In [None]:
# save and explain feature importance

def _print_importance(title, names, importance, fallback_prefix):
    """Print one 'name: value' line per feature, naming unnamed columns by position"""
    print(title)
    for i, value in enumerate(importance):
        name = names[i] if i < len(names) else f"{fallback_prefix}_{i}"
        print(f"{name}: {value.item():.4f}")


def analyze_feature_importance(model, data, node_mapping):
    """
    Analyze feature importance by measuring gradients

    The importance of a feature is the mean absolute gradient of the summed
    fraud-class score with respect to that feature column. Feature names are
    read from data.node_feature_names / data.edge_feature_names so they follow
    the column order of data.x / data.edge_attr. The model weights are then
    saved to 'models/eth_fraud_detector_gnn.pt'.

    Parameters:
        model: module called as model(x, edge_index, edge_attr)
        data: object providing x, edge_index, edge_attr and, optionally, the
              two feature name lists
        node_mapping: not used by this function, kept for interface compatibility
    """
    model.eval()

    # differentiate with respect to detached copies so the caller's tensors
    # are not left with requires_grad switched on
    x = data.x.detach().clone().requires_grad_(True)
    edge_attr = data.edge_attr.detach().clone().requires_grad_(True)

    # forward pass
    edge_pred = model(x, data.edge_index, edge_attr)
    fraud_score = edge_pred[:, 1]  # raw score (logit) of the fraud class

    # gradients with respect to the inputs only; the .grad of the model
    # parameters stays untouched
    x_grad, edge_attr_grad = torch.autograd.grad(
        fraud_score.sum(), [x, edge_attr], allow_unused=True
    )
    # an input the model ignores has no gradient: report it as zero importance
    if x_grad is None:
        x_grad = torch.zeros_like(x)
    if edge_attr_grad is None:
        edge_attr_grad = torch.zeros_like(edge_attr)

    node_feature_importance = x_grad.abs().mean(dim=0)
    edge_feature_importance = edge_attr_grad.abs().mean(dim=0)

    # print feature importance
    node_names = list(getattr(data, 'node_feature_names', None) or [])
    edge_names = list(getattr(data, 'edge_feature_names', None) or [])
    _print_importance("\nNode Feature Importance:", node_names,
                      node_feature_importance, "node_feature")
    _print_importance("\nEdge Feature Importance:", edge_names,
                      edge_feature_importance, "edge_feature")

    # save the model
    os.makedirs('models', exist_ok=True)
    torch.save(model.state_dict(), 'models/eth_fraud_detector_gnn.pt')
    print("\nModel saved to 'models/eth_fraud_detector_gnn.pt'")

In [None]:
# print gradient-based feature importances and save the trained model weights
analyze_feature_importance(model, pyg_data, node_mapping)