In [None]:
import pandas as pd
import torch
from torch_geometric.data import Data
import os

# Locations of the three Elliptic CSV files (edge list, class labels, features).
edgelist_csv = os.path.expanduser('~/Downloads/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
classes_csv = os.path.expanduser('~/Downloads/elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
features_csv = os.path.expanduser('~/Downloads/elliptic_bitcoin_dataset/elliptic_txs_features.csv')

# The edge list and the labels are read with the column names taken from each file.
edges_df = pd.read_csv(edgelist_csv)
labels_df = pd.read_csv(classes_csv)

# The features are read with explicit column names "0" .. "166"; because
# `names=` is supplied, the first line of the file is treated as data.
num_columns = 167
column_names = list(map(str, range(num_columns)))
features_df = pd.read_csv(features_csv, names=column_names)

# Give every transaction a contiguous node index (0 .. N-1), in order of first
# appearance in the first column of the features table.
tx_id_mapping = {tx_id: idx for idx, tx_id in enumerate(features_df.iloc[:, 0].unique())}

# Replace the transaction IDs of both edge endpoints with their node indices.
edges_df['txId1'] = edges_df['txId1'].map(tx_id_mapping)
edges_df['txId2'] = edges_df['txId2'].map(tx_id_mapping)

# An ID that is absent from the features table maps to NaN. Casting NaN to
# int64 would silently yield a meaningless node index, so fail loudly instead.
unmapped_edges = edges_df[['txId1', 'txId2']].isna().any(axis=1)
if unmapped_edges.any():
    raise ValueError(
        f"{int(unmapped_edges.sum())} edge(s) reference a transaction ID "
        "that does not appear in the features table"
    )

# Build the (2, num_edges) edge index from the two endpoint columns only, so
# any additional column in the edge list cannot leak into the tensor.
edge_index = torch.tensor(edges_df[['txId1', 'txId2']].to_numpy().T, dtype=torch.long)

# Column "0" holds the transaction ID; rewrite it as the node index.
features_df.loc[:, "0"] = features_df.loc[:, "0"].map(tx_id_mapping)

# Order the rows by node index so that row i of the feature matrix is node i.
features_df = features_df.sort_values("0")

# Everything except the index column becomes the float32 feature matrix.
feature_matrix = features_df.drop(columns="0").to_numpy()
x = torch.tensor(feature_matrix, dtype=torch.float)

# Map transaction IDs to node indices in the labels DataFrame
labels_df['txId'] = labels_df['txId'].map(tx_id_mapping)

# Encode the raw class strings as integers (values unchanged from before):
#   'unknown' -> 0 (unlabelled), '1' -> 1, '2' -> 2
# NOTE(review): the Elliptic dataset description lists class '1' as illicit
# and class '2' as licit -- confirm before interpreting the labels downstream.
# `astype(str)` keeps the lookup working if pandas parsed the column as integers.
labels_df['class'] = labels_df['class'].astype(str).map({'unknown': 0, '1': 1, '2': 2})

# Unmapped IDs or classes become NaN, which cannot be used as an index or cast
# to int64 meaningfully; stop here rather than build a corrupted label tensor.
if labels_df['txId'].isna().any():
    raise ValueError("labels reference a transaction ID that does not appear in the features table")
if labels_df['class'].isna().any():
    raise ValueError("labels contain a class other than 'unknown', '1' or '2'")

# Create the label tensor; nodes without a row in the labels file keep 0,
# the same value used for 'unknown'.
y = torch.zeros(x.size(0), dtype=torch.long)
label_nodes = torch.tensor(labels_df['txId'].to_numpy(), dtype=torch.long)
label_values = torch.tensor(labels_df['class'].to_numpy(), dtype=torch.long)
y[label_nodes] = label_values

# Create the PyTorch Geometric Data object
data = Data(x=x, edge_index=edge_index, y=y)

In [2]:
# Create boolean masks for a random 70 / 15 / 15 train / validation / test split.
# The permutation is drawn from a dedicated, seeded generator so that the split
# is identical on every Restart & Run All and the global RNG state is untouched.
SPLIT_SEED = 42
num_nodes = data.num_nodes
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
indices = torch.randperm(num_nodes, generator=split_rng)  # Random permutation of node indices

# Define proportions for train, validation, and test splits
train_size = int(0.7 * num_nodes)  # 70% for training
val_size = int(0.15 * num_nodes)   # 15% for validation
test_size = num_nodes - train_size - val_size  # Remaining ~15% for testing

# Cut the permutation into three disjoint index sets of the sizes above.
train_idx, val_idx, test_idx = torch.split(indices, [train_size, val_size, test_size])

# Generate masks
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# Assign True to the relevant masks
train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

# NOTE(review): the split is over all nodes, including those whose label is
# 0 ('unknown'); confirm that downstream training filters on `data.y != 0`.

# Add the masks to the data object
data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

In [3]:
# Show the Data object's own summary, followed by a separator line.
print()
print(data)
print('===========================================================================================================')

# Collect the quantities that are reported more than once.
node_count = data.num_nodes
edge_count = data.num_edges
train_count = int(data.train_mask.sum())

# Report basic statistics about the graph, one per line.
summary_lines = [
    f'Number of nodes: {node_count}',
    f'Number of edges: {edge_count}',
    f'Average node degree: {edge_count / node_count:.2f}',
    f'Number of training nodes: {train_count}',
    # Fraction of all nodes selected by train_mask.
    f'Training node label rate: {train_count / node_count:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
]
for line in summary_lines:
    print(line)


Data(x=[203769, 166], edge_index=[2, 234355], y=[203769], train_mask=[203769], val_mask=[203769], test_mask=[203769])
Number of nodes: 203769
Number of edges: 234355
Average node degree: 1.15
Number of training nodes: 142638
Training node label rate: 0.70
Has isolated nodes: False
Has self-loops: False
Is undirected: False


In [4]:
# Persist the assembled graph (features, edges, labels and split masks) to disk.
output_path = os.path.expanduser('~/Downloads/data_object.pt')
torch.save(data, output_path)

In [None]:
# NOTE(review): this cell only prints a fixed marker string and touches none of
# the notebook's data; it looks like a leftover scratch cell -- confirm whether
# it can be removed.
print("change 2")