In [7]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data, Dataset, DataLoader
from torch.nn import functional as F
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, average_precision_score
import os

In [8]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [9]:
# Load dataset
data_path = "LI-Small_Trans.csv"
df = pd.read_csv(data_path)

# Money Laundering Detection

### Encoding features

In [10]:
illicit_transactions = df["Is Laundering"].sum()
total_transactions = df["Is Laundering"].count()
print("Illicit Transactions : ", illicit_transactions)
print("Total Transactions : ", total_transactions)
print("Illicit Transactions Ratio : ", illicit_transactions/total_transactions)
print("Illicit Transactions Percentage : ", round(illicit_transactions/total_transactions*100,2), "%")

Illicit Transactions :  3565
Total Transactions :  6924049
Illicit Transactions Ratio :  0.0005148721506736881
Illicit Transactions Percentage :  0.05 %


In [11]:
# Step 1: Encode account strings to integers
account_encoder = LabelEncoder()
df['From Account Encoded'] = account_encoder.fit_transform(df['Account'])
df['To Account Encoded'] = account_encoder.fit_transform(df['Account.1'])

# Step 2: Encode currencies and payment formats
currency_encoder = LabelEncoder()
df['Receiving Currency Encoded'] = currency_encoder.fit_transform(df['Receiving Currency'])
df['Payment Currency Encoded'] = currency_encoder.fit_transform(df['Payment Currency'])

payment_format_encoder = LabelEncoder()
df['Payment Format Encoded'] = payment_format_encoder.fit_transform(df['Payment Format'])

# Step 3: Create nodes and edges
unique_accounts = pd.concat([df['From Account Encoded'], df['To Account Encoded']]).unique()
account_mapping = {account: i for i, account in enumerate(unique_accounts)}
df['From Account Encoded'] = df['From Account Encoded'].map(account_mapping)
df['To Account Encoded'] = df['To Account Encoded'].map(account_mapping)
num_nodes = len(unique_accounts)
edges = torch.tensor(df[['From Account Encoded', 'To Account Encoded']].values.T, dtype=torch.long)

# Step 4: Create edge features
edge_features = torch.tensor(
    df[['Amount Paid', 'Payment Currency Encoded', 'Amount Received', 'Receiving Currency Encoded', 'Payment Format Encoded']].values,
    dtype=torch.float
)

# Step 5: Create labels (is laundering)
edge_labels = torch.tensor(df['Is Laundering'].values, dtype=torch.long)

# Step 6: Create PyTorch Geometric Data object
data = Data(
    x=torch.randn(num_nodes, 32),  # Node features (random initialization)
    edge_index=edges,
    edge_attr=edge_features,
    y=edge_labels
)

# Move data to GPU
data = data.to(device)

### Custom class for dataset represenatation

In [12]:
# Step 7: Define a custom Dataset class
class GraphDataset(Dataset):
    def __init__(self, data, num_nodes):
        super(GraphDataset, self).__init__()
        self.data = data
        self.num_nodes = num_nodes

    def len(self):
        return 1  # Single graph

    def get(self, idx):
        return self.data

# Step 8: Create Dataset and DataLoader
dataset = GraphDataset(data, num_nodes)
loader = DataLoader(dataset, batch_size=32, shuffle=True)



### Message Passing Graph Neural Network (MPGNN) Implementation

In [13]:
# Step 9: Define the MPNN model
class MPNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, edge_feature_dim, out_channels):
        super(MPNN, self).__init__()
        # Message function (MLP for edge messages)
        self.message_mlp = torch.nn.Sequential(
            torch.nn.Linear(2 * in_channels + edge_feature_dim, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_channels, hidden_channels)
        )
        # Update function (GRU for node updates)
        self.update_gru = torch.nn.GRUCell(hidden_channels, in_channels)
        # Readout function (MLP for edge prediction)
        self.readout_mlp = torch.nn.Sequential(
            torch.nn.Linear(2 * hidden_channels + edge_feature_dim, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_channels, out_channels)
        )

    def forward(self, x, edge_index, edge_attr):
        row, col = edge_index  # Source and target nodes for each edge
        node_features_src = x[row]  # Source node features
        node_features_dst = x[col]  # Target node features
        # Concatenate source, destination, and edge features to form messages
        messages = torch.cat([node_features_src, node_features_dst, edge_attr], dim=1)
        messages = self.message_mlp(messages)  # Apply message function

        # Aggregate messages for each node
        aggregated_messages = torch.zeros_like(x)  # Initialize aggregated messages
        aggregated_messages = aggregated_messages.index_add_(0, col, messages)  # Sum messages for each node

        # Update node states using GRU
        x = self.update_gru(aggregated_messages, x)

        # Readout (Edge Prediction)
        edge_src = x[row]  # Updated source node features
        edge_dst = x[col]  # Updated destination node features
        edge_features = torch.cat([edge_src, edge_dst, edge_attr], dim=1)  # Concatenate edge features
        edge_logits = self.readout_mlp(edge_features)  # Predict edge labels
        return edge_logits

# Step 10: Initialize model and optimizer
model = MPNN(in_channels=32, hidden_channels=32, edge_feature_dim=edge_features.size(1), out_channels=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

### Splitting the dataset

In [14]:
# Step 11: Train/Test split
train_mask = torch.rand(data.edge_index.size(1)) < 0.8  # 80% for training
train_edge_index = data.edge_index[:, train_mask]
train_edge_attr = data.edge_attr[train_mask]
train_labels = data.y[train_mask]

test_edge_index = data.edge_index[:, ~train_mask]
test_edge_attr = data.edge_attr[~train_mask]
test_labels = data.y[~train_mask]

# Step 12: Use weighted CrossEntropyLoss
class_counts = torch.bincount(train_labels.cpu())
class_weights = 1.0 / class_counts.float()
class_weights = class_weights / class_weights.sum()
criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))

### Training the model

In [15]:
# Step 13: Training and testing functions
def train():
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, train_edge_index, train_edge_attr)
        loss = criterion(out, train_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def test():
    model.eval()
    with torch.no_grad():
        out = model(data.x, test_edge_index, test_edge_attr)
        pred = out.argmax(dim=1)
        correct = (pred == test_labels).sum()
        acc = correct / test_labels.size(0)
        return acc.item()

In [16]:
# Step 14: Training loop for fraud detection
for epoch in tqdm(range(1, 3)):  # Train for 100 epochs
    loss = train()
    acc = test()
    print(f"Epoch: {epoch}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

# Save the fraud detection model
torch.save(model.state_dict(), "mpnn_fraud_detection.pt")

 50%|█████     | 1/2 [01:31<01:31, 91.30s/it]

Epoch: 1, Loss: 42461.1562, Test Accuracy: 0.0516


100%|██████████| 2/2 [02:54<00:00, 87.08s/it]

Epoch: 2, Loss: 92085.9531, Test Accuracy: 0.1075





In [17]:
# Step 15: Save encoders and mappings
os.makedirs("encoders", exist_ok=True)
np.save("encoders/account_encoder_classes.npy", account_encoder.classes_)
np.save("encoders/currency_encoder_classes.npy", currency_encoder.classes_)
np.save("encoders/payment_format_encoder_classes.npy", payment_format_encoder.classes_)
np.save("encoders/account_mapping.npy", account_mapping)

# Link Prediction

### Marking Fraudulent Accounts

In [19]:
# Step 16: Mark fraudulent accounts
def mark_fraudulent_accounts(data, edge_labels):
    fraudulent_edges = (edge_labels == 1).nonzero().squeeze()
    fraudulent_accounts = set()
    for edge in fraudulent_edges:
        sender = data.edge_index[0, edge].item()
        receiver = data.edge_index[1, edge].item()
        fraudulent_accounts.add(sender)
        fraudulent_accounts.add(receiver)
    return fraudulent_accounts

fraudulent_accounts = mark_fraudulent_accounts(data, edge_labels)
print(f"Fraudulent accounts: {fraudulent_accounts}")

Fraudulent accounts: {557102, 458799, 131126, 557111, 524347, 67, 163912, 589897, 622681, 426080, 163938, 98418, 622723, 557196, 360592, 32912, 153, 131226, 131227, 655520, 131233, 295072, 655527, 98472, 360617, 180, 183, 229563, 262331, 98492, 189, 193, 197, 131269, 206, 208, 219, 233, 360682, 65771, 234, 33005, 131309, 235, 327929, 252, 262396, 164098, 258, 33027, 131333, 267, 393487, 524563, 524564, 280, 164135, 295, 622887, 393524, 65854, 325, 338, 426323, 426333, 352, 354, 360808, 65907, 33140, 373, 374, 459134, 295298, 590217, 393611, 401, 411, 328093, 423, 429, 393646, 431, 164272, 33212, 449, 131523, 459204, 459209, 459212, 557518, 459215, 459216, 467, 474, 459227, 478, 479, 459233, 131553, 459235, 485, 494, 131568, 33269, 197110, 503, 33288, 523, 538, 542, 543, 426530, 547, 552, 556, 98867, 564, 229943, 459320, 328250, 492091, 572, 98881, 426563, 584, 98889, 131664, 593, 598, 66134, 600, 492133, 426600, 629, 639, 641, 642, 459395, 649, 295561, 657, 262805, 524950, 663, 666, 66

### Building subgraph of fraudulent accounts

In [20]:
# Step 17: Build a subgraph of fraudulent accounts
def build_fraudulent_subgraph(data, fraudulent_accounts):
    fraudulent_accounts = list(fraudulent_accounts)
    # Create a mapping from original node indices to new indices in the subgraph
    node_mapping = {node: i for i, node in enumerate(fraudulent_accounts)}

    # Create a mask for edges involving fraudulent accounts
    edge_mask = torch.tensor([(u in fraudulent_accounts and v in fraudulent_accounts)
                              for u, v in data.edge_index.t().tolist()], dtype=torch.bool)

    # Extract the subgraph
    subgraph = Data(
        x=data.x[fraudulent_accounts],  # Node features for fraudulent accounts
        edge_index=data.edge_index[:, edge_mask],  # Filtered edges
        edge_attr=data.edge_attr[edge_mask],  # Filtered edge features
        y=data.y[edge_mask]  # Filtered edge labels
    )

    # Remap node indices in edge_index
    subgraph.edge_index = torch.tensor([
        [node_mapping[u.item()] for u in subgraph.edge_index[0]],  # Remap source nodes
        [node_mapping[v.item()] for v in subgraph.edge_index[1]]  # Remap target nodes
    ], dtype=torch.long)

    return subgraph, fraudulent_accounts

fraudulent_subgraph, fraudulent_accounts = build_fraudulent_subgraph(data, fraudulent_accounts)

### Predicting the link between two fraudulent account in the subgraph

In [16]:
# Step 18: Perform link prediction on the fraudulent subgraph
def predict_links(model, subgraph, threshold=0.5):
    model.eval()
    with torch.no_grad():
        out = model(subgraph.x, subgraph.edge_index, subgraph.edge_attr)
        preds = torch.sigmoid(out.squeeze())
        predicted_links = (preds > threshold).nonzero().squeeze().tolist()
    return predicted_links

predicted_links = predict_links(model, fraudulent_subgraph, threshold=0.7)
print(f"Predicted links between fraudulent accounts: {predicted_links}")

# Save the link prediction model
torch.save(model.state_dict(), "mpnn_link_prediction.pt")

Predicted links between fraudulent accounts: [[0, 0], [0, 1], [1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [4, 0], [4, 1], [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1], [9, 0], [10, 1], [11, 0], [11, 1], [12, 0], [12, 1], [13, 0], [13, 1], [14, 0], [15, 0], [15, 1], [16, 1], [17, 0], [18, 0], [18, 1], [19, 0], [19, 1], [20, 0], [20, 1], [21, 0], [22, 0], [22, 1], [23, 0], [24, 0], [25, 0], [25, 1], [26, 0], [26, 1], [27, 0], [27, 1], [28, 0], [29, 0], [29, 1], [30, 0], [31, 0], [32, 0], [33, 0], [33, 1], [34, 0], [35, 0], [35, 1], [36, 0], [37, 0], [37, 1], [38, 0], [38, 1], [39, 0], [39, 1], [40, 0], [41, 0], [41, 1], [42, 0], [42, 1], [43, 0], [43, 1], [44, 0], [45, 0], [45, 1], [46, 0], [47, 1], [48, 0], [48, 1], [49, 0], [49, 1], [50, 0], [51, 0], [51, 1], [52, 0], [52, 1], [53, 0], [53, 1], [54, 0], [54, 1], [55, 0], [55, 1], [56, 0], [56, 1], [57, 0], [57, 1], [58, 0], [58, 1], [59, 0], [60, 0], [61, 0], [61, 1], [62, 0], [62, 1], [63, 0], [64, 0], [64, 1], [65, 0], 