# Fraud Detection using Graph Neural Networks (GNN)

This notebook implements an end-to-end **Fraud Detection** pipeline using **Graph Neural Networks**.

Contents:
- Synthetic graph dataset creation (users, devices, transactions, merchants)
- Graph construction and conversion to PyTorch Geometric Data
- GNN model (GraphSAGE) implementation and training
- Evaluation (ROC AUC, precision at a 0.5 probability threshold)
- Visualizations and saving models

In [1]:
!pip install -q pandas numpy scikit-learn torch torchvision torchaudio
!pip install -q torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-<YOUR_TORCH_VERSION>.html || true
!pip install -q scikit-learn matplotlib networkx pyvis joblib

The system cannot find the file specified.
'true' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
import random
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import joblib
import os

import torch
from torch_geometric.data import Data


ModuleNotFoundError: No module named 'torch_geometric'

## 1) Create a synthetic heterogeneous graph
We'll simulate a small graph containing Users, Devices, Transactions, and Merchants. Fraud labels will be assigned to some transactions.


In [None]:
def create_synthetic_graph(num_users=500, num_devices=300, num_merchants=100, num_transactions=2000, fraud_ratio=0.05, seed=42):
    """Simulate users, devices, merchants and a table of labelled transactions.

    Every transaction picks one user, one device and one merchant uniformly at
    random, an exponentially distributed amount (scale 50) and an integer
    timestamp. Labels are assigned in two passes:

    1. a ``fraud_ratio`` fraction of rows is sampled and marked as fraud;
    2. roughly 2% of devices and 2% of merchants (at least one of each) are
       flagged as suspicious, and each transaction touching one of them is
       additionally marked as fraud with probability 0.3 (device) or
       0.25 (merchant).

    Args:
        num_users, num_devices, num_merchants: number of entities of each kind.
        num_transactions: number of transaction rows to generate.
        fraud_ratio: fraction of transactions labelled as fraud in the first pass.
        seed: seeds both ``random`` and ``np.random`` (global state) and the
            row sampling, so repeated calls return identical data.

    Returns:
        ``(users, devices, merchants, tx_df)`` where the first three are lists of
        string ids (``U*``, ``D*``, ``M*``) and ``tx_df`` has the columns
        ``tx_id, user, device, merchant, amount, time, label``.
    """
    random.seed(seed)
    np.random.seed(seed)

    users = [f'U{i}' for i in range(num_users)]
    devices = [f'D{i}' for i in range(num_devices)]
    merchants = [f'M{i}' for i in range(num_merchants)]

    # One row per transaction; each row references a user, a device and a merchant.
    tx_rows = []
    for t in range(num_transactions):
        user = random.choice(users)
        device = random.choice(devices)
        merchant = random.choice(merchants)
        amount = round(float(np.random.exponential(scale=50.0)), 2)
        timestamp = np.random.randint(1_600_000_000, 1_700_000_000)
        tx_rows.append({'tx_id': f'T{t}', 'user': user, 'device': device, 'merchant': merchant, 'amount': amount, 'time': timestamp})

    # Naming the columns keeps the schema intact even when no rows were generated.
    tx_df = pd.DataFrame(tx_rows, columns=['tx_id', 'user', 'device', 'merchant', 'amount', 'time'])
    tx_df['label'] = 0

    # Choose the suspicious devices and merchants (drawn before the row sampling
    # so the global RNG stream is consumed in a fixed order).
    suspicious_devices = set(np.random.choice(devices, size=max(1, int(0.02 * len(devices))), replace=False))
    suspicious_merchants = set(np.random.choice(merchants, size=max(1, int(0.02 * len(merchants))), replace=False))

    # Pass 1: baseline fraud, sampled with its own random_state.
    if not tx_df.empty:
        baseline_fraud = tx_df.sample(frac=fraud_ratio, random_state=seed).index
        tx_df.loc[baseline_fraud, 'label'] = 1

    # Pass 2: extra fraud on suspicious devices/merchants. A random number is
    # drawn only when the membership test succeeds (short-circuit `and`).
    for idx, dev, mer in zip(tx_df.index, tx_df['device'], tx_df['merchant']):
        if dev in suspicious_devices and np.random.rand() < 0.3:
            tx_df.at[idx, 'label'] = 1
        if mer in suspicious_merchants and np.random.rand() < 0.25:
            tx_df.at[idx, 'label'] = 1

    return users, devices, merchants, tx_df

# Generate the dataset with the default sizes and preview the first transaction rows.
users, devices, merchants, tx_df = create_synthetic_graph()
tx_df.head()

## 2) Build a graph and convert to PyTorch Geometric `Data` object
We'll build a bipartite/heterogeneous-style graph by creating nodes for users, devices, merchants and transactions and edges between them.


In [None]:
def build_graph(users, devices, merchants, tx_df):
    """Build a single PyTorch Geometric ``Data`` graph from entities and transactions.

    Nodes are laid out as ``users + devices + merchants + transactions``. Every
    transaction is linked to its user, its device and its merchant.

    Args:
        users, devices, merchants: lists of node ids.
        tx_df: DataFrame with the columns ``tx_id``, ``user``, ``device``,
            ``merchant`` and ``label``.

    Returns:
        ``Data`` with
        - ``x``: float tensor ``[num_nodes, 5]`` - one-hot node type
          (user, device, merchant, transaction) followed by the node degree;
        - ``edge_index``: long tensor ``[2, 2 * num_links]`` - each link stored
          in both directions (entity->transaction first, then the reversed copies);
        - ``y``: long tensor ``[num_nodes]`` - the transaction label for
          transaction nodes, ``-1`` for every other node;
        - ``node_list`` / ``node_index``: id list and id -> position mapping.

    Raises:
        ValueError: if an id occurs more than once across the four groups.
        KeyError: if a transaction references an unknown user/device/merchant.
    """
    tx_ids = tx_df['tx_id'].tolist()
    node_list = users + devices + merchants + tx_ids
    node_index = {name: i for i, name in enumerate(node_list)}
    if len(node_index) != len(node_list):
        raise ValueError('node ids must be unique across users, devices, merchants and transactions')
    N = len(node_list)

    # Links in per-transaction order: user - tx, device - tx, merchant - tx.
    edges = []
    for u, d, m, t in zip(tx_df['user'], tx_df['device'], tx_df['merchant'], tx_ids):
        t_idx = node_index[t]
        edges.extend(((node_index[u], t_idx), (node_index[d], t_idx), (node_index[m], t_idx)))
    # reshape keeps a well-formed (0, 2) array when there are no links at all.
    edge_array = np.asarray(edges, dtype=np.int64).reshape(-1, 2)

    # Message passing follows edge direction, so store the reverse of every link
    # as well; otherwise entity nodes would never receive transaction messages.
    forward = torch.from_numpy(edge_array).t()
    edge_index = torch.cat([forward, forward.flip(0)], dim=1).contiguous()

    # Node type one-hot, assigned by position in node_list rather than by id prefix.
    feat_type = np.zeros((N, 4), dtype=float)
    start = 0
    for col, group in enumerate((users, devices, merchants, tx_ids)):
        feat_type[start:start + len(group), col] = 1.0
        start += len(group)

    # Degree = number of links touching the node, counted directly from the edge list.
    degrees = np.bincount(edge_array.ravel(), minlength=N).astype(float).reshape(-1, 1)
    x = torch.tensor(np.hstack([feat_type, degrees]), dtype=torch.float)

    # Labels: only transactions carry one; they occupy the tail of node_list.
    y = torch.full((N,), -1, dtype=torch.long)
    tx_start = N - len(tx_ids)
    y[tx_start:] = torch.as_tensor(tx_df['label'].to_numpy(dtype=np.int64))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.node_list = node_list
    data.node_index = node_index
    return data

data = build_graph(users, devices, merchants, tx_df)
print('Nodes:', data.x.shape[0])
print('Edges:', data.edge_index.shape)
# Spot-check: the label stored on each of the first five transaction nodes
for tx_name in tx_df['tx_id'].head(5):
    print(tx_name, data.y[data.node_index[tx_name]].item())

## 3) Prepare train/validation/test masks for node classification (transactions only)
We will create masks that only include transaction nodes for training / validation / test.


In [None]:
def create_masks(data, tx_ids, train_frac=0.7, val_frac=0.15, seed=42):
    """Build class-stratified train/val/test boolean masks over transaction nodes.

    Nodes labelled 1 and nodes labelled 0 are shuffled and split separately
    (positives first), so each split keeps roughly the same class ratio.
    Transactions with any other label, and all nodes not named in ``tx_ids``,
    stay False in every mask. Seeds the global ``np.random`` state with ``seed``.

    Returns:
        ``(train_mask, val_mask, test_mask)``, each a bool tensor of length
        ``data.num_nodes``.
    """
    np.random.seed(seed)
    tx_indices = [data.node_index[tx] for tx in tx_ids]
    labels = data.y[tx_indices].numpy()
    positions = np.array(tx_indices)

    def three_way_split(candidates):
        # Shuffle a private copy, then cut it at the train and train+val boundaries.
        shuffled = np.array(candidates)
        np.random.shuffle(shuffled)
        total = len(shuffled)
        train_end = int(train_frac * total)
        val_end = train_end + int(val_frac * total)
        return (
            shuffled[:train_end].tolist(),
            shuffled[train_end:val_end].tolist(),
            shuffled[val_end:].tolist(),
        )

    def as_mask(members):
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[members] = True
        return mask

    # Positives are split before negatives so the RNG is consumed in a fixed order.
    fraud_parts = three_way_split(positions[labels == 1].tolist())
    legit_parts = three_way_split(positions[labels == 0].tolist())

    return tuple(as_mask(fraud + legit) for fraud, legit in zip(fraud_parts, legit_parts))

# Split the transaction nodes and report how many landed in each partition.
train_mask, val_mask, test_mask = create_masks(data, tx_df['tx_id'].tolist())
n_train, n_val, n_test = (m.sum().item() for m in (train_mask, val_mask, test_mask))
print('Train txn:', n_train, 'Val txn:', n_val, 'Test txn:', n_test)

## 4) Define a GNN model (GraphSAGE)
We'll define a simple GraphSAGE model for node classification. If torch_geometric isn't available in your environment, install it as shown above.


In [None]:
from torch.nn import Linear
import torch.nn.functional as F

# SAGEConv is the only torch_geometric layer this cell needs. A failed import is
# reported instead of raised, so the class definition below still executes;
# instantiating GraphSAGE afterwards fails because the name SAGEConv is undefined.
try:
    from torch_geometric.nn import SAGEConv
except Exception as e:
    print('torch_geometric not available in this environment. The model definition remains visible but will fail if executed without torch-geometric installed.')

class GraphSAGE(torch.nn.Module):
    """Stack of ``SAGEConv`` layers producing per-node class logits.

    Layer widths are ``in_channels -> hidden_channels -> ... -> out_channels``
    with ``num_layers`` convolutions in total; a single-layer model maps
    ``in_channels`` straight to ``out_channels``.

    Args:
        in_channels: size of each input node feature vector.
        hidden_channels: width of every intermediate layer.
        out_channels: number of output logits per node.
        num_layers: number of SAGEConv layers (must be >= 1).
        dropout: dropout probability applied after each non-final layer.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2, dropout=0.2):
        super().__init__()
        if num_layers < 1:
            raise ValueError('num_layers must be >= 1')
        self.dropout = dropout
        # Consecutive pairs of this list are the (input, output) sizes of each layer.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            [SAGEConv(src, dst) for src, dst in zip(widths[:-1], widths[1:])]
        )
        # Not used by forward(); kept so state_dict keys match checkpoints saved earlier.
        self.lin = Linear(out_channels, out_channels)

    def forward(self, x, edge_index):
        """Return raw (un-normalised) logits for every node."""
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        # No activation or dropout after the final layer.
        return self.convs[-1](x, edge_index)

# Seed torch so weight initialisation (and later dropout) is repeatable across runs.
torch.manual_seed(42)

# instantiate (will fail if SAGEConv is missing)
in_ch = data.num_node_features
model = GraphSAGE(in_channels=in_ch, hidden_channels=64, out_channels=2, num_layers=3)
print(model)


## 5) Training loop
We'll train only on transaction nodes (`train_mask`). The loss is computed only on the labeled nodes selected by the train mask.


In [None]:
from sklearn.metrics import roc_auc_score, precision_score

# Use a GPU when one is visible, then move the model, the graph and all masks to it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model, data = model.to(device), data.to(device)
train_mask, val_mask, test_mask = (m.to(device) for m in (train_mask, val_mask, test_mask))

# Adam with a small L2 penalty (weight_decay).
optimizer = torch.optim.Adam(model.parameters(), weight_decay=5e-4, lr=0.01)

def train_epoch():
    """Run one full-graph optimisation step and return the training loss as a float."""
    model.train()
    optimizer.zero_grad()
    logits = model(data.x, data.edge_index)
    targets = data.y[train_mask].to(torch.long)
    # The forward pass covers every node; the loss only sees those in train_mask.
    loss = F.cross_entropy(logits[train_mask], targets)
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(mask):
    """Score the model on the nodes selected by ``mask``.

    Args:
        mask: boolean tensor over all nodes.

    Returns:
        dict with
        - ``'auc'``: ROC AUC of the class-1 probability, or ``None`` when the mask
          is empty or the selected labels contain a single class (AUC undefined);
        - ``'precision'``: precision of predictions thresholded at probability 0.5
          (0 when nothing is predicted positive), or ``None`` for an empty mask.
    """
    model.eval()
    out = model(data.x, data.edge_index)
    selected = mask.cpu().numpy()
    if not selected.any():
        return {'auc': None, 'precision': None}

    probs = F.softmax(out, dim=1)[:, 1].cpu().numpy()[selected]
    labels = data.y.cpu().numpy()[selected]

    preds = (probs > 0.5).astype(int)
    prec = precision_score(labels, preds, zero_division=0)

    # roc_auc_score raises when only one class is present; report None instead.
    auc = roc_auc_score(labels, probs) if np.unique(labels).size > 1 else None
    return {'auc': auc, 'precision': prec}

# training loop (small number of epochs for demo)
checkpoint_path = 'models/gnn_best.pth'
# The checkpoint directory must exist before the first torch.save call.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

best_val = float('-inf')
best_epoch = None  # stays None if no epoch produced a usable validation AUC
for epoch in range(1, 31):
    loss = train_epoch()
    val_metrics = evaluate(val_mask)
    val_auc = val_metrics['auc']
    if val_auc is not None and val_auc > best_val:
        best_val = val_auc
        best_epoch = epoch
        # save best model
        torch.save(model.state_dict(), checkpoint_path)
    if epoch % 5 == 0:
        print(f'Epoch {epoch:02d} loss={loss:.4f} val_auc={val_metrics["auc"]} val_prec={val_metrics.get("precision")}')

# load best and evaluate on test
if best_epoch is not None:
    # map_location keeps the load working when the checkpoint was written on another device.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
    # Never load a stale file from a previous run; fall back to the last-epoch weights.
    print('No checkpoint was saved (validation AUC unavailable); evaluating last-epoch weights.')
print('Test metrics:', evaluate(test_mask))

## 6) Visualize a small subgraph and suspicious nodes
We'll plot the 2-hop neighborhood around one fraudulent transaction (and the users, devices, and merchants it connects to) for inspection.


In [None]:
import matplotlib.pyplot as plt

# pick a suspicious transaction (label=1); only transaction nodes carry labels >= 0
fraud_positions = (data.y == 1).nonzero(as_tuple=True)[0].tolist()
fraud_tx = [data.node_list[i] for i in fraud_positions if data.node_list[i].startswith('T')]
if len(fraud_tx) > 0:
    center = fraud_tx[0]
    center_idx = data.node_index[center]
    # build the networkx graph once from the edge list, then take the 2-hop neighborhood
    G = nx.Graph()
    G.add_edges_from(data.edge_index.t().tolist())
    sub = nx.ego_graph(G, center_idx, radius=2)
    # fixed seed so the layout is identical on every run
    pos = nx.spring_layout(sub, seed=42)
    # label nodes with their ids (U*/D*/M*/T*) and highlight the center transaction
    names = {i: data.node_list[i] for i in sub.nodes}
    colors = ['tab:red' if i == center_idx else 'tab:blue' for i in sub.nodes]
    plt.figure(figsize=(8,6))
    nx.draw(sub, pos, labels=names, with_labels=True, node_size=100, node_color=colors, font_size=7)
    plt.title(f'Neighborhood around {center}')
    plt.show()
else:
    print('No fraud transactions found in sample')

## Save README and requirements
We'll save a README.md and requirements.txt for GitHub upload.


In [None]:
readme = '''# Fraud Detection using Graph Neural Networks (GNN)

This repository contains an end-to-end notebook and supporting files for fraud detection using GNNs.

Contents:
- `notebooks/fraud_gnn.ipynb` - main notebook
- `models/` - saved model weights
- `data/` - (optional) transaction csv files

How to run:
1. Install dependencies (torch, torch-geometric, scikit-learn, networkx)
2. Open and run the notebook cells in order

Notes:
- The notebook uses a synthetic dataset for demonstration. Replace with real data for production.
- Adjust torch-geometric installation to match your CUDA/PyTorch version.
'''
reqs = 'pandas\nnumpy\nscikit-learn\ntorch\ntorch-geometric\nnetworkx\nmatplotlib\npyvis\njoblib\n'

os.makedirs('models', exist_ok=True)
# Explicit UTF-8 so the written files do not depend on the platform's default encoding.
for filename, content in (('README.md', readme), ('requirements.txt', reqs)):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
print('Saved README.md and requirements.txt')