# GNN Training and Encoding

* Train a GNN based on enriched features in an unsupervised fashion, and use the resulting model to encode the input features.

## Load Data

In [8]:
# Root directory that holds one sub-folder of processed CSV files per site
site_input_dir = "processed_data"
# Site sub-folder to work on; joined with site_input_dir to locate the input files
site_name = "HCBHSGSG_Bank_9"

In [9]:
import os
import pandas as pd

# Splits to load; each split has a "<split>_normalized.csv" feature table and a
# "<split>_edgemap.csv" edge list inside the site folder.
dataset_names = ["train", "test"]

# Columns removed from the feature table right after loading
unused_columns = [
    "Currency_Country",
    "Beneficiary_BIC",
    "Currency",
    "Receiver_BIC",
    "Sender_BIC",
]
# Names given to the two columns of the header-less edge map
edge_columns = ["UETR_1", "UETR_2"]

site_dir = os.path.join(site_input_dir, site_name)

df_feats = {}  # split name -> feature/label table
df_edges = {}  # split name -> edge map table
for ds_name in dataset_names:
    # Feature table: the first CSV column is consumed as the DataFrame index
    feat_path = os.path.join(site_dir, f"{ds_name}_normalized.csv")
    feat_table = pd.read_csv(feat_path, index_col=0)
    df_feats[ds_name] = feat_table.drop(columns=unused_columns)

    # Edge map: the file has no header row, so name the columns explicitly
    edge_path = os.path.join(site_dir, f"{ds_name}_edgemap.csv")
    edge_map = pd.read_csv(edge_path, header=None)
    edge_map.columns = edge_columns
    df_edges[ds_name] = edge_map

## Prepare Data for Unsupervised GNN Training

In [10]:
import numpy as np
import torch

# Per-split containers, keyed by dataset name
node_ids = {}  # Transaction_ID values, in node-index order
node_features = {}  # float tensor, one row of features per node
edge_indices = {}  # long tensor of shape (2, num_edges)
weights = {}  # float tensor of ones, one entry per edge
labels = {}  # Fraud_Label values, in node-index order

for ds_name in dataset_names:
    df_feat_class = df_feats[ds_name]
    df_edge = df_edges[ds_name]

    # Sort by Transaction_ID so that the row position (= node index) is deterministic
    df_feat_class = df_feat_class.sort_values(by="Transaction_ID").reset_index(
        drop=True
    )

    # Build the Transaction_ID -> node index lookup from the sorted feature table
    node_id = df_feat_class["Transaction_ID"].values
    map_id = {j: i for i, j in enumerate(node_id)}
    node_ids[ds_name] = node_id

    # Class labels, aligned with the node order
    labels[ds_name] = df_feat_class["Fraud_Label"].values

    # Translate both edge endpoints from transaction IDs to node indexes
    edges = df_edge.copy()
    edges.UETR_1 = edges.UETR_1.map(map_id)
    edges.UETR_2 = edges.UETR_2.map(map_id)

    # IDs that are absent from the feature table map to NaN. Fail here with an
    # explicit message instead of an opaque integer-cast error further down.
    n_unmapped = int(edges.isna().any(axis=1).sum())
    if n_unmapped:
        raise ValueError(
            f"{n_unmapped} edge(s) in the '{ds_name}' edge map reference a "
            "Transaction_ID that is missing from the feature table"
        )
    edges = edges.astype(int)

    # Edge list as a (2, num_edges) tensor. Edges are used exactly as listed in
    # the edge map; no reverse edges are added here.
    edge_index = torch.tensor(edges.to_numpy().T, dtype=torch.long).contiguous()
    edge_indices[ds_name] = edge_index
    # Uniform edge weights
    weights[ds_name] = torch.ones(edge_index.shape[1], dtype=torch.float)

    # Node features: everything except the ID and the label
    node_feature = df_feat_class.drop(columns=["Transaction_ID", "Fraud_Label"])
    node_features[ds_name] = torch.tensor(node_feature.to_numpy(), dtype=torch.float)

## Unsupervised GNN Training

In [11]:
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch_geometric.data import Data
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import GraphSAGE

# Training artifacts (TensorBoard events, model weights) go next to the site data
output_dir = os.path.join(site_input_dir, site_name)
# Prefer the first GPU, but fall back to CPU so the notebook also runs without CUDA
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
writer = SummaryWriter(output_dir)
epochs = 100

# Converting data to PyG graph data format
train_data = Data(
    x=node_features["train"],
    edge_index=edge_indices["train"],
    edge_attr=weights["train"],
)

# Mini-batch loader for link-level training: samples up to 10 neighbors per hop
# over 2 hops, with negative sampling at ratio 1.0 (per the PyG loader docs, one
# negative edge per positive edge).
loader = LinkNeighborLoader(
    train_data,
    batch_size=2048,
    shuffle=True,
    neg_sampling_ratio=1.0,
    num_neighbors=[10, 10],
    num_workers=6,
    persistent_workers=True,
)

# Two-layer GraphSAGE encoder producing 64-dimensional node embeddings
model = GraphSAGE(
    in_channels=node_features["train"].shape[1],
    hidden_channels=64,
    num_layers=2,
    out_channels=64,
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model.to(DEVICE)

for epoch in range(1, epochs + 1):
    model.train()
    running_loss = 0.0
    instance_count = 0

    for data in loader:
        data = data.to(DEVICE)
        optimizer.zero_grad()

        # Embed the sampled sub-graph, then score each candidate link by the
        # inner product of its two endpoint embeddings.
        h = model(data.x, data.edge_index)
        h_src = h[data.edge_label_index[0]]
        h_dst = h[data.edge_label_index[1]]
        link_pred = (h_src * h_dst).sum(dim=-1)

        # Link prediction loss against the loader-provided edge labels
        loss = F.binary_cross_entropy_with_logits(link_pred, data.edge_label)
        loss.backward()
        optimizer.step()

        # Accumulate a per-link weighted sum so the epoch loss is a true mean
        running_loss += loss.item() * link_pred.numel()
        instance_count += link_pred.numel()

    epoch_loss = running_loss / instance_count
    print(f"Epoch: {epoch:02d}, Loss: {epoch_loss:.4f}")
    writer.add_scalar("train_loss", epoch_loss, epoch)

# Flush pending TensorBoard events and release the event file
writer.close()

# Save the model
torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))

Epoch: 01, Loss: 0.8859
Epoch: 02, Loss: 0.8394
Epoch: 03, Loss: 0.8093
Epoch: 04, Loss: 0.7704
Epoch: 05, Loss: 0.7573
Epoch: 06, Loss: 0.7322
Epoch: 07, Loss: 0.7152
Epoch: 08, Loss: 0.7016
Epoch: 09, Loss: 0.6863
Epoch: 10, Loss: 0.6735
Epoch: 11, Loss: 0.6598
Epoch: 12, Loss: 0.6509
Epoch: 13, Loss: 0.6446
Epoch: 14, Loss: 0.6461
Epoch: 15, Loss: 0.6378
Epoch: 16, Loss: 0.6130
Epoch: 17, Loss: 0.6196
Epoch: 18, Loss: 0.6024
Epoch: 19, Loss: 0.6176
Epoch: 20, Loss: 0.6044
Epoch: 21, Loss: 0.5929
Epoch: 22, Loss: 0.5969
Epoch: 23, Loss: 0.5806
Epoch: 24, Loss: 0.5899
Epoch: 25, Loss: 0.5810
Epoch: 26, Loss: 0.5851
Epoch: 27, Loss: 0.5784
Epoch: 28, Loss: 0.5713
Epoch: 29, Loss: 0.5690
Epoch: 30, Loss: 0.5515
Epoch: 31, Loss: 0.5616
Epoch: 32, Loss: 0.5537
Epoch: 33, Loss: 0.5436
Epoch: 34, Loss: 0.5464
Epoch: 35, Loss: 0.5499
Epoch: 36, Loss: 0.5505
Epoch: 37, Loss: 0.5258
Epoch: 38, Loss: 0.5378
Epoch: 39, Loss: 0.5466
Epoch: 40, Loss: 0.5467
Epoch: 41, Loss: 0.5337
Epoch: 42, Loss:

## GNN Inference - Encoding the Raw Features

In [12]:
# Rebuild the encoder with the same architecture used for training and load the
# saved weights. map_location="cpu" lets a checkpoint saved from a CUDA model be
# loaded on a machine without a GPU; inference below runs on CPU tensors.
model_enc = GraphSAGE(
    in_channels=node_features["train"].shape[1],
    hidden_channels=64,
    num_layers=2,
    out_channels=64,
)
model_enc.load_state_dict(
    torch.load(os.path.join(output_dir, "model.pt"), map_location="cpu")
)
model_enc.eval()

embeds = {}  # split name -> DataFrame of IDs, labels and embedding columns
for ds_name in dataset_names:
    # Encoding only: skip autograd bookkeeping for the full-graph forward pass
    with torch.no_grad():
        h = model_enc(node_features[ds_name], edge_indices[ds_name])

    # Embedding columns are named V_0, V_1, ..., one per output channel
    embed = pd.DataFrame(
        h.cpu().numpy(), columns=[f"V_{i}" for i in range(h.shape[1])]
    )
    # Put the transaction ID and the fraud label in front of the embedding columns
    embed.insert(0, "Fraud_Label", labels[ds_name])
    embed.insert(0, "Transaction_ID", node_ids[ds_name])

    embed.to_csv(os.path.join(output_dir, f"{ds_name}_embedding.csv"), index=False)
    embeds[ds_name] = embed

In [13]:
# NOTE(review): this absolute path does not match the site_input_dir / site_name
# used by the cells above, and the recorded output reports that it does not exist.
# Looks like a leftover check from another run -- confirm whether listing
# output_dir was intended.
! find /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1

find: ‘/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1’: No such file or directory


In [14]:
# Display the training-split table of IDs, labels and embedding columns
embeds["train"]

Unnamed: 0,Transaction_ID,Fraud_Label,V_0,V_1,V_2,V_3,V_4,V_5,V_6,V_7,...,V_54,V_55,V_56,V_57,V_58,V_59,V_60,V_61,V_62,V_63
0,TXN_1,0.0,-0.028973,0.095657,0.042164,-0.011008,-0.065842,-0.024214,0.073006,0.140930,...,-0.148318,-0.164059,0.088292,0.007325,0.109578,-0.013932,0.011894,-0.012820,-0.050226,0.067718
1,TXN_100,0.0,0.172293,0.218145,0.120312,0.113256,0.063575,0.061711,-0.140634,0.029781,...,0.051324,-0.066613,-0.078133,0.026027,0.058151,-0.202513,0.125230,0.080344,-0.114718,-0.106416
2,TXN_10022,1.0,0.056977,0.017150,-0.160322,0.013378,-0.102321,0.040907,-0.089257,0.049358,...,0.023719,-0.004594,0.050215,0.016388,-0.050697,-0.083995,-0.025159,0.032874,-0.084087,-0.010891
3,TXN_10027,1.0,0.071584,0.148986,0.015382,0.058536,-0.016526,0.004357,-0.053840,0.091114,...,-0.051510,-0.128119,0.043044,0.021109,0.078473,-0.125906,0.064768,0.029336,-0.104389,-0.016299
4,TXN_10037,0.0,-0.088335,0.076789,0.092709,-0.042632,-0.015114,-0.073617,0.066298,0.121103,...,-0.143355,-0.201269,0.098042,-0.018394,0.132004,0.011132,0.019389,0.014127,-0.039008,0.062403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3596,TXN_9960,0.0,-0.081266,-0.110061,-0.198649,-0.092774,-0.231194,0.002604,0.090064,0.100205,...,-0.081326,-0.006626,0.115972,0.003366,-0.035140,0.102030,-0.125337,-0.050900,0.006397,0.112276
3597,TXN_9973,0.0,0.112650,-0.041571,-0.183561,0.030091,0.007039,0.064889,-0.172339,-0.059009,...,0.127273,0.065834,-0.000823,0.029885,-0.103677,-0.138047,-0.001729,0.083217,-0.074434,-0.104596
3598,TXN_9974,1.0,0.006942,-0.093850,-0.252976,-0.048655,-0.151257,0.024363,-0.125361,0.012513,...,0.087925,0.035664,0.059005,-0.025692,-0.121029,-0.041755,-0.081888,0.028827,-0.066285,-0.018904
3599,TXN_9977,0.0,-0.020019,0.088668,0.056534,-0.000961,-0.134506,-0.014854,0.146039,0.166312,...,-0.201459,-0.158649,0.093627,-0.001041,0.099505,0.031917,0.034907,-0.094791,-0.001101,0.115109


Let's go back to the [XGBoost Notebook](../xgboost.ipynb)