In [3]:
import argparse
import copy
import math
import os
from typing import Dict, List

import numpy as np
import torch
from inferred_stypes import dataset2inferred_stypes
from model import Model
from text_embedder import GloveTextEmbedding
from torch.nn import BCEWithLogitsLoss, L1Loss
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_geometric.data import HeteroData
from torch_geometric.loader import NeighborLoader
from torch_geometric.seed import seed_everything
from tqdm import tqdm

from relbench.data import NodeTask, RelBenchDataset
from relbench.data.task_base import TaskType
from relbench.datasets import get_dataset
from relbench.external.graph import get_node_train_table_input, make_pkey_fkey_graph

from torch_geometric.data import HeteroData
from torch_geometric.explain import Explainer, CaptumExplainer
import dgl

from dgl.nn import HeteroGNNExplainer

In [4]:
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, default="rel-stackex")
parser.add_argument("--task", type=str, default="rel-stackex-engage")
parser.add_argument("--lr", type=float, default=0.01)
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--batch_size", type=int, default=512)
parser.add_argument("--channels", type=int, default=128)
parser.add_argument("--aggr", type=str, default="sum")
parser.add_argument("--num_layers", type=int, default=2)
parser.add_argument("--num_neighbors", type=int, default=128)
parser.add_argument("--temporal_strategy", type=str, default="uniform")
parser.add_argument("--num_workers", type=int, default=1)
args = parser.parse_args("")


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed_everything(42)

root_dir = "./data"

# TODO: remove process=True once correct data/task is uploaded.
dataset: RelBenchDataset = get_dataset(name=args.dataset, process=True)
task: NodeTask = dataset.get_task(args.task, process=True)

making Database object from raw files...
done in 33.54 seconds.
reindexing pkeys and fkeys...
done in 2.66 seconds.


In [7]:
dataset.db

Database()

In [6]:
col_to_stype_dict = dataset2inferred_stypes[args.dataset]

data, col_stats_dict = make_pkey_fkey_graph(
    dataset.db,
    col_to_stype_dict=col_to_stype_dict,
    text_embedder_cfg=TextEmbedderConfig(
        text_embedder=GloveTextEmbedding(device=device), batch_size=256
    ),
    cache_dir=os.path.join(root_dir, f"{args.dataset}_materialized_cache"),
)



In [9]:
loader_dict: Dict[str, NeighborLoader] = {}
for split, table in [
    ("train", task.train_table),
    ("val", task.val_table),
    ("test", task.test_table),
]:
    table_input = get_node_train_table_input(table=table, task=task)
    entity_table = table_input.nodes[0]
    loader_dict[split] = NeighborLoader(
        data,
        num_neighbors=[
            int(args.num_neighbors / 2**i) for i in range(args.num_layers)
        ],
        time_attr="time",
        input_nodes=table_input.nodes,
        input_time=table_input.time,
        transform=table_input.transform,
        batch_size=args.batch_size,
        temporal_strategy=args.temporal_strategy,
        shuffle=split == "train",
        num_workers=args.num_workers,
        persistent_workers=args.num_workers > 0,
    )

In [6]:
edge_index_dict = {}
for edge_type in data.edge_types:
    edge_index_dict[edge_type] = data[edge_type].edge_index

#print("edge types:\n",data.edge_types)

x_dict = {}
for node_type in data.node_types:
    # Assuming data[node_type].tf.to_tensor() is the method to convert TensorFrame to a tensor.
    # Adjust this method based on the actual implementation of TensorFrame.
    x_dict[node_type] = data[node_type].tf.to_tensor() if hasattr(data[node_type].tf, 'to_tensor') else data[node_type].tf
#print(x_dict)
#print("node types:\n",data.node_types)

# if __name__ == '__main__':
#     # Your code that creates and starts processes should go here
#     # This ensures that multiprocessing is handled correctly
#     for batch in loader_dict["train"]:
#         batch = batch.to(device)
#         print("batch:\n", batch)


#print("task.train_table\n",task.train_table)
#print("task.entity_table\n",task.entity_table)

# print("table_input:",table_input)
# print("table_input.nodes:",table_input.nodes)
# print("table_input.nodes[0]:",table_input.nodes[0])


In [7]:
commentstf = data['comments'].tf
commentstf

TensorFrame(
  num_cols=2,
  num_rows=623967,
  timestamp (1): ['CreationDate'],
  embedding (1): ['Text'],
  has_target=False,
  device='cpu',
)

In [8]:
c_timestamp = list(commentstf.feat_dict.keys())[0]
c_emb = list(commentstf.feat_dict.keys())[1]
c_timetensors = commentstf.feat_dict[c_timestamp]
c_embeddingtensors = commentstf.feat_dict[c_emb].values
c_timetensors = c_timetensors.squeeze(1)  
comments_features = torch.cat([c_timetensors, c_embeddingtensors], dim=1)


In [9]:
userstf = data['users'].tf
u_numerical = list(userstf.feat_dict.keys())[0]
u_timestamp = list(userstf.feat_dict.keys())[1]
u_embedding = list(userstf.feat_dict.keys())[2]
u_numtensors = userstf.feat_dict[u_numerical]
u_timetensors = userstf.feat_dict[u_timestamp]
u_embtensors = userstf.feat_dict[u_embedding].values
users_features = torch.cat([u_numtensors,u_timetensors.squeeze(1),u_embtensors], dim=1)

In [10]:
badgestf = data['badges'].tf
b_timestamp = list(badgestf.feat_dict.keys())[1]
b_categorical = list(badgestf.feat_dict.keys())[0]
b_timetensors = badgestf.feat_dict[b_timestamp]
b_categoricaltensors = badgestf.feat_dict[b_categorical]
badges_features = torch.cat([b_categoricaltensors,b_timetensors.squeeze(1)], dim=1)

In [11]:
postLinkstf = data['postLinks'].tf
pl_timestamp = list(postLinkstf.feat_dict.keys())[1]
pl_numerical = list(postLinkstf.feat_dict.keys())[0]
pl_timetensors = postLinkstf.feat_dict[pl_timestamp]
pl_numericaltensors = postLinkstf.feat_dict[pl_numerical]
postLinks_features = torch.cat([pl_numericaltensors,pl_timetensors.squeeze(1)], dim=1)

In [12]:
postHistorytf = data['postHistory'].tf
ph_numerical = list(postHistorytf.feat_dict.keys())[0]
ph_categorical = list(postHistorytf.feat_dict.keys())[1]
ph_timestamp = list(postHistorytf.feat_dict.keys())[2]
ph_embedding = list(postHistorytf.feat_dict.keys())[3]
ph_numtensors = postHistorytf.feat_dict[ph_numerical]
ph_categoricaltensors = postHistorytf.feat_dict[ph_categorical]
ph_timetensors = postHistorytf.feat_dict[ph_timestamp]
ph_embtensors = postHistorytf.feat_dict[ph_embedding].values
postHistory_features = torch.cat([ph_numtensors,ph_categoricaltensors,ph_timetensors.squeeze(1),ph_embtensors], dim=1)

In [13]:
votestf = data['votes'].tf
v_numerical = list(votestf.feat_dict.keys())[0]
v_timestamp = list(votestf.feat_dict.keys())[1]
v_numtensors = votestf.feat_dict[v_numerical]
v_timetensors = votestf.feat_dict[v_timestamp]
votes_features = torch.cat([v_numtensors,v_timetensors.squeeze(1)], dim=1)

In [14]:
poststf = data['posts'].tf
p_numerical = list(poststf.feat_dict.keys())[0]
p_timestamp = list(poststf.feat_dict.keys())[1]
p_embedding = list(poststf.feat_dict.keys())[2]
p_numtensors = poststf.feat_dict[p_numerical]
p_timetensors = poststf.feat_dict[p_timestamp]
p_embtensors = poststf.feat_dict[p_embedding].values
posts_features = torch.cat([p_numtensors,p_timetensors.squeeze(1),p_embtensors], dim=1)

In [15]:
feature_dict = {
    'comments': comments_features,
    'badges': badges_features,
    'postLinks': postLinks_features,
    'postHistory': postHistory_features,
    'votes': votes_features,
    'users': users_features,
    'posts': posts_features  
}

In [16]:
feature_dict['users'].shape

torch.Size([255360, 308])

In [34]:
graph_data = {}

# For each edge type in the HeteroData
for (src_type, edge_type, dst_type), edge_data in data.edge_index_dict.items():
    src_nodes, dst_nodes = edge_data
    # Convert PyG edge index format to DGL format
    graph_data[(src_type, edge_type, dst_type)] = (src_nodes.numpy(), dst_nodes.numpy())

# Create the DGL heterograph
dgl_graph = dgl.heterograph(graph_data)
print("dgl_graph:\n",dgl_graph)
print("dgl_graph.nodes['users']:\n",dgl_graph.nodes['users'])

dgl_graph:
 Graph(num_nodes={'badges': 463463, 'comments': 623967, 'postHistory': 1175368, 'postLinks': 77337, 'posts': 333893, 'users': 255359, 'votes': 1317876},
      num_edges={('badges', 'f2p_UserId', 'users'): 463463, ('comments', 'f2p_PostId', 'posts'): 623962, ('comments', 'f2p_UserId', 'users'): 612288, ('postHistory', 'f2p_PostId', 'posts'): 1175368, ('postHistory', 'f2p_UserId', 'users'): 1100031, ('postLinks', 'f2p_PostId', 'posts'): 61171, ('postLinks', 'f2p_RelatedPostId', 'posts'): 75588, ('posts', 'f2p_AcceptedAnswerId', 'posts'): 57714, ('posts', 'f2p_OwnerUserId', 'users'): 328648, ('posts', 'f2p_ParentId', 'posts'): 167355, ('posts', 'p2f_AcceptedAnswerId', 'posts'): 57714, ('posts', 'p2f_ParentId', 'posts'): 167355, ('posts', 'p2f_PostId', 'comments'): 623962, ('posts', 'p2f_PostId', 'postHistory'): 1175368, ('posts', 'p2f_PostId', 'postLinks'): 61171, ('posts', 'p2f_PostId', 'votes'): 1199831, ('posts', 'p2f_RelatedPostId', 'postLinks'): 75588, ('users', 'p2f_Owner

In [35]:
graph_data

{('comments',
  'f2p_UserId',
  'users'): (array([     2,      5,      9, ..., 623964, 623965, 623966], dtype=int64), array([   957,    884,    957, ..., 249015, 126602,   1872], dtype=int64)),
 ('users',
  'p2f_UserId',
  'comments'): (array([     2,      4,      4, ..., 255334, 255347, 255351], dtype=int64), array([ 34710,     43,     57, ..., 623941, 623944, 623957], dtype=int64)),
 ('comments',
  'f2p_PostId',
  'posts'): (array([     0,      1,      2, ..., 623964, 623965, 623966], dtype=int64), array([     1,      1,      1, ..., 333879, 333879, 333437], dtype=int64)),
 ('posts',
  'p2f_PostId',
  'comments'): (array([     0,      0,      0, ..., 333881, 333881, 333883], dtype=int64), array([413710, 413713, 559727, ..., 623961, 623962, 623958], dtype=int64)),
 ('badges',
  'f2p_UserId',
  'users'): (array([     0,      1,      2, ..., 463460, 463461, 463462], dtype=int64), array([     4,      5,      7, ..., 253733, 255356, 255358], dtype=int64)),
 ('users',
  'p2f_UserId',
  'ba

In [185]:
model = torch.load("C:\\Users\\Shreya Reddy\\Downloads\\relbenchmain\\examples\\saved_model.pth",
                   map_location=torch.device('cpu'))
explainer = HeteroGNNExplainer(model, num_hops=1)

In [186]:
feat_mask, edge_mask = explainer.explain_graph(dgl_graph, feature_dict)

TypeError: forward() got an unexpected keyword argument 'graph'

In [189]:
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import HeteroGNNExplainer

In [217]:
class Model(nn.Module):
    def __init__(self, num_classes, canonical_etypes, ntype_feature_sizes):
        super(Model, self).__init__()
        # Initialize ModuleDict for edge type specific linear transformations
        self.etype_weights = nn.ModuleDict({
            '_'.join(c_etype): nn.Linear(ntype_feature_sizes[c_etype[0]], num_classes)
            for c_etype in canonical_etypes
        })

    def forward(self, graph, feat, eweight=None):
        feat = {ntype: f.float() for ntype, f in feat.items()}
        with graph.local_scope():
            c_etype_func_dict = {}
            for c_etype in graph.canonical_etypes:
                src_type, etype, dst_type = c_etype
                # Apply linear transformation based on edge type
                wh = self.etype_weights['_'.join(c_etype)](feat[src_type])
                graph.nodes[src_type].data[f'h_{c_etype}'] = wh
                
                # Define message and reduce functions based on whether edge weights are provided
                if eweight is None:
                    c_etype_func_dict[c_etype] = (fn.copy_u(f'h_{c_etype}', 'm'), fn.mean('m', 'h'))
                else:
                    graph.edges[c_etype].data['w'] = eweight[c_etype]
                    c_etype_func_dict[c_etype] = (fn.u_mul_e(f'h_{c_etype}', 'w', 'm'), fn.mean('m', 'h'))
            
            # Update all nodes based on the defined message and reduce functions
            graph.multi_update_all(c_etype_func_dict, 'sum')
            
            # Aggregate node features across all node types
            hg = 0
            for ntype in graph.ntypes:
                if graph.num_nodes(ntype):
                    hg += dgl.mean_nodes(graph, 'h', ntype=ntype)
            return hg

In [191]:
input_dim = 5
num_classes = 2
g = dgl.heterograph({
    ('user', 'plays', 'game'): ([0, 1, 1, 2], [0, 0, 1, 1])})
g.nodes['user'].data['h'] = th.randn(g.num_nodes('user'), input_dim)
g.nodes['game'].data['h'] = th.randn(g.num_nodes('game'), input_dim)

In [198]:
g

Graph(num_nodes={'game': 2, 'user': 3},
      num_edges={('game', 'rev_plays', 'user'): 4, ('user', 'plays', 'game'): 4},
      metagraph=[('game', 'user', 'rev_plays'), ('user', 'game', 'plays')])

In [199]:
dgl_graph

Graph(num_nodes={'badges': 463463, 'comments': 623967, 'postHistory': 1175368, 'postLinks': 77337, 'posts': 333893, 'users': 255359, 'votes': 1317876},
      num_edges={('badges', 'f2p_UserId', 'users'): 463463, ('comments', 'f2p_PostId', 'posts'): 623962, ('comments', 'f2p_UserId', 'users'): 612288, ('postHistory', 'f2p_PostId', 'posts'): 1175368, ('postHistory', 'f2p_UserId', 'users'): 1100031, ('postLinks', 'f2p_PostId', 'posts'): 61171, ('postLinks', 'f2p_RelatedPostId', 'posts'): 75588, ('posts', 'f2p_AcceptedAnswerId', 'posts'): 57714, ('posts', 'f2p_OwnerUserId', 'users'): 328648, ('posts', 'f2p_ParentId', 'posts'): 167355, ('posts', 'p2f_AcceptedAnswerId', 'posts'): 57714, ('posts', 'p2f_ParentId', 'posts'): 167355, ('posts', 'p2f_PostId', 'comments'): 623962, ('posts', 'p2f_PostId', 'postHistory'): 1175368, ('posts', 'p2f_PostId', 'postLinks'): 61171, ('posts', 'p2f_PostId', 'votes'): 1199831, ('posts', 'p2f_RelatedPostId', 'postLinks'): 75588, ('users', 'p2f_OwnerUserId', 'po

In [201]:
g.ndata['h']

{'game': tensor([[-0.1195,  0.2537, -0.3454, -0.6362, -0.4492],
         [ 1.8029,  1.0797,  1.1765,  1.3234, -0.6570]]),
 'user': tensor([[ 0.2359,  0.0094, -0.5826, -2.0033,  1.0364],
         [-0.4612, -1.1405, -0.7990,  0.0546, -0.3133],
         [ 0.6679,  0.7529, -1.2330,  1.2736,  0.2938]])}

In [36]:
feature_dict

{'comments': tensor([[ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -3.2116e-01,
          -1.0238e-01,  6.1823e-02],
         [ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -8.0727e-02,
          -3.2732e-03,  5.0768e-02],
         [ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -8.7929e-02,
           1.2871e-01, -1.6217e-01],
         ...,
         [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.8093e-01,
          -4.8019e-02, -1.4107e-01],
         [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.1969e-01,
           1.0899e-01, -7.9156e-03],
         [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.4080e-01,
          -1.0459e-01, -3.6677e-02]]),
 'badges': tensor([[   0,    0, 2010,  ...,   19,   39,    7],
         [   0,    0, 2010,  ...,   19,   39,    7],
         [   0,    0, 2010,  ...,   19,   39,    7],
         ...,
         [   0,    0, 2020,  ...,   22,   57,   30],
         [   0,    0, 2020,  ...,   23,   40,   12],
         [   0,    0, 2020,  ...,   23,   55,    3]])

In [192]:
transform = dgl.transforms.AddReverse()
g = transform(g)

In [213]:
transform = dgl.transforms.AddReverse()
dgl_graph_t = transform(dgl_graph)

In [193]:
# define and train the model
model = Model(input_dim, num_classes, g.canonical_etypes)
feat = g.ndata['h']
optimizer = th.optim.Adam(model.parameters())
for epoch in range(10):
    logits = model(g, feat)
    loss = F.cross_entropy(logits, th.tensor([1]))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [215]:
g

Graph(num_nodes={'game': 2, 'user': 3},
      num_edges={('game', 'rev_plays', 'user'): 4, ('user', 'plays', 'game'): 4},
      metagraph=[('game', 'user', 'rev_plays'), ('user', 'game', 'plays')])

In [216]:
g.ndata['h']

{'game': tensor([[-0.1195,  0.2537, -0.3454, -0.6362, -0.4492],
         [ 1.8029,  1.0797,  1.1765,  1.3234, -0.6570]]),
 'user': tensor([[ 0.2359,  0.0094, -0.5826, -2.0033,  1.0364],
         [-0.4612, -1.1405, -0.7990,  0.0546, -0.3133],
         [ 0.6679,  0.7529, -1.2330,  1.2736,  0.2938]])}

In [219]:
for ntype in feature_dict:
    feature_dict[ntype] = feature_dict[ntype].float()

In [220]:
# define and train the model
model = Model(2, dgl_graph_t.canonical_etypes, ntype_feature_sizes)
feat = feature_dict
# Trim the 'users' feature tensor
feat['users'] = feat['users'][:dgl_graph_t.num_nodes('users')]

optimizer = th.optim.Adam(model.parameters())


DGLError: Expect number of features to match number of nodes (len(u)). Got 255360 and 255359 instead.

In [232]:
th.tensor([1])

tensor([1])

In [230]:
for epoch in range(10):
    logits = model(dgl_graph_t, feat)
    loss = F.cross_entropy(logits, th.tensor([1]))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [194]:
# Explain for the graph
explainer = HeteroGNNExplainer(model, num_hops=1)
feat_mask, edge_mask = explainer.explain_graph(g, feat)


Explain graph: 100%|██████████| 100/100 [00:02<00:00, 42.82it/s]


In [241]:
explainer.explain_node('users', 2, dgl_graph_t, feat)

IndexError: too many indices for tensor of dimension 2

In [236]:
import gc
gc.collect()
torch.cuda.empty_cache()  # If using GPU
explainer = HeteroGNNExplainer(model, num_hops=1)
feat_mask, edge_mask = explainer.explain_graph(dgl_graph_t, feat)


Explain graph:   0%|          | 0/100 [19:09<?, ?it/s]
Explain graph:   0%|          | 0/100 [00:00<?, ?it/s]

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1212699376 bytes.

In [196]:
feat_mask

{'game': tensor([0.2701, 0.2751, 0.2891, 0.2709, 0.2584]),
 'user': tensor([0.2388, 0.2607, 0.2523, 0.2926, 0.2674])}

In [197]:
edge_mask

{('game', 'rev_plays', 'user'): tensor([0.2376, 0.9426, 0.0731, 0.0854]),
 ('user', 'plays', 'game'): tensor([0.9453, 0.7330, 0.7705, 0.9033])}

In [206]:
for n in data.node_types:
    print(n)
    print(feature_dict[n].shape)

comments
torch.Size([623967, 307])
badges
torch.Size([463463, 9])
postLinks
torch.Size([77337, 8])
postHistory
torch.Size([1175368, 309])
votes
torch.Size([1317876, 8])
users
torch.Size([255360, 308])
posts
torch.Size([333893, 908])


In [221]:
for ntype in dgl_graph_t.ntypes:
    print(f"Node type: {ntype}, Number of nodes: {dgl_graph_t.num_nodes(ntype)}")


Node type: badges, Number of nodes: 463463
Node type: comments, Number of nodes: 623967
Node type: postHistory, Number of nodes: 1175368
Node type: postLinks, Number of nodes: 77337
Node type: posts, Number of nodes: 333893
Node type: users, Number of nodes: 255359
Node type: votes, Number of nodes: 1317876


In [229]:
for ntype, feat_tensor in feat.items():
    print(f"Node type: {ntype}, Feature shape: {feat_tensor.shape}")


Node type: comments, Feature shape: torch.Size([623967, 307])
Node type: badges, Feature shape: torch.Size([463463, 9])
Node type: postLinks, Feature shape: torch.Size([77337, 8])
Node type: postHistory, Feature shape: torch.Size([1175368, 309])
Node type: votes, Feature shape: torch.Size([1317876, 8])
Node type: users, Feature shape: torch.Size([255359, 308])
Node type: posts, Feature shape: torch.Size([333893, 908])


In [228]:
# Trim the 'users' feature tensor
feat['users'] = feat['users'][:dgl_graph_t.num_nodes('users')]


In [240]:
# Assuming `feat` is a dictionary of features with keys as node types
# and you're looking for a 'user' node with a specific feature value
user_features = feat['users']  # Get features for 'user' nodes

# Hypothetical condition to identify the specific user node,
# e.g., the first feature is a unique ID, and you're looking for ID == some_unique_id
some_unique_id = 2  # The unique ID or feature you're looking for
node_ids = torch.where(user_features[:, 0] == some_unique_id)[0]  # Find node(s) with the ID

if len(node_ids) > 0:
    node_id = node_ids[0].item()  # Assuming the ID is unique and only one match is expected
    print(f"Node ID for 'user' with unique ID {some_unique_id}: {node_id}")
else:
    print(f"No 'user' node found with unique ID {some_unique_id}")


Node ID for 'user' with unique ID 2: 1


tensor([[-1.0000e+00,  2.0100e+03,  6.0000e+00,  ..., -4.6642e-02,
         -2.3646e-01,  3.8098e-02],
        [ 2.0000e+00,  2.0100e+03,  6.0000e+00,  ..., -1.3502e-01,
         -4.5602e-02, -2.6330e-02],
        [ 3.0000e+00,  2.0100e+03,  6.0000e+00,  ...,  5.9934e-02,
         -1.8164e-01, -1.0005e-01],
        ...,
        [ 2.0341e+07,  2.0200e+03,  1.1000e+01,  ...,  6.7718e-02,
         -2.2297e-01,  3.0119e-01],
        [ 1.7282e+07,  2.0200e+03,  1.1000e+01,  ...,  6.7718e-02,
         -2.2297e-01,  3.0119e-01],
        [ 2.0342e+07,  2.0200e+03,  1.1000e+01,  ...,  6.7718e-02,
         -2.2297e-01,  3.0119e-01]])

In [209]:
ntype_feature_sizes = {
    'comments': 307,
    'badges': 9,
    'postLinks': 8,
    'postHistory': 309,
    'votes': 8,
    'users': 308,
    'posts': 908,
}

In [242]:
data.edge_index_dict

{('comments',
  'f2p_UserId',
  'users'): tensor([[     2,      5,      9,  ..., 623964, 623965, 623966],
         [   957,    884,    957,  ..., 249015, 126602,   1872]]),
 ('users',
  'p2f_UserId',
  'comments'): tensor([[     2,      4,      4,  ..., 255334, 255347, 255351],
         [ 34710,     43,     57,  ..., 623941, 623944, 623957]]),
 ('comments',
  'f2p_PostId',
  'posts'): tensor([[     0,      1,      2,  ..., 623964, 623965, 623966],
         [     1,      1,      1,  ..., 333879, 333879, 333437]]),
 ('posts',
  'p2f_PostId',
  'comments'): tensor([[     0,      0,      0,  ..., 333881, 333881, 333883],
         [413710, 413713, 559727,  ..., 623961, 623962, 623958]]),
 ('badges',
  'f2p_UserId',
  'users'): tensor([[     0,      1,      2,  ..., 463460, 463461, 463462],
         [     4,      5,      7,  ..., 253733, 255356, 255358]]),
 ('users',
  'p2f_UserId',
  'badges'): tensor([[     1,      1,      1,  ..., 255356, 255356, 255358],
         [    46,   1505,   3590,

In [34]:
from torch_geometric.data import HeteroData
from torch_geometric.explain import Explainer, CaptumExplainer

#hetero_data = HeteroData(...)  # A heterogeneous graph data object.
model = torch.load("C:\\Users\\Shreya Reddy\\Downloads\\relbench\\examples\\saved_model.pth",
                   map_location=torch.device('cpu'))
explainer = Explainer(
    model,  # It is assumed that model outputs a single tensor.
    algorithm=CaptumExplainer('IntegratedGradients'),
    explanation_type='model',
    node_mask_type='attributes',
    edge_mask_type='object',
    model_config = dict(
        mode='binary_classification',
        task_level="node",
        return_type='probs',  # Model returns probabilities.
    ),
)



In [35]:
hetero_explanation = explainer(
    feature_dict,
    data.edge_index_dict,
    index=torch.tensor([2]),
)

batch:
 {'comments': tensor([[ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -3.2116e-01,
         -1.0238e-01,  6.1823e-02],
        [ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -8.0727e-02,
         -3.2732e-03,  5.0768e-02],
        [ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -8.7929e-02,
          1.2871e-01, -1.6217e-01],
        ...,
        [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.8093e-01,
         -4.8019e-02, -1.4107e-01],
        [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.1969e-01,
          1.0899e-01, -7.9156e-03],
        [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.4080e-01,
         -1.0459e-01, -3.6677e-02]]), 'badges': tensor([[   0,    0, 2010,  ...,   19,   39,    7],
        [   0,    0, 2010,  ...,   19,   39,    7],
        [   0,    0, 2010,  ...,   19,   39,    7],
        ...,
        [   0,    0, 2020,  ...,   22,   57,   30],
        [   0,    0, 2020,  ...,   23,   40,   12],
        [   0,    0, 2020,  ...,   23,   55,    3]]), 'postLink

NameError: name 'entity_tables' is not defined

In [259]:
# Generate batch-wise heterogeneous explanations for
# the nodes at index `1` and `3`:
hetero_explanation = explainer(
    feature_dict,
    data.edge_index_dict,
    index=torch.tensor([2]),
)
print(hetero_explanation.edge_mask_dict)
print(hetero_explanation.node_mask_dict)

TypeError: unhashable type: 'dict'

In [246]:
data.get_node_store('users')

{'tf': TensorFrame(
  num_cols=3,
  num_rows=255360,
  numerical (1): ['AccountId'],
  timestamp (1): ['CreationDate'],
  embedding (1): ['AboutMe'],
  has_target=False,
  device='cpu',
), 'time': tensor([1279522526, 1279548096, 1279553690,  ..., 1609454915, 1609457265,
        1609457383])}

In [250]:
feature_dict

{'comments': tensor([[ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -3.2116e-01,
          -1.0238e-01,  6.1823e-02],
         [ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -8.0727e-02,
          -3.2732e-03,  5.0768e-02],
         [ 2.0090e+03,  1.0000e+00,  1.0000e+00,  ..., -8.7929e-02,
           1.2871e-01, -1.6217e-01],
         ...,
         [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.8093e-01,
          -4.8019e-02, -1.4107e-01],
         [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.1969e-01,
           1.0899e-01, -7.9156e-03],
         [ 2.0200e+03,  1.1000e+01,  3.0000e+01,  ..., -1.4080e-01,
          -1.0459e-01, -3.6677e-02]]),
 'badges': tensor([[   0.,    0., 2010.,  ...,   19.,   39.,    7.],
         [   0.,    0., 2010.,  ...,   19.,   39.,    7.],
         [   0.,    0., 2010.,  ...,   19.,   39.,    7.],
         ...,
         [   0.,    0., 2020.,  ...,   22.,   57.,   30.],
         [   0.,    0., 2020.,  ...,   23.,   40.,   12.],
         [   0.,    0.,

In [17]:
import os.path as osp

import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HeteroConv, Linear, SAGEConv

class HeteroGNN(torch.nn.Module):
    def __init__(self, metadata, hidden_channels, out_channels, num_layers):
        super().__init__()

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                edge_type: SAGEConv((-1, -1), hidden_channels)
                for edge_type in metadata[1]
            })
            self.convs.append(conv)

        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)
            x_dict = {key: F.leaky_relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['users'])






In [18]:
model = HeteroGNN(data.metadata(), hidden_channels=64, out_channels=4,
                  num_layers=2).to(device)

# with torch.no_grad():  # Initialize lazy modules.
#     out = model(feature_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)

In [24]:
import torch

num_nodes = data['users'].num_nodes  # Assuming 'users' is your node type of interest
indices = torch.randperm(num_nodes)

# Example split ratios
train_ratio, val_ratio = 0.7, 0.15

train_size = int(num_nodes * train_ratio)
val_size = int(num_nodes * val_ratio)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[indices[:train_size]] = True
val_mask[indices[train_size:train_size+val_size]] = True
test_mask[indices[train_size+val_size:]] = True

# Assign masks to HeteroData
data['users'].train_mask = train_mask
data['users'].val_mask = val_mask
data['users'].test_mask = test_mask


In [29]:
data.node_items()

[('comments',
  {'tf': TensorFrame(
    num_cols=2,
    num_rows=623967,
    timestamp (1): ['CreationDate'],
    embedding (1): ['Text'],
    has_target=False,
    device='cpu',
  ), 'time': tensor([1233585919, 1233585964, 1233586032,  ..., 1609455955, 1609456624,
          1609456919])}),
 ('badges',
  {'tf': TensorFrame(
    num_cols=3,
    num_rows=463463,
    categorical (2): ['Class', 'TagBased'],
    timestamp (1): ['Date'],
    has_target=False,
    device='cpu',
  ), 'time': tensor([1279568347, 1279568347, 1279568347,  ..., 1609455450, 1609458012,
          1609458903])}),
 ('postLinks',
  {'tf': TensorFrame(
    num_cols=2,
    num_rows=77337,
    numerical (1): ['LinkTypeId'],
    timestamp (1): ['CreationDate'],
    has_target=False,
    device='cpu',
  ), 'time': tensor([1279723653, 1279902641, 1279962661,  ..., 1609443119, 1609449639,
          1609449924])}),
 ('postHistory',
  {'tf': TensorFrame(
    num_cols=4,
    num_rows=1175368,
    numerical (1): ['PostHistoryType

In [25]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(feature_dict, data.edge_index_dict)  # Ensure 'out' aligns with your model's expected inputs and outputs
    mask = data['users'].train_mask
    loss = F.cross_entropy(out[mask], data['users'].y[mask])  # Make sure 'data['users'].y' exists and is properly formatted
    loss.backward()
    optimizer.step()
    return float(loss)


255360

In [69]:

def train():
    model.train()
    optimizer.zero_grad()
    out = model(feature_dict, data.edge_index_dict)
    mask = data['users'].train_mask
    loss = F.cross_entropy(out[mask], data['users'].y[mask])
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test():
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)

    accs = []
    for split in ['train_mask', 'val_mask', 'test_mask']:
        mask = data['author'][split]
        acc = (pred[mask] == data['author'].y[mask]).sum() / mask.sum()
        accs.append(float(acc))
    return accs


for epoch in range(1, 10):
    loss = train()
    train_acc, val_acc, test_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, '
          f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')

In [1]:
if __name__ == '__main__':
    # Your code that creates and starts processes should go here
    # This ensures that multiprocessing is handled correctly
    for batch in loader_dict["train"]:
        batch = batch.to(device)
        print("batch:\n", batch)

NameError: name 'loader_dict' is not defined

In [79]:
n = len(node_feature_name)

In [89]:
for i in n:
    print(tf.feat_dict(node_feature_name[0]))

TypeError: 'int' object is not iterable

In [15]:



# Function to combine features from different semantic types
def combine_features(tensor_frame, semantic_types):
    feature_tensors = []
    for stype in semantic_types:
        if stype in tensor_frame.feat_dict:
            feature_tensors.append(tensor_frame.feat_dict[stype])
    if feature_tensors:
        return torch.cat(feature_tensors, dim=1)  # Concatenate along the feature axis
    else:
        return None



In [ ]:
# Semantic types you want to combine
semantic_types = [torch_frame.numerical, torch_frame.categorical]

# Iterate over each node type and add combined features to the DGL graph
for node_type in data.node_types:
    tensor_frame = data[node_type].tf
    features = combine_features(tensor_frame, semantic_types)
    if features is not None:
        dgl_graph.nodes[node_type].data['features'] = features

In [124]:
# homogeneous_data = data.to_homogeneous()
# print("homogeneous_data:\n",homogeneous_data)

graph_data = {}

# For each edge type in the HeteroData
for (src_type, edge_type, dst_type), edge_data in data.edge_index_dict.items():
    src_nodes, dst_nodes = edge_data
    # Convert PyG edge index format to DGL format
    graph_data[(src_type, edge_type, dst_type)] = (src_nodes.numpy(), dst_nodes.numpy())

# Create the DGL heterograph
dgl_graph = dgl.heterograph(graph_data)
print("dgl_graph:\n",dgl_graph)
print("dgl_graph.nodes['users']:\n",dgl_graph.nodes['users'])



dgl_graph:
 Graph(num_nodes={'badges': 463463, 'comments': 623967, 'postHistory': 1175368, 'postLinks': 77337, 'posts': 333893, 'users': 255359, 'votes': 1317876},
      num_edges={('badges', 'f2p_UserId', 'users'): 463463, ('comments', 'f2p_PostId', 'posts'): 623962, ('comments', 'f2p_UserId', 'users'): 612288, ('postHistory', 'f2p_PostId', 'posts'): 1175368, ('postHistory', 'f2p_UserId', 'users'): 1100031, ('postLinks', 'f2p_PostId', 'posts'): 61171, ('postLinks', 'f2p_RelatedPostId', 'posts'): 75588, ('posts', 'f2p_AcceptedAnswerId', 'posts'): 57714, ('posts', 'f2p_OwnerUserId', 'users'): 328648, ('posts', 'f2p_ParentId', 'posts'): 167355, ('posts', 'p2f_AcceptedAnswerId', 'posts'): 57714, ('posts', 'p2f_ParentId', 'posts'): 167355, ('posts', 'p2f_PostId', 'comments'): 623962, ('posts', 'p2f_PostId', 'postHistory'): 1175368, ('posts', 'p2f_PostId', 'postLinks'): 61171, ('posts', 'p2f_PostId', 'votes'): 1199831, ('posts', 'p2f_RelatedPostId', 'postLinks'): 75588, ('users', 'p2f_Owner

In [125]:

# model = torch.load("C:\\Users\\Shreya Reddy\\Downloads\\relbenchmain\\examples\\saved_model.pth",
#                    map_location=torch.device('cpu'))
explainer = HeteroGNNExplainer(model, num_hops=1)

In [126]:
feat_mask, edge_mask = explainer.explain_graph(dgl_graph, feature_dict)

TypeError: forward() got an unexpected keyword argument 'graph'

In [30]:
from dgl.data import MovieLensDataset
dataset = MovieLensDataset(name='ml-100k', valid_ratio=0.2)
g = dataset[0]
g

Downloading C:\Users\Shreya Reddy\.dgl\ml-100k.zip from https://data.dgl.ai/dataset/ml-100k.zip...


C:\Users\Shreya Reddy\.dgl\ml-100k.zip: 100%|██████████| 9.81M/9.81M [00:00<00:00, 29.6MB/s]


Extracting file to C:\Users\Shreya Reddy\.dgl
Starting processing ml-100k ...
End processing ml-100k ...


Graph(num_nodes={'movie': 1682, 'user': 943},
      num_edges={('movie', 'movie-user', 'user'): 100000, ('user', 'user-movie', 'movie'): 100000},
      metagraph=[('movie', 'user', 'movie-user'), ('user', 'movie', 'user-movie')])

In [31]:
g.nodes["user"].data['train_mask']

KeyError: 'train_mask'

In [32]:
train_mask = g.edges['user-movie'].data['train_mask']

In [33]:
import os.path as osp

import torch
import torch.nn.functional as F
from torch.nn import Linear

import torch_geometric.transforms as T
from torch_geometric.datasets import MovieLens
from torch_geometric.explain import CaptumExplainer, Explainer
from torch_geometric.nn import SAGEConv, to_hetero

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
path = osp.join(osp.dirname(osp.realpath(__file__)), '../../data/MovieLens')
dataset = MovieLens(path, model_name='all-MiniLM-L6-v2')
data = dataset[0].to(device)

NameError: name '__file__' is not defined

In [ ]:




path = osp.join(osp.dirname(osp.realpath(__file__)), '../../data/MovieLens')
dataset = MovieLens(path, model_name='all-MiniLM-L6-v2')
data = dataset[0].to(device)

# Add user node features for message passing:
data['user'].x = torch.eye(data['user'].num_nodes, device=device)
del data['user'].num_nodes

# Add a reverse ('movie', 'rev_rates', 'user') relation for message passing:
data = T.ToUndirected()(data)
data['user', 'movie'].edge_label = data['user',
                                        'movie'].edge_label.to(torch.float)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

# Perform a link-level split into training, validation, and test edges:
data, _, _ = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)


class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)


model = Model(hidden_channels=32).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(1, 10):
    model.train()
    optimizer.zero_grad()
    pred = model(
        data.x_dict,
        data.edge_index_dict,
        data['user', 'movie'].edge_label_index,
    )
    loss = F.mse_loss(pred, data['user', 'movie'].edge_label)
    loss.backward()
    optimizer.step()

explainer = Explainer(
    model=model,
    algorithm=CaptumExplainer('IntegratedGradients'),
    explanation_type='model',
    model_config=dict(
        mode='regression',
        task_level='edge',
        return_type='raw',
    ),
    node_mask_type='attributes',
    edge_mask_type='object',
    threshold_config=dict(
        threshold_type='topk',
        value=200,
    ),
)

index = torch.tensor([2, 10])  # Explain edge labels with index 2 and 10.
explanation = explainer(
    data.x_dict,
    data.edge_index_dict,
    index=index,
    edge_label_index=data['user', 'movie'].edge_label_index,
)
print(f'Generated explanations in {explanation.available_explanations}')

path = 'feature_importance.png'
explanation.visualize_feature_importance(path, top_k=10)
print(f"Feature importance plot has been saved to '{path}'")
