In [None]:
import os
import random
import sys

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset

In [4]:
# Make the project's `src` package importable by putting its parent directory
# on the Python path. The location can be overridden with the
# GRAPHTRANSFORMER_ROOT environment variable; the default is the original
# hard-coded path, so existing setups keep working unchanged.
PROJECT_ROOT = os.environ.get("GRAPHTRANSFORMER_ROOT", "/ranjan/graphtransformer/my_project")

# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [5]:
from src.data_processing import load_cora_data, partition_graph
from src.embedding import mean_pooling, compute_laplacian_positional_embedding, compute_gcn_embeddings
from src.transformer import GraphTransformer
from src.trainer import train_model, evaluate_model

# Step 1: Load the Cora dataset

In [15]:
# Fetch the Cora graph through the project's loader.
graph = load_cora_data()

# Summarise the graph's size (node, edge and per-node feature counts).
graph_summary = f"Graph Info:\nNodes: {graph.num_nodes}, Edges: {graph.num_edges}, Features: {graph.num_node_features}"
print(graph_summary)

# Bare trailing expression: the notebook renders the object's repr as the cell output.
graph

Dataset Loaded: Cora, Number of Graphs: 1
Graph Info:
Nodes: 2708, Edges: 10556, Features: 1433


Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [14]:
# Display the graph's training mask (bare expression -> shown as the cell output).
graph.train_mask

tensor([ True,  True,  True,  ..., False, False, False])

- `graph.train_mask` is a boolean tensor with one entry per node (2708 for Cora); `True` marks the nodes used for training.
- Together with `val_mask` and `test_mask`, it splits the nodes into training, validation, and test sets.
- Indexing with the mask (e.g. `graph.x[graph.train_mask]`) selects only the nodes designated for training.

# Step 2: Partition the graph into subgraphs

In [24]:

# Number of partitions to request; the original author's rationale is that
# more partitions give a larger training set.
num_parts = 10

# Split the full graph into `num_parts` pieces via the project helper.
cluster_data = partition_graph(graph, num_parts=num_parts)

# Print the summary of every partition, one per line.
for part in cluster_data:
    print(part)

Graph partitioned into 10 subgraphs.
Data(x=[277, 1433], y=[277], train_mask=[277], val_mask=[277], test_mask=[277], edge_index=[2, 1164])
Data(x=[270, 1433], y=[270], train_mask=[270], val_mask=[270], test_mask=[270], edge_index=[2, 866])
Data(x=[273, 1433], y=[273], train_mask=[273], val_mask=[273], test_mask=[273], edge_index=[2, 944])
Data(x=[262, 1433], y=[262], train_mask=[262], val_mask=[262], test_mask=[262], edge_index=[2, 870])
Data(x=[273, 1433], y=[273], train_mask=[273], val_mask=[273], test_mask=[273], edge_index=[2, 960])
Data(x=[274, 1433], y=[274], train_mask=[274], val_mask=[274], test_mask=[274], edge_index=[2, 1140])
Data(x=[262, 1433], y=[262], train_mask=[262], val_mask=[262], test_mask=[262], edge_index=[2, 740])
Data(x=[265, 1433], y=[265], train_mask=[265], val_mask=[265], test_mask=[265], edge_index=[2, 812])
Data(x=[277, 1433], y=[277], train_mask=[277], val_mask=[277], test_mask=[277], edge_index=[2, 980])
Data(x=[275, 1433], y=[275], train_mask=[275], val_m

Computing METIS partitioning...
Done!


# Step 3: Compute embeddings and track mask information

In [None]:
# Build one pooled embedding per partition and collect the matching labels,
# node counts and split masks.
#
# Results left in the namespace after this cell:
#   subgraph_embeddings - stacked mean-pooled GCN embeddings, one row per subgraph
#   lpe_embeddings      - stacked mean-pooled Laplacian positional embeddings
#   node_labels         - all subgraph label tensors concatenated along dim 0
#   num_nodes_list      - tensor holding each subgraph's node count
#   train_masks / val_masks / test_masks - per-subgraph masks, kept as Python lists

# Embedding hyperparameters, named once instead of repeated as literals.
HIDDEN_DIM = 64      # hidden_dim handed to compute_gcn_embeddings
EMBEDDING_DIM = 16   # output size requested from both the GCN and the LPE helpers

subgraph_embeddings = []
lpe_embeddings = []
node_labels = []
num_nodes_list = []
train_masks = []
val_masks = []
test_masks = []

for i, subgraph in enumerate(cluster_data):
    # Take the feature width from the data rather than hard-coding Cora's 1433,
    # so the cell keeps working if the dataset changes.
    feature_dim = subgraph.x.size(1)

    print("="*100)
    print(f"Subgraph {i} - Number of nodes: {subgraph.num_nodes}")
    print(f"Subgraph {i} - Feature vector size: {feature_dim}")

    # Node-level GCN embeddings for this subgraph.
    gcn_embeddings = compute_gcn_embeddings(
        subgraph,
        input_dim=feature_dim,
        hidden_dim=HIDDEN_DIM,
        output_dim=EMBEDDING_DIM,
    )

    # Laplacian positional embeddings for this subgraph.
    lpe = compute_laplacian_positional_embedding(subgraph, embedding_dim=EMBEDDING_DIM)

    # Collapse the node-level embeddings into a single subgraph-level vector.
    subgraph_embedding = mean_pooling(gcn_embeddings)

    subgraph_embeddings.append(subgraph_embedding)
    # Mean over dim 0 — assumes dim 0 of `lpe` is the node axis; TODO confirm
    # against compute_laplacian_positional_embedding.
    lpe_embeddings.append(lpe.mean(dim=0))
    node_labels.append(subgraph.y)
    num_nodes_list.append(subgraph.num_nodes)

    # Keep each subgraph's split masks so they can be matched to its nodes later.
    train_masks.append(subgraph.train_mask)
    val_masks.append(subgraph.val_mask)
    test_masks.append(subgraph.test_mask)

    # Debugging prints to check tensor sizes
    print(f"Subgraph {i} - GCN Embeddings Size: {gcn_embeddings.size()}")
    print(f"Subgraph {i} - LPE Size: {lpe.size()}")
    print(f"Subgraph {i} - Subgraph Embedding Size: {subgraph_embedding.size()}")
    print(f"Subgraph {i} - Node Labels Size: {subgraph.y.size()}")
    print("="*100)

# Turn the per-subgraph lists into tensors.
subgraph_embeddings = torch.stack(subgraph_embeddings)
lpe_embeddings = torch.stack(lpe_embeddings)
node_labels = torch.cat(node_labels, dim=0)
num_nodes_list = torch.tensor(num_nodes_list)

# Sanity check: the concatenated labels must cover every node exactly once.
assert node_labels.size(0) == int(num_nodes_list.sum()), "label count does not match total node count"


In [1]:
# NOTE(review): looks like a leftover kernel sanity check (its execution count
# is out of order with the cells above); nothing above depends on it — consider deleting.
1+1

2