In [None]:
import os
import warnings

import rdflib
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, OWL, RDFS
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, HeteroConv



In [2]:
# Directory that holds the Turtle exports. Set the MD2MV_TTL_DIR environment
# variable to run the notebook on another machine; the default is the location
# the notebook was originally written against.
TTL_DIR = os.environ.get(
    "MD2MV_TTL_DIR",
    r"C:\Users\yanpe\OneDrive - Metropolia Ammattikorkeakoulu Oy\Research\MD2MV\data\TTL",
)

# File paths (kept as plain strings)
ttl_ark = os.path.join(TTL_DIR, "01ARK", "ARK_MET.ttl")
ttl_sensor = os.path.join(TTL_DIR, "sensors_linked.ttl")

# 1. Load Data
# Both files are merged into a single rdflib graph.
g = rdflib.Graph()
files = [ttl_ark, ttl_sensor]

for file in files:
    try:
        g.parse(file, format="turtle")
        print(f"Loaded {file}")
    except Exception as e:
        # Best effort: report the failure and keep going with whatever loaded.
        print(f"Error loading {file}: {e}")

# Define Namespaces
BRICK = Namespace("https://brickschema.org/schema/Brick#")
BOT = Namespace("https://w3id.org/bot#")
INST = Namespace("https://lbd.example.com/")
PROPS = Namespace("http://lbd.arch.rwth-aachen.de/props#")

PREFIXES = {
    "brick": BRICK, "bot": BOT, "inst": INST, "rdfs": RDFS, "props": PROPS
}

# Register the prefixes on the graph.
for p, ns in PREFIXES.items():
    g.bind(p, ns)

Loaded C:\Users\yanpe\OneDrive - Metropolia Ammattikorkeakoulu Oy\Research\MD2MV\data\TTL\01ARK\ARK_MET.ttl
Loaded C:\Users\yanpe\OneDrive - Metropolia Ammattikorkeakoulu Oy\Research\MD2MV\data\TTL\sensors_linked.ttl


In [3]:
# Show the first five triples exactly as rdflib yields them
print("--- Raw Triples Sample ---")
for sample_no, (subj, pred, obj) in enumerate(g):
    if sample_no == 5:
        break
    print(f"{subj}  |  {pred}  |  {obj}")

# Distinct rdf:type objects present in the graph
unique_types = {
    str(type_uri) for _, _, type_uri in g.triples((None, RDF.type, None))
}
print(f"\n--- Found Node Types ({len(unique_types)}) ---")
print(unique_types)

# Distinct predicates used anywhere in the graph
unique_preds = {str(predicate) for _, predicate, _ in g}
print(f"\n--- Found Relations ({len(unique_preds)}) ---")
print(unique_preds)


--- Raw Triples Sample ---
https://lbd.example.com/wall_ae67b2bf-8316-4d92-a0e6-1ed84e5b3bb3  |  http://lbd.arch.rwth-aachen.de/props#globalIdIfcRoot_attribute_simple  |  2kPxA$WnPDag3c7jXEMpkp
https://lbd.example.com/wall_1798ccea-aa19-41f7-b372-ad96d543e26c  |  https://w3id.org/bot#hasSubElement  |  https://lbd.example.com/door_fee3f249-9668-4076-ae9d-60d0eae2725e
https://lbd.example.com/railing_81c21791-fb6b-40af-80ed-189e6e37faac  |  http://lbd.arch.rwth-aachen.de/props#objectTypeIfcObject_attribute_simple  |  Railing:KÃ¤sijohde d 30mm
https://lbd.example.com/wall_7c575b5f-b6bb-4516-801b-e3e04fadb854  |  http://www.w3.org/2002/07/owl#sameAs  |  https://lbd.example.com/IfcWallStandardCase_14633015
https://lbd.example.com/flowterminal_9c247209-b941-4d40-af98-a82fda27464b  |  http://lbd.arch.rwth-aachen.de/props#batid_attribute_simple  |  14626612

--- Found Node Types (10) ---
{'https://w3id.org/bot#Site', 'https://w3id.org/bot#Storey', 'https://w3id.org/bot#Building', 'https://brick

In [4]:
# 2. Map URIs to Types (The "Node Registry")
uri_to_type = {}   # URI string -> Type string (e.g. 'Space')
type_to_id = {}    # Type string -> {URI string: int_id}
id_to_uri = {}     # Type string -> {int_id: URI string}

# rdf:type objects that describe the schema rather than an instance node.
SCHEMA_TYPES = {'ObjectProperty', 'DatatypeProperty', 'Class'}

# Width of the random placeholder feature vector given to every node.
NODE_FEATURE_DIM = 16

# We iterate over all rdf:type triples to register valid nodes
for s, p, o in g.triples((None, RDF.type, None)):
    s_str = str(s)
    # Extract clean type name (e.g., "Space" from "https://w3id.org/bot#Space")
    type_label = str(o).split('#')[-1].split('/')[-1]

    if type_label in SCHEMA_TYPES:
        continue

    # A URI can carry several rdf:type statements. Register it under the first
    # one seen only, so uri_to_type, type_to_id and id_to_uri always agree and
    # no node type ends up with a feature row that can never receive an edge.
    if s_str in uri_to_type:
        continue

    uri_to_type[s_str] = type_label
    ids_for_type = type_to_id.setdefault(type_label, {})
    uris_for_type = id_to_uri.setdefault(type_label, {})

    new_id = len(ids_for_type)
    ids_for_type[s_str] = new_id
    uris_for_type[new_id] = s_str

print(f"Registered {len(uri_to_type)} nodes across {len(type_to_id)} types.")

# 3. Create HeteroData Object
data = HeteroData()

# Initialize node features from a dedicated, seeded generator so the random
# placeholder features are the same on every run.
feature_rng = torch.Generator().manual_seed(0)
for node_type, mapping in type_to_id.items():
    data[node_type].x = torch.randn(
        len(mapping), NODE_FEATURE_DIM, generator=feature_rng
    )

# 4. Build edges
# Edge lists are collected in a plain dict first and written to `data` in one
# pass afterwards.
temp_edge_dict = {}
edges_count = 0

for s, p, o in g:
    # rdf:type triples were consumed by the node registry above.
    if p == RDF.type:
        continue

    # A literal is never a node, even if its text happens to equal a node URI.
    if isinstance(o, Literal):
        continue

    src_type = uri_to_type.get(str(s))
    dst_type = uri_to_type.get(str(o))
    if src_type is None or dst_type is None:
        continue

    rel_name = str(p).split('#')[-1].split('/')[-1]
    edge_key = (src_type, rel_name, dst_type)

    src_ids, dst_ids = temp_edge_dict.setdefault(edge_key, [[], []])
    src_ids.append(type_to_id[src_type][str(s)])
    dst_ids.append(type_to_id[dst_type][str(o)])
    edges_count += 1

# Now move the edges from our temp dict into the data object
for edge_key, indices in temp_edge_dict.items():
    data[edge_key].edge_index = torch.tensor(indices, dtype=torch.long)

print("Graph Construction Complete.")
print(f"Nodes: {len(uri_to_type)}")
print(f"Edges: {edges_count}")

if edges_count > 0:
    print("Found Edge Types:", list(data.edge_index_dict.keys()))
else:
    print("Zero structural edges found. Check if URIs are shared between files.")

Registered 54966 nodes across 8 types.
Graph Construction Complete.
Nodes: 54966
Edges: 60592
Found Edge Types: [('Element', 'hasSubElement', 'Element'), ('Storey', 'containsElement', 'Element'), ('Storey', 'hasSpace', 'Space'), ('Space', 'containsElement', 'Element'), ('Room_Air_Temperature_Sensor', 'isPointOf', 'Space'), ('CO2_Sensor', 'isPointOf', 'Space'), ('Humidity_Sensor', 'isPointOf', 'Space'), ('Building', 'hasStorey', 'Storey'), ('Site', 'hasBuilding', 'Building')]


In [5]:
class ManualHeteroGNN(torch.nn.Module):
    """Two-layer heterogeneous GraphSAGE encoder that keeps every node type.

    Each layer is a ``HeteroConv`` holding one ``SAGEConv`` per edge type. A
    ``HeteroConv`` only produces an output for node types that are the
    destination of at least one edge type, so node types that only ever act as
    a source would disappear after the first layer. To avoid that, every node
    type also passes through its own linear skip projection in each layer; the
    projection is added to the convolution output when there is one and used on
    its own otherwise.

    Args:
        hidden_channels: Embedding width after the first layer.
        out_channels: Embedding width of the returned embeddings.
        metadata: ``(node_types, edge_types)`` as returned by
            ``HeteroData.metadata()``.
    """

    def __init__(self, hidden_channels, out_channels, metadata):
        super().__init__()
        node_types, edge_types = metadata[0], metadata[1]

        self.conv1 = HeteroConv({
            edge_type: SAGEConv((-1, -1), hidden_channels)
            for edge_type in edge_types
        }, aggr='sum')

        self.conv2 = HeteroConv({
            edge_type: SAGEConv((-1, -1), out_channels)
            for edge_type in edge_types
        }, aggr='sum')

        # Skip projections live in ModuleLists addressed through an index map,
        # which places no restriction on the characters in a node type name.
        self._skip_slot = {
            node_type: slot for slot, node_type in enumerate(node_types)
        }
        self.skip1 = nn.ModuleList(
            [nn.LazyLinear(hidden_channels) for _ in self._skip_slot]
        )
        self.skip2 = nn.ModuleList(
            [nn.LazyLinear(out_channels) for _ in self._skip_slot]
        )

    def _propagate(self, conv, skips, x_dict, edge_index_dict):
        """Run one HeteroConv layer and merge it with the skip projections."""
        conv_out = conv(x_dict, edge_index_dict)

        out_dict = {}
        for node_type, x in x_dict.items():
            # None / missing means the node type received no messages.
            messages = conv_out.get(node_type)
            slot = self._skip_slot.get(node_type)

            if slot is not None:
                projected = skips[slot](x)
                out_dict[node_type] = (
                    projected if messages is None else messages + projected
                )
            elif messages is not None:
                # Node type unknown at construction time: no projection exists,
                # so only the convolution output can be kept.
                out_dict[node_type] = messages
        return out_dict

    def forward(self, x_dict, edge_index_dict):
        """Return ``{node_type: embedding}`` for the node types in ``x_dict``."""
        hidden = self._propagate(self.conv1, self.skip1, x_dict, edge_index_dict)
        hidden = {node_type: h.relu() for node_type, h in hidden.items()}
        return self._propagate(self.conv2, self.skip2, hidden, edge_index_dict)


# Initialize the model with the data's metadata
# data.metadata() returns ([node_types], [edge_types])
model = ManualHeteroGNN(hidden_channels=32, out_channels=16, metadata=data.metadata())

# The layers above infer their input sizes lazily. Run one gradient-free
# forward pass so every parameter has a concrete shape before an optimizer is
# created from model.parameters().
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)



In [7]:
# 1. Define which relation we want to predict
target_edge = ('Room_Air_Temperature_Sensor', 'isPointOf', 'Space')

# 2. Setup the split
# num_val=0.1 holds out 10% of the target edges for validation and
# num_test=0.2 holds out 20% for testing; the remainder stays in the training
# graph (see the edge counts printed below).
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.2,
    key="edge_label",  # labels are read back later as `edge_label` / `edge_label_index`
    add_negative_train_samples=True,
    edge_types=[target_edge],  # We only split the target relation
    rev_edge_types=None,
)

train_data, val_data, test_data = transform(data)

print(f"Original edges: {data[target_edge].edge_index.size(1)}")
print(f"Train edges: {train_data[target_edge].edge_index.size(1)}")

Original edges: 827
Train edges: 580


In [None]:


class LinkPredictor(nn.Module):
    """MLP that turns a pair of endpoint embeddings into one link logit.

    Args:
        in_channels: Width of each endpoint embedding. The two embeddings are
            concatenated, so the first linear layer sees ``2 * in_channels``.
    """

    def __init__(self, in_channels):
        super().__init__()
        layers = [
            nn.Linear(in_channels * 2, in_channels),
            nn.ReLU(),
            nn.Linear(in_channels, 1),
        ]
        self.lin = nn.Sequential(*layers)

    def forward(self, z_src, z_dst):
        """Return the raw (pre-sigmoid) score with a trailing dimension of 1."""
        # Source and destination embeddings are joined along the feature axis.
        pair_features = torch.cat((z_src, z_dst), dim=-1)
        logits = self.lin(pair_features)
        return logits

# Build the link scorer and one Adam optimizer that updates both networks
predictor = LinkPredictor(in_channels=16)
trainable_params = [*model.parameters(), *predictor.parameters()]
optimizer = torch.optim.Adam(trainable_params, lr=0.01)

def train():
    """Run one full-batch optimisation step on the training split.

    Reads the module-level ``model``, ``predictor``, ``optimizer``,
    ``train_data`` and ``target_edge``.

    Returns:
        float: The binary cross-entropy loss of this step.
    """
    model.train()
    predictor.train()
    optimizer.zero_grad()

    # 1. Get embeddings using the TRAINING edges only
    try:
        z_dict = model(train_data.x_dict, train_data.edge_index_dict)
    except AttributeError as err:
        # Keep the best-effort fallback to raw features, but make it visible:
        # on this path the GNN contributes nothing and gets no gradient.
        warnings.warn(
            f"GNN message passing failed ({err}); falling back to raw node "
            "features, so the GNN weights are not trained in this step.",
            RuntimeWarning,
        )
        z_dict = train_data.x_dict

    # 2. Supervision pairs (positives and sampled negatives) with their labels
    edge_label_index = train_data[target_edge].edge_label_index
    labels = train_data[target_edge].edge_label.float()

    # 3. Predict
    src_type, _, dst_type = target_edge
    z_src = z_dict[src_type][edge_label_index[0]]
    z_dst = z_dict[dst_type][edge_label_index[1]]

    # Drop only the trailing size-1 dimension so a batch holding a single pair
    # keeps shape [1] and still lines up with `labels`.
    predictions = predictor(z_src, z_dst).squeeze(-1)

    loss = F.binary_cross_entropy_with_logits(predictions, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
@torch.no_grad()
def find_missing_relations(target_edge_type, top_k=10):
    """Print the best-scoring source/destination pairs that are not linked yet.

    Every (source, destination) pair of the relation is scored with the
    module-level ``predictor`` applied to the embeddings that ``model``
    produces for the full graph ``data``. This is the same scoring function
    that ``train`` optimises. Pairs already present in
    ``data[target_edge_type].edge_index`` are excluded.

    Args:
        target_edge_type: ``(src_type, relation, dst_type)`` key of the
            relation to complete.
        top_k: Maximum number of candidate links to print.

    Raises:
        KeyError: If the encoder returned no embedding for the source or the
            destination node type.
    """
    model.eval()
    predictor.eval()
    src_type, _, dst_type = target_edge_type

    # 1. Get trained embeddings
    embeddings = model(data.x_dict, data.edge_index_dict)
    for node_type in (src_type, dst_type):
        if node_type not in embeddings:
            raise KeyError(
                f"The encoder returned no embedding for node type {node_type!r}"
            )
    src_emb = embeddings[src_type]
    dst_emb = embeddings[dst_type]
    num_src, num_dst = src_emb.size(0), dst_emb.size(0)

    # 2. Score all sources against all destinations.
    # score_matrix[i, j] = predicted probability of a link src[i] -> dst[j].
    # Sources are processed in chunks so the pair tensor stays bounded.
    max_pairs_per_chunk = 1_000_000
    rows_per_chunk = max(1, max_pairs_per_chunk // max(num_dst, 1))
    score_matrix = src_emb.new_empty((num_src, num_dst))
    for start in range(0, num_src, rows_per_chunk):
        src_chunk = src_emb[start:start + rows_per_chunk]
        n_rows = src_chunk.size(0)
        z_src = src_chunk.unsqueeze(1).expand(n_rows, num_dst, -1)
        z_dst = dst_emb.unsqueeze(0).expand(n_rows, num_dst, -1)
        logits = predictor(z_src, z_dst).squeeze(-1)
        score_matrix[start:start + n_rows] = logits.sigmoid()

    # 3. Exclude existing edges. -inf (not 0) guarantees they rank below every
    # real candidate and can be recognised afterwards.
    existing_edges = data[target_edge_type].edge_index
    score_matrix[existing_edges[0], existing_edges[1]] = float('-inf')

    # 4. Find highest scores; never ask topk for more entries than exist.
    k = max(0, min(top_k, score_matrix.numel()))
    values, indices = torch.topk(score_matrix.flatten(), k)

    print(f"\nTop {top_k} predicted NEW relations for {target_edge_type}:")
    for score, flat_idx in zip(values.tolist(), indices.tolist()):
        if score == float('-inf'):
            # topk is sorted, so everything from here on is an existing edge.
            break

        # Convert flat index back to (row, col) as plain ints, because the
        # id_to_uri dictionaries are keyed by Python ints.
        row_idx, col_idx = divmod(flat_idx, num_dst)

        src_uri = id_to_uri[src_type][row_idx]
        dst_uri = id_to_uri[dst_type][col_idx]

        print(f"Score: {score:.4f} | {src_uri} -- should be connected to --> {dst_uri}")

# Execute for the relation selected when the link split was configured
find_missing_relations(target_edge)