In [None]:
import pandas as pd
import torch
from torch_geometric.data import Data
import numpy as np

In [8]:
# Interaction table: tab-separated text whose first line holds the column names.
lnctard_dataset = pd.read_table("../data/filtered_lnctard.tsv", header=0)

# np.load needs allow_pickle=True to read this file; .item() then pulls out the
# single stored Python object, which later cells use as a
# name -> embedding mapping (via .get).
# NOTE(review): allow_pickle executes pickled code on load; only use with
# embedding files produced by a trusted pipeline.
biobert_embeddings_dict = np.load(
    file="../data/all_biobert_embeddings_128_dim.npy",
    allow_pickle=True,
).item()

In [9]:
# Map every regulator/target name to its declared type. Rows are visited in
# table order and the regulator is written before the target, so when a name
# occurs more than once the last occurrence determines its type.
name_type_dict = {}
for regulator, regulator_type, target, target_type in zip(
    lnctard_dataset["Regulator"],
    lnctard_dataset["RegulatorType"],
    lnctard_dataset["Target"],
    lnctard_dataset["TargetType"],
):
    name_type_dict[regulator] = regulator_type
    name_type_dict[target] = target_type

In [10]:
# Length of the zero vector that create_regulatory_graph substitutes for nodes
# that have no entry in the embeddings dict.
# NOTE(review): presumably must equal the dimensionality of the loaded BioBERT
# embeddings (the file name says 128) -- confirm if the embeddings file changes.
EMBEDDING_DIM: int = 128

In [13]:
def _one_hot_with_other(value, categories):
    """One-hot encode ``value`` over ``categories`` with a trailing "other" slot.

    The returned list has ``len(categories) + 1`` entries; the final entry is 1
    exactly when ``value`` matches none of the categories.
    """
    encoded = [1 if value == cat else 0 for cat in categories]
    encoded.append(0 if value in categories else 1)
    return encoded


def create_regulatory_graph(df, biobert_embeddings_dict, name_type_dict):
    """Build a PyTorch Geometric ``Data`` graph from a regulator/target table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per directed edge. The columns read are ``Regulator``,
        ``Target``, ``DiseaseName2``, ``RegulationDiretion`` (spelled as in the
        input table) and ``ExpressionPattern``.
    biobert_embeddings_dict : mapping
        Node name -> embedding vector. Names without an entry get an all-zero
        vector of length ``EMBEDDING_DIM``.
    name_type_dict : mapping
        Node name -> node type string. Names without an entry are treated as
        ``"unknown"`` and therefore land in the "other" type slot.

    Returns
    -------
    torch_geometric.data.Data
        * ``x``: per node, the L2-normalised embedding followed by a 7-wide
          type one-hot (6 known types + "other").
        * ``edge_index``: ``(2, len(df))`` long tensor, regulator -> target.
        * ``edge_attr``: per edge, 13 values: cancer type (3 + other),
          regulation direction (5 + other), expression pattern (2 + other).
        * ``node_names`` / ``node_to_idx``: name list and name -> row index.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the columns listed above.
    """
    type_categories = ["miRNA", "PCG", "TF", "lncRNA", "snoRNA", "circRNA"]
    cancer_categories = ["Lung cancer", "Brain glioma", "Pancreatic cancer"]
    regulation_categories = [
        "negatively-F",
        "negatively-E",
        "positively-E",
        "positively-F",
        "interact",
    ]
    expression_categories = ["upregulation", "downregulation"]

    # Index nodes in order of first appearance (regulators, then targets not
    # seen yet). Going through a set would make the indices depend on string
    # hash randomisation and differ between interpreter runs.
    node_to_idx = {}
    for column in ("Regulator", "Target"):
        for name in df[column].unique():
            node_to_idx.setdefault(name, len(node_to_idx))
    all_nodes = list(node_to_idx)

    embedding_rows = []
    type_rows = []
    for node in all_nodes:
        embedding = biobert_embeddings_dict.get(node)
        if embedding is None:
            embedding = np.zeros(EMBEDDING_DIM)
        norm = np.linalg.norm(embedding)
        # Only normalise non-zero vectors: dividing the zero fallback by its
        # zero norm would turn the whole feature row into NaN.
        if norm > 0:
            embedding = embedding / norm
        embedding_rows.append(embedding)
        type_rows.append(
            _one_hot_with_other(name_type_dict.get(node, "unknown"), type_categories)
        )

    node_features = torch.cat(
        [
            torch.tensor(np.array(embedding_rows), dtype=torch.float),
            torch.tensor(type_rows, dtype=torch.float),
        ],
        dim=1,
    )

    # Column-wise iteration replaces two separate iterrows() passes.
    edge_pairs = [
        [node_to_idx[regulator], node_to_idx[target]]
        for regulator, target in zip(df["Regulator"], df["Target"])
    ]
    edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()

    edge_rows = [
        _one_hot_with_other(cancer_type, cancer_categories)
        + _one_hot_with_other(regulation_dir, regulation_categories)
        + _one_hot_with_other(expression_pattern, expression_categories)
        for cancer_type, regulation_dir, expression_pattern in zip(
            df["DiseaseName2"], df["RegulationDiretion"], df["ExpressionPattern"]
        )
    ]
    edge_attr = torch.tensor(edge_rows, dtype=torch.float)

    data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)

    # Extra bookkeeping attributes, kept so existing consumers of the saved
    # graph continue to find them.
    data.node_names = all_nodes
    data.node_to_idx = node_to_idx
    data.num_nodes = len(all_nodes)
    data.num_edges = len(df)
    return data

In [14]:
# Assemble the full graph from the interaction table, the BioBERT embeddings
# and the name -> type lookup built in the earlier cells.
graph_data = create_regulatory_graph(
    df=lnctard_dataset,
    biobert_embeddings_dict=biobert_embeddings_dict,
    name_type_dict=name_type_dict,
)

In [15]:
# Report the graph's node/edge counts and tensor shapes, one item per line.
print(
    "Graph Information:",
    f"- Number of nodes: {graph_data.num_nodes}",
    f"- Number of edges: {graph_data.num_edges}",
    f"- Node feature dimensions: {graph_data.x.shape}",
    f"- Edge feature dimensions: {graph_data.edge_attr.shape}",
    f"- Edge index shape: {graph_data.edge_index.shape}",
    f"Node features shape: {graph_data.x.shape}",
    f"Edge index shape: {graph_data.edge_index.shape}",
    f"Edge attributes shape: {graph_data.edge_attr.shape}",
    sep="\n",
)

Graph Information:
- Number of nodes: 3908
- Number of edges: 8360
- Node feature dimensions: torch.Size([3908, 135])
- Edge feature dimensions: torch.Size([8360, 13])
- Edge index shape: torch.Size([2, 8360])
Node features shape: torch.Size([3908, 135])
Edge index shape: torch.Size([2, 8360])
Edge attributes shape: torch.Size([8360, 13])


In [16]:
# Persist the assembled graph object to disk.
torch.save(obj=graph_data, f="../data/graph_biobert_128_dim.pt")