In [16]:
import torch
import pandas as pd
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the file containing lncRNA–protein interactions
lnc_protein_df = pd.read_excel("data/cleaned_lncRNA_interactions.xlsx")

# Load the protein–protein interactions file
ppi_df = pd.read_excel("data/cleaned_PPI_interactions.xlsx")

In [4]:
# Extract raw sets of nodes
lnc_nodes_set = set(lnc_protein_df['node1'].unique())
protein_nodes_set = set(lnc_protein_df['node2'].unique()) | set(ppi_df['gene1'].unique()) | set(ppi_df['gene2'].unique())

# Build node → type mapping
node_types = {}
for node in lnc_nodes_set:
    node_types[node] = 'lncRNA'
for node in protein_nodes_set:
    node_types[node] = 'protein' 

# Final node list + type
all_nodes_df = pd.DataFrame(list(node_types.items()), columns=['node', 'type'])

# calculate degree using all interactions
all_node_occurrences = pd.concat([
    lnc_protein_df['node1'], lnc_protein_df['node2'],
    ppi_df['gene1'], ppi_df['gene2']
], ignore_index=True)

degree_counts = all_node_occurrences.value_counts().rename_axis('node').reset_index(name='degree')

# Merge
all_nodes_df = all_nodes_df.merge(degree_counts, on='node', how='left').fillna(0)
all_nodes_df['degree'] = all_nodes_df['degree'].astype(int)

print(all_nodes_df.head())
print(len(all_nodes_df))


              node    type  degree
0      SLC25A5-AS1  lncRNA       1
1  ENST00000609697  lncRNA       1
2          NMRAL2P  lncRNA       5
3       FBXL19-AS1  lncRNA      11
4    CCDC144NL-AS1  lncRNA       2
12854


In [5]:
# Count how many nodes of each type we have
node_type_counts = all_nodes_df['type'].value_counts()

# Print the result
print("Number of lncRNA nodes:", node_type_counts.get('lncRNA', 0))
print("Number of protein nodes:", node_type_counts.get('protein', 0))


Number of lncRNA nodes: 1269
Number of protein nodes: 11585


In [6]:
lnc_protein_df.head()

Unnamed: 0,node1,node2,regulatoryType,SearchregulatoryMechanism,diseaseCategory,regulatoryType.1,DiseaseName2,CancerLabel,node1Type
0,LINC00313,miR-4429,binding/interaction,ceRNA or sponge,Cancer,binding/interaction,Thyroid cancer,Cancer,lncRNA
1,FAM83H-AS1,CDKN1A,regulation,epigenetic regulation,Cancer,regulation,Brain glioma,Cancer,lncRNA
2,NEAT1,TGFB1,association,ceRNA or sponge,Cancer,association,Liver cancer,Cancer,lncRNA
3,NEAT1,ZEB1,regulation,ceRNA or sponge,Cancer,regulation,Breast cancer,Cancer,lncRNA
4,ZFPM2-AS1,MIF,binding/interaction,interact with protein,Cancer,binding/interaction,Gastric cancer,Cancer,lncRNA


In [7]:
# Start from the lncRNA–protein interaction DataFrame
# We'll extract the edges and attach edge features

# Select relevant columns only
lp_edges_df = lnc_protein_df[['node1', 'node2', 'SearchregulatoryMechanism', 'CancerLabel', 'DiseaseName2', 'regulatoryType.1']].copy()

# Rename the columns for clarity
lp_edges_df.columns = ['source', 'target', 'mechanism', 'cancer', 'disease', 'interaction']

# Convert 'CancerLabel' to binary: 1 if "Cancer", 0 otherwise
lp_edges_df['cancer'] = (lp_edges_df['cancer'] == 'Cancer').astype(int)

# Show some samples
print(lp_edges_df.head())


       source    target              mechanism  cancer         disease  \
0   LINC00313  miR-4429        ceRNA or sponge       1  Thyroid cancer   
1  FAM83H-AS1    CDKN1A  epigenetic regulation       1    Brain glioma   
2       NEAT1     TGFB1        ceRNA or sponge       1    Liver cancer   
3       NEAT1      ZEB1        ceRNA or sponge       1   Breast cancer   
4   ZFPM2-AS1       MIF  interact with protein       1  Gastric cancer   

           interaction  
0  binding/interaction  
1           regulation  
2          association  
3           regulation  
4  binding/interaction  


In [8]:
# Create the encoder and fit it to the 'mechanism' column
mechanism_encoder = LabelEncoder()
lp_edges_df['mechanism_encoded'] = mechanism_encoder.fit_transform(lp_edges_df['mechanism'])

# check how each mechanism was encoded
mechanism_mapping = dict(zip(mechanism_encoder.classes_, mechanism_encoder.transform(mechanism_encoder.classes_)))
print("Mechanism label mapping:", mechanism_mapping)

# Show updated DataFrame
print(lp_edges_df.head())


Mechanism label mapping: {'ceRNA or sponge': 0, 'chromatin looping': 1, 'epigenetic regulation': 2, 'expression association': 3, 'interact with mRNA': 4, 'interact with protein': 5, 'transcriptional regulation': 6}
       source    target              mechanism  cancer         disease  \
0   LINC00313  miR-4429        ceRNA or sponge       1  Thyroid cancer   
1  FAM83H-AS1    CDKN1A  epigenetic regulation       1    Brain glioma   
2       NEAT1     TGFB1        ceRNA or sponge       1    Liver cancer   
3       NEAT1      ZEB1        ceRNA or sponge       1   Breast cancer   
4   ZFPM2-AS1       MIF  interact with protein       1  Gastric cancer   

           interaction  mechanism_encoded  
0  binding/interaction                  0  
1           regulation                  2  
2          association                  0  
3           regulation                  0  
4  binding/interaction                  5  


In [14]:
# Create the encoder and fit it to the 'interaction' column
interaction_encoder = LabelEncoder()
lp_edges_df['interaction_encoded'] = interaction_encoder.fit_transform(lp_edges_df['interaction'])

# check how each mechanism was encoded
interaction_mapping = dict(zip(interaction_encoder.classes_, interaction_encoder.transform(interaction_encoder.classes_)))
print("interaction label mapping:", interaction_mapping)

# Show updated DataFrame
print(lp_edges_df.head())


interaction label mapping: {'association': 0, 'binding/interaction': 1, 'regulation': 2}
       source    target              mechanism  cancer         disease  \
0   LINC00313  miR-4429        ceRNA or sponge       1  Thyroid cancer   
1  FAM83H-AS1    CDKN1A  epigenetic regulation       1    Brain glioma   
2       NEAT1     TGFB1        ceRNA or sponge       1    Liver cancer   
3       NEAT1      ZEB1        ceRNA or sponge       1   Breast cancer   
4   ZFPM2-AS1       MIF  interact with protein       1  Gastric cancer   

           interaction  mechanism_encoded  disease_encoded  \
0  binding/interaction                  0              210   
1           regulation                  2               27   
2          association                  0              125   
3           regulation                  0               29   
4  binding/interaction                  5               84   

   interaction_encoded  
0                    1  
1                    2  
2                 

In [15]:
# Create the encoder and fit it to the 'disease' column
disease_encoder = LabelEncoder()
lp_edges_df['disease_encoded'] = mechanism_encoder.fit_transform(lp_edges_df['disease'])
print(lp_edges_df.head())

       source    target              mechanism  cancer         disease  \
0   LINC00313  miR-4429        ceRNA or sponge       1  Thyroid cancer   
1  FAM83H-AS1    CDKN1A  epigenetic regulation       1    Brain glioma   
2       NEAT1     TGFB1        ceRNA or sponge       1    Liver cancer   
3       NEAT1      ZEB1        ceRNA or sponge       1   Breast cancer   
4   ZFPM2-AS1       MIF  interact with protein       1  Gastric cancer   

           interaction  mechanism_encoded  disease_encoded  \
0  binding/interaction                  0              210   
1           regulation                  2               27   
2          association                  0              125   
3           regulation                  0               29   
4  binding/interaction                  5               84   

   interaction_encoded  
0                    1  
1                    2  
2                    0  
3                    2  
4                    1  


In [17]:
# Rename columns for consistency
ppi_df = ppi_df.rename(columns={
    'gene1': 'source',
    'gene2': 'target',
    'combined_score': 'score'
})

# Show the first few rows
print(ppi_df.head())


     source target  score
0     RAB5A   GGA1    794
1      ARF6   GGA1    819
2     BACE1   GGA1    979
3    CDKN2A   GGA1    746
4  TRAPPC6A  RAB1B    817


In [18]:
# Get list of all lncRNA node names
lnc_nodes = all_nodes_df[all_nodes_df['type'] == 'lncRNA']['node'].tolist()
print(len(lnc_nodes))
#print(lnc_nodes)

# Get list of all protein node names
protein_nodes = all_nodes_df[all_nodes_df['type'] == 'protein']['node'].tolist()
print(len(protein_nodes))

1269
11585


In [19]:
# Create node type DataFrames
lnc_df = pd.DataFrame({'node': lnc_nodes, 'type': 'lncRNA'})
protein_df = pd.DataFrame({'node': protein_nodes, 'type': 'protein'})

print(lnc_df.head())
print(len(lnc_df))
print(protein_df.head())
print(len(protein_df))

              node    type
0      SLC25A5-AS1  lncRNA
1  ENST00000609697  lncRNA
2          NMRAL2P  lncRNA
3       FBXL19-AS1  lncRNA
4    CCDC144NL-AS1  lncRNA
1269
        node     type
0     ACOT12  protein
1     MRPL21  protein
2     RPL10A  protein
3  LINC02218  protein
4      PTGES  protein
11585


In [21]:

#Build node name → index mapping
lncRNA_index = {row['node']: i for i, row in lnc_df.iterrows()}
protein_index = {row['node']: i for i, row in protein_df.iterrows()}

print(len(lncRNA_index))
print(lncRNA_index)

1269
{'SLC25A5-AS1': 0, 'ENST00000609697': 1, 'NMRAL2P': 2, 'FBXL19-AS1': 3, 'CCDC144NL-AS1': 4, 'FGF13-AS1': 5, 'lnc-bc060912': 6, 'HNRNPKP2': 7, 'AC003092.1': 8, 'LncRNA-HGBC': 9, 'PSMG3-AS1': 10, 'NBAT1': 11, 'LINC01426': 12, 'CBR3-AS1': 13, 'HIF1A-AS1': 14, 'OXCT1-AS1': 15, 'lncRNA-135528': 16, 'LINC01224': 17, 'HOXA10-AS': 18, 'B4GALT1-AS1': 19, 'BALR-6': 20, 'CALML3-AS1': 21, 'RP11-531A24.3': 22, 'LINC00997': 23, 'SMIM45': 24, 'MIR7-3HG': 25, 'SNHG17': 26, 'LOC285194': 27, 'LINC00261': 28, 'RP11-296A18.3': 29, 'COMETT': 30, 'lncRNACNN3-206': 31, 'NFIA-AS2': 32, 'TSPOAP1-AS1': 33, 'PXN-AS1': 34, 'lncATB': 35, 'PCAT19': 36, 'CCND2-AS1': 37, 'lncRNA-6195': 38, 'PINK1-AS': 39, 'PBB12': 40, 'Lnc-HZ08': 41, 'MCM3AP-AS1': 42, 'DUBR': 43, 'GATA2-AS1': 44, 'SLNCR1': 45, 'ZNF667-AS1': 46, 'Gm15290': 47, 'LINC00355': 48, 'RP11-436H11.5': 49, 'MFI2': 50, 'PCAT6': 51, 'C5orf66-AS1': 52, 'LINC00858': 53, 'SOD2-OT1': 54, 'RP5-857K21.7': 55, 'PURPL': 56, 'KRT19P3': 57, 'RSU1P2': 58, 'LINC00887':

In [22]:
# === 4. Create HeteroData object ===


# type tensor: 0 for lncRNA, 1 for protein
# shape: [num_nodes, 1]
type_tensor = torch.tensor((all_nodes_df['type'] == 'protein').astype(float).values, dtype=torch.float).unsqueeze(1)  

# degree tensor
degree_tensor = torch.tensor(all_nodes_df['degree'].values, dtype=torch.float).unsqueeze(1)

# Final node feature matrix
node_features = torch.cat([type_tensor, degree_tensor], dim=1)

print(len(node_features))
print(node_features)


12854
tensor([[ 0.,  1.],
        [ 0.,  1.],
        [ 0.,  5.],
        ...,
        [ 1., 16.],
        [ 1.,  2.],
        [ 1.,  2.]])


In [23]:
## add node features (type + degree)

data = HeteroData()
data['lncRNA'].x = node_features[all_nodes_df['type'] == 'lncRNA']
data['protein'].x = node_features[all_nodes_df['type'] == 'protein']

print(len(data['lncRNA'].x))
print(data['lncRNA'].x)

print(len(data['protein'].x))
print(data['protein'].x)

1269
tensor([[ 0.,  1.],
        [ 0.,  1.],
        [ 0.,  5.],
        ...,
        [ 0.,  5.],
        [ 0., 50.],
        [ 0.,  1.]])
11585
tensor([[ 1.,  2.],
        [ 1., 20.],
        [ 1., 46.],
        ...,
        [ 1., 16.],
        [ 1.,  2.],
        [ 1.,  2.]])


In [24]:
# Add lncRNA–protein edges 

lp_edge_index = torch.tensor([
    [lncRNA_index[src] for src in lp_edges_df['source']],
    [protein_index[tgt] for tgt in lp_edges_df['target']]
], dtype=torch.long)
data['lncRNA', 'interacts', 'protein'].edge_index = lp_edge_index

lp_edge_attr = torch.tensor(
    lp_edges_df[['mechanism_encoded', 'cancer', 'disease_encoded', 'interaction_encoded']].values,
    dtype=torch.float
)
data['lncRNA', 'interacts', 'protein'].edge_attr = lp_edge_attr

print(len(data['lncRNA', 'interacts', 'protein'].edge_attr))
print(data['lncRNA', 'interacts', 'protein'].edge_attr)


7635
tensor([[  0.,   1., 210.,   1.],
        [  2.,   1.,  27.,   2.],
        [  0.,   1., 125.,   0.],
        ...,
        [  0.,   1., 157.,   1.],
        [  3.,   1., 136.,   2.],
        [  3.,   1.,  84.,   1.]])


In [25]:

# === 6. Add protein–protein edges ===

ppi_df = ppi_df[ppi_df['source'] != ppi_df['target']]  # Remove self-loops
ppi_df = ppi_df[ppi_df['source'].isin(protein_index) & ppi_df['target'].isin(protein_index)]

ppi_edge_index = torch.tensor([
    [protein_index[src] for src in ppi_df['source']],
    [protein_index[tgt] for tgt in ppi_df['target']]
], dtype=torch.long)
data['protein', 'interacts', 'protein'].edge_index = ppi_edge_index

ppi_edge_attr = torch.tensor(ppi_df['score'].values, dtype=torch.float).unsqueeze(1)
data['protein', 'interacts', 'protein'].edge_attr = ppi_edge_attr

print(len(data['protein', 'interacts', 'protein'].edge_attr))
print(data['protein', 'interacts', 'protein'].edge_attr)


148992
tensor([[794.],
        [819.],
        [979.],
        ...,
        [826.],
        [766.],
        [829.]])


In [26]:
print(data['lncRNA', 'interacts', 'protein'].edge_index)
print(data['protein', 'interacts', 'protein'].edge_index)

tensor([[ 1134,   805,   942,  ...,   257,   330,   476],
        [ 7464, 10917,  5586,  ...,  5286,  8632, 10045]])
tensor([[ 5471,  7731,  2687,  ...,   132,  3290,  3162],
        [ 7670,  7670,  7670,  ...,  5606,  4503, 11478]])


In [28]:
# === Final check ===
print(data)

HeteroData(
  lncRNA={ x=[1269, 2] },
  protein={ x=[11585, 2] },
  (lncRNA, interacts, protein)={
    edge_index=[2, 7635],
    edge_attr=[7635, 4],
  },
  (protein, interacts, protein)={
    edge_index=[2, 148992],
    edge_attr=[148992, 1],
  }
)


In [29]:
# Choose where to save the file
save_path = "data/combined_dbs_heteroGraph.pt"

# Save the HeteroData object
torch.save(data, save_path)

print(f"HeteroData graph saved to: {save_path}")

HeteroData graph saved to: data/combined_dbs_heteroGraph.pt
