In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.transforms import RandomLinkSplit

In [3]:
# Load your merged dataframe
df = pd.read_excel("data/merged_lnctard_STring.xlsx")
df.head()

Unnamed: 0,node1,node2,interaction,disease,source
0,LINC00313,miR-4429,binding/interaction,Papillary thyroid carcinoma,lncRNA
1,FAM83H-AS1,CDKN1A,regulation,Malignant glioma,lncRNA
2,NEAT1,TGFB1,association,Hepatocellular carcinoma,lncRNA
3,NEAT1,ZEB1,regulation,Breast cancer,lncRNA
4,ZFPM2-AS1,MIF,binding/interaction,Gastric cancer,lncRNA


In [4]:
# Extract edges
edges = df[['node1', 'node2']]

# Build full list of unique nodes
all_nodes = pd.concat([edges['node1'], edges['node2']]).unique()

# Apply label encoding to all nodes (convert gene names to integer IDs)
le = LabelEncoder()
le.fit(all_nodes)

# Map node1 and node2 to encoded integers
source = le.transform(edges['node1'])
target = le.transform(edges['node2'])

# Build edge_index tensor for PyG
edge_index = torch.tensor([source, target], dtype=torch.long)
num_nodes = len(le.classes_)

# --------------------------------
# Build Degree Feature (Feature 1)
# --------------------------------
node_degrees = np.zeros(num_nodes)
for s, t in zip(source, target):
    node_degrees[s] += 1
    node_degrees[t] += 1

degree_feature = node_degrees.reshape(-1, 1)

# ------------------------------------
# Build Node Type Feature (Feature 2)
# ------------------------------------

# Build dictionary of {node: type} from 'source' column in the file
# 'source' column has the type info: 'lncRNA' or 'protein'
node_type_dict = dict(zip(df['node1'], df['source']))

# Build type feature for all encoded nodes
node_types = []
for gene in le.classes_:
    type_str = node_type_dict.get(gene, "PPI")  # default to protein if not found
    if type_str == 'lncRNA':
        node_types.append(1)
    else:
        node_types.append(0)

node_types = np.array(node_types).reshape(-1, 1)

# ----------------------
# Combine both features
# ----------------------
x = np.concatenate([degree_feature, node_types], axis=1)
x = torch.tensor(x, dtype=torch.float)

# --------------------------
# Build the PyG Data object
# --------------------------
data = Data(edge_index=edge_index, x=x, num_nodes=num_nodes)



# Apply random link split for train/val/test
transform = RandomLinkSplit(is_undirected=True, add_negative_train_samples=True)
train_data, val_data, test_data = transform(data)

# Sanity check
print("Node features shape:", train_data.x.shape)
# Check basic info
print(f"Train edges: {train_data.edge_index.shape[1]}")
print(f"Validation edges: {val_data.edge_index.shape[1]}")
print(f"Test edges: {test_data.edge_index.shape[1]}")
print(f"Node features shape: {train_data.x.shape}")


  edge_index = torch.tensor([source, target], dtype=torch.long)


Node features shape: torch.Size([20232, 2])
Train edges: 610064
Validation edges: 610064
Test edges: 697214
Node features shape: torch.Size([20232, 2])


In [5]:
class GraphSAGE(torch.nn.Module):
    
    # in_channels should contains the number of node features
    def __init__(self, in_channels, hidden_channels):
        super(GraphSAGE, self).__init__()
        
        ## the first two layers
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        
        ## output layer for link prediction which gives a final score for each edge
        self.lin = torch.nn.Linear(hidden_channels, 1) 

        
    # return node embedding for each node.
    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)   ## RELU activation
        x = self.conv2(x, edge_index)
        return x

    
    # return probability of edge existance between each node pair
    def decode(self, z, edge_label_index):
        src = z[edge_label_index[0]]
        dst = z[edge_label_index[1]]
        return torch.sum(src * dst, dim=-1)  # dot product similarity

In [6]:
train_edge_index = train_data.edge_index
val_edge_index = val_data.edge_index
test_edge_index = test_data.edge_index

In [8]:
# Device Assignment + Model Initialization + Preparation

# select device to train on: use GPU if available else use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# intialize graphSAGE model
model = GraphSAGE(in_channels=x.shape[1], hidden_channels=64).to(device)

#Move node feature tensor to the same device as model (GPU or CPU)
x = x.to(device)
## same with edge features
train_edge_index = train_edge_index.to(device)


#choose optimizer to update weights after each backpropagation step
#Adam works excellent with most GNN.
# learning rate =0.1
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [9]:
## Training loop / The Core Learning Block

def train():
    model.train()
    optimizer.zero_grad()  # reset
    z = model.encode(x, train_edge_index)  ## calculate node embeddings
    pos_out = model.decode(z, train_edge_index) ## calculate probability for real edges in training data  
    pos_label = torch.ones(pos_out.size(0), device=device) ## give 1 to positive samples 


    ## create random edges between random nodes to create negative samples
    neg_edge_index = torch.randint(0, num_nodes, train_edge_index.size(), device=device)
    neg_out = model.decode(z, neg_edge_index) ## calculate probability for UNreal edges in training data 
    neg_label = torch.zeros(neg_out.size(0), device=device)  # ## give 0 to negative samples

    out = torch.cat([pos_out, neg_out]) ## combine all predictions 
    labels = torch.cat([pos_label, neg_label]) ## combine all labels 0 or 1

    # calculte loss function
    loss = F.binary_cross_entropy_with_logits(out, labels)
    loss.backward()   ##calculate gradients in backpropagation
    optimizer.step()   ## update weights 
    return loss.item()


In [10]:
# 50 cycle training

for epoch in range(1, 51):
    loss = train()
    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")


Epoch 5, Loss: 53359.0508
Epoch 10, Loss: 20182.1211
Epoch 15, Loss: 9519.5742
Epoch 20, Loss: 5461.8022
Epoch 25, Loss: 3390.5728
Epoch 30, Loss: 2547.5698
Epoch 35, Loss: 1434.9706
Epoch 40, Loss: 1044.2664
Epoch 45, Loss: 758.6997
Epoch 50, Loss: 470.5381


In [11]:

## Model Evaluation Block
## link prediction evaluation with AUC Metric

@torch.no_grad()   ## don't need gradients for this function, good for memory and speed.

def test(edge_index):
    model.eval()  # put model in evaluation mode.
    z = model.encode(x, train_edge_index)  # caculate embeddings for training graph structure.
    
    #positive sampling 
    pos_out = model.decode(z, edge_index)  
    pos_label = torch.ones(pos_out.size(0), device=device)

    # negative sampling
    neg_edge_index = torch.randint(0, num_nodes, edge_index.size(), device=device)
    neg_out = model.decode(z, neg_edge_index)
    neg_label = torch.zeros(neg_out.size(0), device=device)

    out = torch.cat([pos_out, neg_out]).cpu()
    labels = torch.cat([pos_label, neg_label]).cpu()

    auc = roc_auc_score(labels, out)   ## calculate area under ROC curve / AUC score is very good for link predictions tasks.
    return auc


In [12]:
val_auc = test(val_edge_index.to(device))
test_auc = test(test_edge_index.to(device))

print(f"Validation AUC: {val_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")


Validation AUC: 0.8037
Test AUC: 0.8038
