In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.transforms import RandomLinkSplit

In [2]:
# Load your merged dataframe
df = pd.read_excel("data/merged_lnctard_STring.xlsx")
df.head()

Unnamed: 0,node1,node2,interaction,disease,source
0,LINC00313,miR-4429,binding/interaction,Papillary thyroid carcinoma,lncRNA
1,FAM83H-AS1,CDKN1A,regulation,Malignant glioma,lncRNA
2,NEAT1,TGFB1,association,Hepatocellular carcinoma,lncRNA
3,NEAT1,ZEB1,regulation,Breast cancer,lncRNA
4,ZFPM2-AS1,MIF,binding/interaction,Gastric cancer,lncRNA


In [7]:

# We only need the nodes (node1, node2)
edges = df[['node1', 'node2']]

# Encode the node IDs into integer indices
le = LabelEncoder()
all_nodes = pd.concat([edges['node1'], edges['node2']])
le.fit(all_nodes)

# transform to integer indices and convert to numpy arrays (faster for torch.tensor)
source = le.transform(edges['node1']).astype('int64')
target = le.transform(edges['node2']).astype('int64')

# Build edge index tensor
edge_index = torch.tensor([source, target], dtype=torch.long)

# get total number of nodes
num_nodes = len(le.classes_)


# ---------------------
# Build Degree Feature 
# ---------------------

# Count the degree for each node
node_degrees = np.zeros(num_nodes)
for s, t in zip(source, target):
    node_degrees[s] += 1
    node_degrees[t] += 1

# Convert node features to torch tensor (reshape to be (num_nodes, 1))
x = torch.tensor(node_degrees, dtype=torch.float).unsqueeze(1)

# Create PyG Data object
#data = Data(edge_index=edge_index, x=node_features, num_nodes=num_nodes)
data = Data(edge_index=edge_index, x=x, num_nodes=num_nodes)


# Create train / val / test splits (now easier with random link split)
transform = RandomLinkSplit(is_undirected=True, add_negative_train_samples=True)
train_data, val_data, test_data = transform(data)

# Check basic info
print(f"Train edges: {train_data.edge_index.shape[1]}")
print(f"Validation edges: {val_data.edge_index.shape[1]}")
print(f"Test edges: {test_data.edge_index.shape[1]}")
print(f"Node features shape: {train_data.x.shape}")


Train edges: 610064
Validation edges: 610064
Test edges: 697214
Node features shape: torch.Size([20232, 1])


In [8]:
class GraphSAGE(torch.nn.Module):
    
    # in_channels should contains the number of node features
    def __init__(self, in_channels, hidden_channels):
        super(GraphSAGE, self).__init__()
        
        ## the first two layers
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        
        ## output layer for link prediction which gives a final score for each edge
        self.lin = torch.nn.Linear(hidden_channels, 1) 

        
    # return node embedding for each node.
    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)   ## RELU activation
        x = self.conv2(x, edge_index)
        return x

    
    # return probability of edge existance between each node pair
    def decode(self, z, edge_label_index):
        src = z[edge_label_index[0]]
        dst = z[edge_label_index[1]]
        return torch.sum(src * dst, dim=-1)  # dot product similarity

In [9]:
train_edge_index = train_data.edge_index
val_edge_index = val_data.edge_index
test_edge_index = test_data.edge_index

In [10]:
# Device Assignment + Model Initialization + Preparation

# select device to train on: use GPU if available else use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# intialize graphSAGE model
model = GraphSAGE(in_channels=node_features.shape[1], hidden_channels=64).to(device)

# Move node feature tensor to the same device as model (GPU or CPU)
x = x.to(device)
## same with edge features
train_edge_index = train_edge_index.to(device)


#choose optimizer to update weights after each backpropagation step
#Adam works excellent with most GNN.
# learning rate =0.1
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [11]:
## Training loop / The Core Learning Block

def train():
    model.train()
    optimizer.zero_grad()  # reset
    z = model.encode(x, train_edge_index)  ## calculate node embeddings
    pos_out = model.decode(z, train_edge_index) ## calculate probability for real edges in training data  
    pos_label = torch.ones(pos_out.size(0), device=device) ## give 1 to positive samples 


    ## create random edges between random nodes to create negative samples
    neg_edge_index = torch.randint(0, num_nodes, train_edge_index.size(), device=device)
    neg_out = model.decode(z, neg_edge_index) ## calculate probability for UNreal edges in training data 
    neg_label = torch.zeros(neg_out.size(0), device=device)  # ## give 0 to negative samples

    out = torch.cat([pos_out, neg_out]) ## combine all predictions 
    labels = torch.cat([pos_label, neg_label]) ## combine all labels 0 or 1

    # calculte loss function
    loss = F.binary_cross_entropy_with_logits(out, labels)
    loss.backward()   ##calculate gradients in backpropagation
    optimizer.step()   ## update weights 
    return loss.item()


In [12]:
# 50 cycle training

for epoch in range(1, 51):
    loss = train()
    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")


Epoch 5, Loss: 95715.6641
Epoch 10, Loss: 31545.9883
Epoch 15, Loss: 22965.6758
Epoch 20, Loss: 10659.5127
Epoch 25, Loss: 8051.0781
Epoch 30, Loss: 4806.2119
Epoch 35, Loss: 3816.8171
Epoch 40, Loss: 2143.0479
Epoch 45, Loss: 1677.1681
Epoch 50, Loss: 1186.1985


In [13]:

## Model Evaluation Block
## link prediction evaluation with AUC Metric

@torch.no_grad()   ## don't need gradients for this function, good for memory and speed.

def test(edge_index):
    model.eval()  # put model in evaluation mode.
    z = model.encode(x, train_edge_index)  # caculate embeddings for training graph structure.
    
    #positive sampling 
    pos_out = model.decode(z, edge_index)  
    pos_label = torch.ones(pos_out.size(0), device=device)

    # negative sampling
    neg_edge_index = torch.randint(0, num_nodes, edge_index.size(), device=device)
    neg_out = model.decode(z, neg_edge_index)
    neg_label = torch.zeros(neg_out.size(0), device=device)

    out = torch.cat([pos_out, neg_out]).cpu()
    labels = torch.cat([pos_label, neg_label]).cpu()

    auc = roc_auc_score(labels, out)   ## calculate area under ROC curve / AUC score is very good for link predictions tasks.
    return auc


In [14]:
val_auc = test(val_edge_index.to(device))
test_auc = test(test_edge_index.to(device))

print(f"Validation AUC: {val_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")


Validation AUC: 0.8033
Test AUC: 0.8033
