# Graph Neural Network with Node Information

In [1]:
# Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
import numpy as np
import pandas as pd
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the main path
main_path = '/Users/posmikdc/Documents/brown/classes/year2/fall25/csci2952g-dlgenomics/csci2952g-paper'
data_path = os.path.join(main_path, 'data/gao_shs27k_data')

# Define data and file objects
data = {}
files = [
    'protein.actions.SHS27k.STRING.pro2.txt',
    'protein.SHS27k.sequences.dictionary.pro3.tsv',
    'edge_list_12.npy',
    'x_list.pt',
    'vec5_CTC.txt'
]

# Read in files based on extension
for file in files:
    file_path = os.path.join(data_path, file)
    key = os.path.splitext(file)[0]
    
    if file.endswith('.npy'):
        data[key] = np.load(file_path, allow_pickle=True)
    elif file.endswith('.pt'):
        data[key] = torch.load(file_path, weights_only=False)
    elif file.endswith('.tsv'):
        data[key] = pd.read_csv(file_path, sep='\t')
    else:
        data[key] = pd.read_csv(file_path, sep=r'\s+')

## Baseline GNN: GCN Approach with Protein Degree as Node Features

This baseline GNN uses protein degree as node features (simple but informative). We implement link prediction to predict protein interactions. We create negative samples (non-interacting protein pairs) for training

This GNN approach uses GCN layers to aggregate neighborhood information. Evaluates with AUC, AP, and F1 metrics

In [3]:
# Create the Graph Data
def create_graph_data(ppi_df, min_score=0):
    """
    Create PyTorch Geometric graph data from PPI dataframe.
    Uses protein degree as simple node features.
    
    Parameters:
    -----------
    ppi_df : pd.DataFrame
        PPI dataframe
    min_score : int
        Minimum score threshold
        
    Returns:
    --------
    data : torch_geometric.data.Data
        Graph data object
    protein_to_idx : dict
        Mapping from protein ID to node index
    """
    # Filter by score
    df = ppi_df[ppi_df['score'] >= min_score].copy()
    
    # Get unique proteins
    proteins = pd.concat([df['item_id_a'], df['item_id_b']]).unique()
    protein_to_idx = {prot: idx for idx, prot in enumerate(proteins)}
    idx_to_protein = {idx: prot for prot, idx in protein_to_idx.items()}
    
    print(f"Number of proteins (nodes): {len(proteins)}")
    print(f"Number of interactions (edges): {len(df)}")
    
    # Create edge index
    edge_index = []
    edge_labels = []
    edge_scores = []
    
    for _, row in df.iterrows():
        src = protein_to_idx[row['item_id_a']]
        dst = protein_to_idx[row['item_id_b']]
        edge_index.append([src, dst])
        edge_index.append([dst, src])  # Add reverse edge for undirected graph
        
        # Binary label: 1 if interaction exists
        edge_labels.extend([1, 1])
        edge_scores.extend([row['score'], row['score']])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_labels = torch.tensor(edge_labels, dtype=torch.float)
    edge_scores = torch.tensor(edge_scores, dtype=torch.float)
    
    # Create node features: [degree, normalized_degree, clustering_coefficient]
    import networkx as nx
    G = nx.Graph()
    for _, row in df.iterrows():
        G.add_edge(row['item_id_a'], row['item_id_b'])
    
    node_features = []
    for idx in range(len(proteins)):
        prot = idx_to_protein[idx]
        degree = G.degree(prot) if prot in G else 0
        node_features.append([degree])
    
    node_features = torch.tensor(node_features, dtype=torch.float)
    
    # Normalize features
    node_features = (node_features - node_features.mean(dim=0)) / (node_features.std(dim=0) + 1e-8)
    
    # Create negative samples (non-edges)
    num_nodes = len(proteins)
    num_neg_samples = len(edge_labels) // 2  # Same number as positive edges
    
    neg_edge_index = []
    existing_edges = set(map(tuple, edge_index.t().numpy()))
    
    while len(neg_edge_index) < num_neg_samples:
        src = np.random.randint(0, num_nodes)
        dst = np.random.randint(0, num_nodes)
        if src != dst and (src, dst) not in existing_edges and (dst, src) not in existing_edges:
            neg_edge_index.append([src, dst])
            neg_edge_index.append([dst, src])
    
    neg_edge_index = torch.tensor(neg_edge_index, dtype=torch.long).t().contiguous()
    neg_edge_labels = torch.zeros(len(neg_edge_index[0]), dtype=torch.float)
    
    # Combine positive and negative edges
    all_edge_index = torch.cat([edge_index, neg_edge_index], dim=1)
    all_edge_labels = torch.cat([edge_labels, neg_edge_labels])
    
    # Create PyG data object
    data = Data(x=node_features, edge_index=edge_index)
    data.edge_label_index = all_edge_index
    data.edge_label = all_edge_labels
    data.num_nodes = len(proteins)
    
    print(f"Node feature shape: {node_features.shape}")
    print(f"Positive edges: {len(edge_labels) // 2}")
    print(f"Negative edges: {len(neg_edge_labels) // 2}")
    
    return data, protein_to_idx


In [4]:
# GNN (for Link Prediction)
class GNN_LinkPredictor(nn.Module):
    """
    Simple GNN for link prediction.
    """
    def __init__(self, in_channels, hidden_channels=64, num_layers=2):
        super().__init__()
        
        self.convs = nn.ModuleList()
        self.convs.append(GCNConv(in_channels, hidden_channels))
        
        for _ in range(num_layers - 1):
            self.convs.append(GCNConv(hidden_channels, hidden_channels))
        
        # Link prediction head
        self.lin = nn.Linear(hidden_channels * 2, 1)
        
    def encode(self, x, edge_index):
        """Encode nodes to embeddings."""
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=0.5, training=self.training)
        return x
    
    def decode(self, z, edge_label_index):
        """Decode edge predictions from node embeddings."""
        # Concatenate source and target node embeddings
        src = z[edge_label_index[0]]
        dst = z[edge_label_index[1]]
        edge_emb = torch.cat([src, dst], dim=-1)
        return self.lin(edge_emb).squeeze()
    
    def forward(self, x, edge_index, edge_label_index):
        z = self.encode(x, edge_index)
        return self.decode(z, edge_label_index)

In [5]:
# Training Function
def train_gnn(data, model, optimizer, device):
    """Train for one epoch."""
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    out = model(data.x, data.edge_index, data.edge_label_index)
    loss = F.binary_cross_entropy_with_logits(out, data.edge_label)
    
    # Backward pass
    loss.backward()
    optimizer.step()
    
    return loss.item()

In [6]:
# Evaluation Function
def evaluate_gnn(data, model, device):
    """Evaluate model."""
    model.eval()
    
    with torch.no_grad():
        out = model(data.x, data.edge_index, data.edge_label_index)
        pred = torch.sigmoid(out).cpu().numpy()
        labels = data.edge_label.cpu().numpy()
        
        auc = roc_auc_score(labels, pred)
        ap = average_precision_score(labels, pred)
        pred_binary = (pred > 0.5).astype(int)
        f1 = f1_score(labels, pred_binary)
    
    return auc, ap, f1


In [7]:

# Example usage:
ppi_df = data['protein.actions.SHS27k.STRING.pro2']

# Create graph data
graph_data, protein_to_idx = create_graph_data(ppi_df, min_score=400)

# Split edges into train/val/test
num_edges = len(graph_data.edge_label)
indices = torch.randperm(num_edges)
train_size = int(0.7 * num_edges)
val_size = int(0.15 * num_edges)

train_idx = indices[:train_size]
val_idx = indices[train_size:train_size + val_size]
test_idx = indices[train_size + val_size:]

train_data = Data(
    x=graph_data.x,
    edge_index=graph_data.edge_index,
    edge_label_index=graph_data.edge_label_index[:, train_idx],
    edge_label=graph_data.edge_label[train_idx]
)

val_data = Data(
    x=graph_data.x,
    edge_index=graph_data.edge_index,
    edge_label_index=graph_data.edge_label_index[:, val_idx],
    edge_label=graph_data.edge_label[val_idx]
)

test_data = Data(
    x=graph_data.x,
    edge_index=graph_data.edge_index,
    edge_label_index=graph_data.edge_label_index[:, test_idx],
    edge_label=graph_data.edge_label[test_idx]
)

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN_LinkPredictor(in_channels=graph_data.x.shape[1], hidden_channels=64, num_layers=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Move data to device
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

# Training loop
print("\nTraining GNN...")
for epoch in range(1, 101):
    loss = train_gnn(train_data, model, optimizer, device)
    
    if epoch % 10 == 0:
        train_auc, train_ap, train_f1 = evaluate_gnn(train_data, model, device)
        val_auc, val_ap, val_f1 = evaluate_gnn(val_data, model, device)
        
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}')
        print(f'  Train - AUC: {train_auc:.4f}, AP: {train_ap:.4f}, F1: {train_f1:.4f}')
        print(f'  Val   - AUC: {val_auc:.4f}, AP: {val_ap:.4f}, F1: {val_f1:.4f}')

# Final test evaluation
test_auc, test_ap, test_f1 = evaluate_gnn(test_data, model, device)
print(f'\nTest Results - AUC: {test_auc:.4f}, AP: {test_ap:.4f}, F1: {test_f1:.4f}')

Number of proteins (nodes): 784
Number of interactions (edges): 4810
Node feature shape: torch.Size([784, 1])
Positive edges: 4810
Negative edges: 2405

Training GNN...
Epoch 010, Loss: 0.5358
  Train - AUC: 0.7918, AP: 0.8937, F1: 0.8023
  Val   - AUC: 0.8129, AP: 0.9030, F1: 0.8130
Epoch 020, Loss: 0.5135
  Train - AUC: 0.8135, AP: 0.9002, F1: 0.8151
  Val   - AUC: 0.8358, AP: 0.9105, F1: 0.8195
Epoch 030, Loss: 0.4900
  Train - AUC: 0.8289, AP: 0.9053, F1: 0.8385
  Val   - AUC: 0.8503, AP: 0.9159, F1: 0.8502
Epoch 040, Loss: 0.4615
  Train - AUC: 0.8589, AP: 0.9142, F1: 0.8603
  Val   - AUC: 0.8793, AP: 0.9252, F1: 0.8711
Epoch 050, Loss: 0.4498
  Train - AUC: 0.8692, AP: 0.9160, F1: 0.8798
  Val   - AUC: 0.8896, AP: 0.9278, F1: 0.8845
Epoch 060, Loss: 0.4444
  Train - AUC: 0.8715, AP: 0.9154, F1: 0.8760
  Val   - AUC: 0.8920, AP: 0.9277, F1: 0.8872
Epoch 070, Loss: 0.4321
  Train - AUC: 0.8719, AP: 0.9157, F1: 0.8827
  Val   - AUC: 0.8921, AP: 0.9278, F1: 0.8916
Epoch 080, Loss: 0.