In [None]:
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from torch_geometric.data import Data
import torch
from torch import nn
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


### Code adapted from Yasemin's original version

In [3]:
# load in data
file_path = 'gnpc_supp.xlsx'
sheet_name = 'SuppTbl5'
df = pd.read_excel(file_path, sheet_name=sheet_name, header=1)

In [4]:
# select only the columns we need for training/prediction
col_for_training = ['EntrezGeneSymbol', 'Avg_StdBeta_weighted_AD', 'Meta_p_weighted_AD', 'Avg_StdBeta_weighted_PD', 'Meta_p_weighted_PD',
                    'Avg_StdBeta_weighted_FTD', 'Meta_p_weighted_FTD', 'StdBeta_ALS', 'p_ALS'] 

features_df = df[col_for_training].copy() # avoids SettingWithCopyWarning warning in next cell
features_df.rename(columns={'EntrezGeneSymbol' : 'EntrezGeneSymbol', 'Avg_StdBeta_weighted_AD' : 'AD_beta', 
                            'Meta_p_weighted_AD' : 'AD_p', 'Avg_StdBeta_weighted_PD' : 'PD_beta', 'Meta_p_weighted_PD' : 'PD_p',
                            'Avg_StdBeta_weighted_FTD' : 'FTD_beta', 'Meta_p_weighted_FTD' : 'FTD_p', 'StdBeta_ALS': 'ALS_beta', 'p_ALS' : 'ALS_p'
                            }, inplace=True)# rename cols for easier access  
features_df.head()

Unnamed: 0,EntrezGeneSymbol,AD_beta,AD_p,PD_beta,PD_p,FTD_beta,FTD_p,ALS_beta,ALS_p
0,CRYBB2,0.020411,0.105892,-0.019541,0.002026,0.016847,0.126806,-0.075116,0.201204
1,RAF1,-0.018095,0.863149,0.004977,0.189185,0.007533,0.393355,-0.052058,0.369734
2,ZNF41,0.049706,0.000772,0.004067,0.466674,0.000385,0.385409,0.015578,0.790375
3,ELK1,0.02899,0.002776,0.010033,0.580357,0.000471,0.457739,0.088028,0.129374
4,GUCA1A,-0.014837,0.15134,-0.00616,0.018256,-0.054077,0.095542,0.030428,0.603799


In [5]:
# data cleaning
# remove NAs & fill with zero, keep only the first duplicate for genes with multiple entries

print(f"Shape before: {features_df.shape}")
features_df.drop_duplicates(subset='EntrezGeneSymbol', keep='first', inplace=True)
features_df.set_index('EntrezGeneSymbol', inplace=True)
features_df.fillna(0, inplace=True)
print(f"Shape after: {features_df.shape}")

Shape before: (7289, 9)
Shape after: (6386, 8)


In [6]:
# feature engineering
# p values could really skew our overall data distribution, so we will -log transform them

# small value added to prevent log(0)
epsilon = 1e-8

# create the -log10(p-value) features
p_value_cols = ['AD_p', 'PD_p', 'FTD_p'] #'ALS_p'
for col in p_value_cols:
    new_col_name = col.replace('_p', '_logp')
    features_df[new_col_name] = -np.log10(features_df[col] + epsilon)

# create feature matrix
final_features = [
    'AD_beta', 'AD_logp',
    'PD_beta', 'PD_logp',
    'FTD_beta', 'FTD_logp',
    #'ALS_beta', 'ALS_logp' # this is what we'll be predicting, so drop here
]

# filter out old p value cols
final_features_df = features_df[final_features]
final_features_df.head()

Unnamed: 0_level_0,AD_beta,AD_logp,PD_beta,PD_logp,FTD_beta,FTD_logp
EntrezGeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CRYBB2,0.020411,0.975136,-0.019541,2.693365,0.016847,0.89686
RAF1,-0.018095,0.063914,0.004977,0.723114,0.007533,0.405216
ZNF41,0.049706,3.11223,0.004067,0.330987,0.000385,0.414078
ELK1,0.02899,2.556574,0.010033,0.236305,0.000471,0.339382
GUCA1A,-0.014837,0.820047,-0.00616,1.738601,-0.054077,1.019804


In [7]:
# PPI network created using STRING contains UniProt values 
# we need to map these to Gene Symbol
original_df = pd.read_excel(file_path, sheet_name=sheet_name, header=1)

mapping_df = original_df[['UniProt', 'EntrezGeneSymbol']].dropna() # cols we want to map
mapping_df = mapping_df.drop_duplicates(subset='UniProt') # drop duplicates - only want one UniProt-Entrez Gene Symbol combo
uniprot_to_gene_map = dict(zip(mapping_df['UniProt'], mapping_df['EntrezGeneSymbol']))
print(uniprot_to_gene_map) 

{'P43320': 'CRYBB2', 'P04049': 'RAF1', 'P51814': 'ZNF41', 'P19419': 'ELK1', 'P43080': 'GUCA1A', 'Q14457': 'BECN1', 'Q01968': 'OCRL', 'O95238': 'SPDEF', 'O43623': 'SNAI2', 'Q13303': 'KCNAB2', 'Q9Y253': 'POLH', 'P11473': 'VDR', 'Q86XE5': 'HOGA1', 'P09622': 'DLD', 'Q9UIF7': 'MUTYH', 'Q13115': 'DUSP4', 'Q9H4I2': 'ZHX3', 'Q96PQ1': 'SIGLEC12', 'P00491': 'PNP', 'Q9H3D4': 'TP63', 'P41235': 'HNF4A', 'Q96BR1': 'SGK3', 'O60885': 'BRD4', 'Q9GZT5': 'WNT10A', 'Q13618': 'CUL3', 'Q13490': 'BIRC2', 'P19878': 'NCF2', 'Q13951': 'CBFB', 'P54274': 'TERF1', 'Q13418': 'ILK', 'Q9H2C0': 'GAN', 'Q08050': 'FOXM1', 'P54725': 'RAD23A', 'Q9NW38': 'FANCL', 'O75884': 'RBBP9', 'Q13526': 'PIN1', 'P16885': 'PLCG2', 'Q9NUW8': 'TDP1', 'P25440': 'BRD2', 'Q9BR61': 'ACBD6', 'O00189': 'AP4M1', 'O95817': 'BAG3', 'Q14232': 'EIF2B1', 'Q9UPW6': 'SATB2', 'P07196': 'NEFL', 'P49675': 'STAR', 'P35520': 'CBS', 'P02489': 'CRYAA', 'P07741': 'APRT', 'O95671': 'ASMTL', 'Q9UNE7': 'STUB1', 'P09104': 'ENO2', 'Q8N2W9': 'PIAS4', 'Q13651': 'IL1

In [8]:
# map UniProt to Gene Symbol

valid_gene_symbols = set(final_features_df.index) # get unique gene symbols from features df

# load edges
edges_df_uniprot = pd.read_csv('ppi_edges_symbols.csv', header=0, names=['protein1_uniprot', 'protein2_uniprot'])
    
# mapping
edges_df_uniprot['protein1'] = edges_df_uniprot['protein1_uniprot'].map(uniprot_to_gene_map)
edges_df_uniprot['protein2'] = edges_df_uniprot['protein2_uniprot'].map(uniprot_to_gene_map)
translated_edges_df = edges_df_uniprot.dropna(subset=['protein1', 'protein2']) # drop rows where mapping didn't work (missing UniProt)

# perform the same cleaning steps as before

print(f"Shape before: {translated_edges_df.shape}")

# filter edges df to make sure we have same GeneSymbol values
filtered_edges_df = translated_edges_df[
    translated_edges_df['protein1'].isin(valid_gene_symbols) & 
    translated_edges_df['protein2'].isin(valid_gene_symbols)
].copy() # avoids SettingWithCopyWarning warning

# filter out any instances where protein1 and protein2 are the same
filtered_edges_df = filtered_edges_df[filtered_edges_df['protein1'] != filtered_edges_df['protein2']]

# sort edges by protein1 then protein2, then drop duplicates
sorted_edges = np.sort(filtered_edges_df[['protein1', 'protein2']].values, axis=1)
unique_edges_df = pd.DataFrame(sorted_edges, columns=['protein1', 'protein2']).drop_duplicates()
print(f"Shape after: {unique_edges_df.shape}")
unique_edges_df.head()

Shape before: (78682, 4)
Shape after: (39381, 2)


Unnamed: 0,protein1,protein2
0,ARF5,RAB11FIP3
1,ARF4,ARF5
2,ARF1,ARF5
3,ARF5,ARFIP1
4,ARF5,ARFIP2


In [None]:
# Generate torch_geometric.data.data Data object

# map gene symbol to an index value
gene_symbols = features_df.index.tolist()
gene_to_idx = {gene: i for i, gene in enumerate(gene_symbols)}

scaler = StandardScaler() # we need to scale our beta values

# create X
X_scaled = scaler.fit_transform(final_features_df.values)
X = torch.tensor(X_scaled, dtype=torch.float)

# create Y - we're predicting beta values for just ALS
y_df = features_df[['ALS_beta']] # 'AD_beta', 'PD_beta', 'FTD_beta', 
y_scaled = scaler.fit_transform(y_df.values)
Y = torch.tensor(y_scaled, dtype=torch.float)

# create edge index tensor
edge_index = torch.tensor([
    [gene_to_idx[p1] for p1 in unique_edges_df['protein1']],
    [gene_to_idx[p2] for p2 in unique_edges_df['protein2']]
], dtype=torch.long)

# create training, validation, and testing masks (60/20/20 split)
N = final_features_df.shape[0]
split_labels = np.random.choice([0, 1, 2], N, p = [0.6, 0.2, 0.2]) # 0 is train, 1 is val, 2 is test

train_mask = torch.tensor(split_labels==0, dtype=torch.bool)
val_mask = torch.tensor(split_labels==1, dtype=torch.bool)
test_mask = torch.tensor(split_labels==2, dtype=torch.bool)

# create data object
graph_data = Data(
    x=X,
    edge_index=edge_index,
    y=Y,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
    gene_symbols=gene_symbols # storing gene names for later interpretation (UMAP)
)
graph_data

Data(x=[6386, 6], edge_index=[2, 39381], y=[6386, 1], train_mask=[6386], val_mask=[6386], test_mask=[6386], gene_symbols=[6386])

In [10]:
def get_loss_weights(df):
    '''
    Calculates weights for loss function terms based on inverse variance of beta values.

    Parameters
    ----------
    df : Pandas DataFrame that includes the columns "AD_beta", "PD_beta", "FTD_beta", "ALS_beta"

    Returns
    -------
    ad_wt, pd_wt, ftd_wt, als_wt : weights for each component of the combined loss function
    '''
    # get df of just beta values 
    betas = df[["ALS_beta"]] #"AD_beta", "PD_beta", "FTD_beta"

    # get variance of beta values for each disease 
    variances_by_disease = betas.var()

    # calculate inverse variance to use as weight
    #ad_wt = 1/variances_by_disease["AD_beta"]
    #pd_wt = 1/variances_by_disease["PD_beta"]
    #ftd_wt = 1/variances_by_disease["FTD_beta"]
    als_wt = 1/variances_by_disease["ALS_beta"]

    return als_wt #ad_wt, pd_wt, ftd_wt

als_loss_wt = get_loss_weights(features_df)

In [19]:
# Model
from torch_geometric.nn import GATv2Conv

class MultiTaskGNN(torch.nn.Module):
    """
    A Graph Attention Network (GAT) for predicting multi-task regression targets.
    
    This model uses two GAT layers to learn node embeddings from the graph structure
    and initial node features. It then uses a linear output heads to make a
    regression prediction for ALS.
    """
    def __init__(self, in_channels, hidden_channels, heads=1, dropout=0.5):
        """
        Args:
            in_channels (int): Number of input features for each node.
            hidden_channels (int): Number of hidden units in the GAT layers.
            heads (int): Number of attention heads in the first GAT layer.
            dropout (float): Dropout rate for regularization.
        """
        super().__init__()
        
        self.dropout = dropout

        # First GAT layer: learns initial embeddings from input features.
        # Multi-head attention is used here for more robust feature learning.
        self.conv1 = GATv2Conv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        
        # Second GAT layer: aggregates information from the first layer.
        # The input channels must be hidden_channels * heads from the previous layer.
        # We use a single head here to get the final shared embedding.
        self.conv2 = GATv2Conv(hidden_channels * heads, hidden_channels, heads=1, dropout=dropout)

        # Output layers (prediction heads)
        # We create a separate linear layer for each of the 4 regression tasks.
        # This allows the model to learn a specific final transformation for each disease.
        #self.out_ad = nn.Linear(hidden_channels, 1)
        #self.out_pd = nn.Linear(hidden_channels, 1)
        #self.out_ftd = nn.Linear(hidden_channels, 1)
        self.out_als = nn.Linear(hidden_channels, 1)

    def forward(self, data):
        """
        The forward pass of the model.
        
        Args:
            data (torch_geometric.data.Data): The input graph data object.
        
        Returns:
            tuple: A tuple containing the predictions for the task
                   and the final shared node embeddings for analysis.
        """
        x, edge_index = data.x, data.edge_index
        
        # Apply dropout to the input features for regularization
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        # Pass through the first GAT layer, followed by an ELU activation function
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        # Pass through the second GAT layer to get the shared embeddings
        shared_embeddings = self.conv2(x, edge_index)
        
        # Generate predictions from each disease-specific output head
       # pred_ad = self.out_ad(shared_embeddings)
        #pred_pd = self.out_pd(shared_embeddings)
        #pred_ftd = self.out_ftd(shared_embeddings)
        pred_als = self.out_als(shared_embeddings)
        
        return pred_als, shared_embeddings #pred_ad, pred_pd, pred_ftd,


In [None]:
# Training 
import sys
import os

# Configuration
Learning_Rate = 0.005
Weight_Decay = 5e-4
Epochs = 300
Hidden_Channels = 64
Attention_Heads = 8
Dropout_Rate = 0.6

def train(model, data, optimizer, criterion):
    """Performs a single training step."""
    model.train()
    optimizer.zero_grad()
    
    # Get model predictions
    pred_als, _ = model(data) #pred_ad, pred_pd, pred_ftd, 
    
    # Use the training mask to select the nodes for loss calculation
    train_mask = data.train_mask
    
    # Calculate loss for each task ONLY on the training nodes
    #loss_ad = criterion(pred_ad[train_mask], data.y[train_mask, 0].unsqueeze(1))
    #loss_pd = criterion(pred_pd[train_mask], data.y[train_mask, 1].unsqueeze(1))
    #loss_ftd = criterion(pred_ftd[train_mask], data.y[train_mask, 2].unsqueeze(1))
    loss_als = criterion(pred_als[train_mask], data.y[train_mask, 0].unsqueeze(1))
    
    # Total loss is the sum of the individual task losses
    total_loss = als_loss_wt*loss_als #loss_ad + loss_pd + loss_ftd + 
    
    total_loss.backward()
    optimizer.step()
    
    return total_loss.item()

@torch.no_grad()
def evaluate(model, data, mask):
    """Evaluates the model on a given data split (validation or test)."""
    model.eval()
    pred_als, _ = model(data) #pred_ad, pred_pd, pred_ftd, 
    
    # Calculate MSE loss for each task on the given mask
    #loss_ad = F.mse_loss(pred_ad[mask], data.y[mask, 0].unsqueeze(1))
    #loss_pd = F.mse_loss(pred_pd[mask], data.y[mask, 1].unsqueeze(1))
    #loss_ftd = F.mse_loss(pred_ftd[mask], data.y[mask, 2].unsqueeze(1))
    loss_als = F.mse_loss(pred_als[mask], data.y[mask, 0].unsqueeze(1))
    
    total_val_loss = loss_als #loss_ad + loss_pd + loss_ftd + 
    return total_val_loss.item()

def run():
    """Main function to run the training and evaluation process."""
    print("Starting the GNN Training Process")
    
    #  2. Load Data 
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    #data_path = 'data/02-preprocessed/processed_graph.pt'
    #data = torch.load(data_path)

    data = graph_data

    data = data.to(device)

    print("\nLoaded data object:")
    print(data)

    # 3. Initialize Model and Optimizer 
    model = MultiTaskGNN(
        in_channels=data.num_node_features,
        hidden_channels=Hidden_Channels,
        heads=Attention_Heads,
        dropout=Dropout_Rate
    ).to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=Learning_Rate, weight_decay=Weight_Decay)
    criterion = torch.nn.MSELoss() # Mean Squared Error for regression
    
    print("\nModel architecture:")
    print(model)

    #  4. Training Loop 
    best_val_loss = float('inf')
    best_model_state = None

    print("\nStarting Training")
    for epoch in range(1, Epochs + 1):
        train_loss = train(model, data, optimizer, criterion)
        val_loss = evaluate(model, data, data.val_mask)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict() # Save the model weights
            print(f"Epoch {epoch:03d}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}  (New best model!)")
        else:
            print(f"Epoch {epoch:03d}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # 5. Final Evaluation on Test Set 
    # Load the best performing model
    model.load_state_dict(best_model_state)
    
    test_loss = evaluate(model, data, data.test_mask)
    print("\n Training Complete")
    print(f"Best Validation Loss: {best_val_loss:.4f}")
    print(f"Final Test Loss: {test_loss:.4f}")
    
    # 6. Save the Best Model 
    #model_save_path = 'models/best_model.pt'
    #torch.save(best_model_state, model_save_path)
    #print(f"\nBest model weights saved to '{model_save_path}'")

In [21]:
run()

Starting the GNN Training Process
Using device: cuda

Loaded data object:
Data(x=[6386, 6], edge_index=[2, 39381], y=[6386, 1], train_mask=[6386], val_mask=[6386], test_mask=[6386], gene_symbols=[6386])

Model architecture:
MultiTaskGNN(
  (conv1): GATv2Conv(6, 64, heads=8)
  (conv2): GATv2Conv(512, 64, heads=1)
  (out_als): Linear(in_features=64, out_features=1, bias=True)
)

Starting Training
Epoch 001: Train Loss: 258.7323, Val Loss: 1.3350  (New best model!)
Epoch 002: Train Loss: 308.9192, Val Loss: 1.9692
Epoch 003: Train Loss: 548.5131, Val Loss: 1.1373  (New best model!)
Epoch 004: Train Loss: 256.5288, Val Loss: 1.2672
Epoch 005: Train Loss: 284.3479, Val Loss: 1.5039
Epoch 006: Train Loss: 364.0401, Val Loss: 1.2734
Epoch 007: Train Loss: 284.2778, Val Loss: 1.0733  (New best model!)
Epoch 008: Train Loss: 222.8749, Val Loss: 1.1095
Epoch 009: Train Loss: 235.1769, Val Loss: 1.2037
Epoch 010: Train Loss: 267.1075, Val Loss: 1.2057
Epoch 011: Train Loss: 267.9169, Val Loss: 1.

**Results:**

Best Validation Loss: 1.0617
Final Test Loss: 0.9164

High loss - we should do some hyperparameter optimization if we want to use this model