In [1]:
import pandas as pd
import numpy as np

In [2]:
#loading the GNPC supplementary table 5
file_path = 'gnpc_supp.xlsx'
sheet_name = 'SuppTbl5'

In [4]:
# Read the Excel file, skipping the first row (header=1)
# AD removed try - no matching except statment
df = pd.read_excel(file_path, sheet_name=sheet_name, header=1)
print("DataFrame shape:", df.shape)
print(df.head())
print(df.columns)

DataFrame shape: (7289, 75)
      SeqId    SomaId                                     TargetFullName  \
0  10000-28  SL019233                                 Beta-crystallin B2   
1   10001-7  SL002564  RAF proto-oncogene serine/threonine-protein ki...   
2  10003-15  SL019245                             Zinc finger protein 41   
3  10006-25  SL019228                ETS domain-containing protein Elk-1   
4  10008-43  SL019234              Guanylyl cyclase-activating protein 1   

  Target UniProt EntrezGeneID EntrezGeneSymbol Organism  \
0  CRBB2  P43320         1415           CRYBB2    Human   
1  c-Raf  P04049         5894             RAF1    Human   
2  ZNF41  P51814         7592            ZNF41    Human   
3   ELK1  P19419         2002             ELK1    Human   
4  GUC1A  P43080         2978           GUCA1A    Human   

   Avg_StdBeta_weighted_AD  Meta_p_weighted_AD  ...  FTD_StdBeta_I   FTD_p_I  \
0                 0.020411            0.105892  ...       0.006329  0.928649   


In [5]:
# Define the required columns and their new names

required_columns = {
    'EntrezGeneSymbol': 'GeneSymbol',
    'Avg_StdBeta_weighted_AD': 'AD_beta',
    'Meta_p_weighted_AD': 'AD_p',
    'Avg_StdBeta_weighted_PD': 'PD_beta',
    'Meta_p_weighted_PD': 'PD_p',
    'Avg_StdBeta_weighted_FTD': 'FTD_beta',
    'Meta_p_weighted_FTD': 'FTD_p',
    'StdBeta_ALS': 'ALS_beta', # This is the non-weighted one for ALS
    'p_ALS': 'ALS_p'
}

missing_cols = [col for col in required_columns.keys() if col not in df.columns]
if missing_cols:
    print(f"Error: The following required columns are missing: {missing_cols}")
else:
    # Select and rename the columns
    features_df = df[list(required_columns.keys())].copy() 
    features_df.rename(columns=required_columns, inplace=True)
    
    print("New DataFrame shape:", features_df.shape)
    print(features_df.head())

New DataFrame shape: (7289, 9)
  GeneSymbol   AD_beta      AD_p   PD_beta      PD_p  FTD_beta     FTD_p  \
0     CRYBB2  0.020411  0.105892 -0.019541  0.002026  0.016847  0.126806   
1       RAF1 -0.018095  0.863149  0.004977  0.189185  0.007533  0.393355   
2      ZNF41  0.049706  0.000772  0.004067  0.466674  0.000385  0.385409   
3       ELK1  0.028990  0.002776  0.010033  0.580357  0.000471  0.457739   
4     GUCA1A -0.014837  0.151340 -0.006160  0.018256 -0.054077  0.095542   

   ALS_beta     ALS_p  
0 -0.075116  0.201204  
1 -0.052058  0.369734  
2  0.015578  0.790375  
3  0.088028  0.129374  
4  0.030428  0.603799  


In [6]:
# Pre-processing

features_df['GeneSymbol'].fillna('UNKNOWN', inplace=True)
features_df['GeneSymbol'] = features_df['GeneSymbol'].astype(str)

# Some gene symbols might be "GENE1;GENE2". We take the first one.
features_df['GeneSymbol'] = features_df['GeneSymbol'].apply(lambda x: x.split(';')[0])

# Check for duplicates 
print(f"\nNumber of unique gene symbols: {features_df['GeneSymbol'].nunique()}")
print(f"Total number of rows: {len(features_df)}")
# If there are duplicates, we will keep the first occurrence
features_df.drop_duplicates(subset='GeneSymbol', keep='first', inplace=True)
print(f"Number of rows after dropping duplicates: {len(features_df)}")

# Set GeneSymbol as the index
features_df.set_index('GeneSymbol', inplace=True)

# Handle missing values (NaNs) in the data columns
print("\nMissing values per column before cleaning:")
print(features_df.isnull().sum())

# Fill NaN values with 0.
features_df.fillna(0, inplace=True)

print("\nMissing values per column after cleaning:")
print(features_df.isnull().sum())

print("\nDataFrame after cleaning:")
print(features_df.head())


Number of unique gene symbols: 6386
Total number of rows: 7289
Number of rows after dropping duplicates: 6386

Missing values per column before cleaning:
AD_beta     0
AD_p        0
PD_beta     0
PD_p        0
FTD_beta    0
FTD_p       0
ALS_beta    0
ALS_p       0
dtype: int64

Missing values per column after cleaning:
AD_beta     0
AD_p        0
PD_beta     0
PD_p        0
FTD_beta    0
FTD_p       0
ALS_beta    0
ALS_p       0
dtype: int64

DataFrame after cleaning:
             AD_beta      AD_p   PD_beta      PD_p  FTD_beta     FTD_p  \
GeneSymbol                                                               
CRYBB2      0.020411  0.105892 -0.019541  0.002026  0.016847  0.126806   
RAF1       -0.018095  0.863149  0.004977  0.189185  0.007533  0.393355   
ZNF41       0.049706  0.000772  0.004067  0.466674  0.000385  0.385409   
ELK1        0.028990  0.002776  0.010033  0.580357  0.000471  0.457739   
GUCA1A     -0.014837  0.151340 -0.006160  0.018256 -0.054077  0.095542   

      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features_df['GeneSymbol'].fillna('UNKNOWN', inplace=True)


In [7]:
# Feature Engineering 

# Define a small constant to add to p-values to avoid log(0)
epsilon = 1e-300

# Create the -log10(p-value) features
p_value_cols = ['AD_p', 'PD_p', 'FTD_p', 'ALS_p']
for col in p_value_cols:
    # The new column name will be i.e, 'AD_logp'
    new_col_name = col.replace('_p', '_logp')
    features_df[new_col_name] = -np.log10(features_df[col] + epsilon)

print("\nDataFrame after adding -log10(p) features:")
print(features_df.head())


# Re-ordering the final feature matrix
final_feature_order = [
    'AD_beta', 'AD_logp',
    'PD_beta', 'PD_logp',
    'FTD_beta', 'FTD_logp',
    'ALS_beta', 'ALS_logp'
]

final_features_df = features_df[final_feature_order]

print("\nFinal, ordered feature DataFrame:")
print(final_features_df.head())


#Save
output_path = 'protein_features.csv'
final_features_df.to_csv(output_path)

print(f"\nCreated and saved the feature matrix to '{output_path}'")
print(f"Final shape of the feature matrix: {final_features_df.shape}")


DataFrame after adding -log10(p) features:
             AD_beta      AD_p   PD_beta      PD_p  FTD_beta     FTD_p  \
GeneSymbol                                                               
CRYBB2      0.020411  0.105892 -0.019541  0.002026  0.016847  0.126806   
RAF1       -0.018095  0.863149  0.004977  0.189185  0.007533  0.393355   
ZNF41       0.049706  0.000772  0.004067  0.466674  0.000385  0.385409   
ELK1        0.028990  0.002776  0.010033  0.580357  0.000471  0.457739   
GUCA1A     -0.014837  0.151340 -0.006160  0.018256 -0.054077  0.095542   

            ALS_beta     ALS_p   AD_logp   PD_logp  FTD_logp  ALS_logp  
GeneSymbol                                                              
CRYBB2     -0.075116  0.201204  0.975136  2.693367  0.896860  0.696364  
RAF1       -0.052058  0.369734  0.063914  0.723114  0.405216  0.432111  
ZNF41       0.015578  0.790375  3.112235  0.330987  0.414078  0.102167  
ELK1        0.088028  0.129374  2.556576  0.236305  0.339382  0.888151  

In [8]:
# CHANGE BY AD: adding function to calculate weights for loss function
def get_loss_weights(df):
    '''
    Calculates weights for loss function terms based on inverse variance of beta values.

    Parameters
    ----------
    df : Pandas DataFrame that includes the columns "AD_beta", "PD_beta", "FTD_beta", "ALS_beta"

    Returns
    -------
    ad_wt, pd_wt, ftd_wt, als_wt : weights for each component of the combined loss function
    '''

    betas = df[["AD_beta", "PD_beta", "FTD_beta", "ALS_beta"]]

    # get variance of beta values for each disease 
    variances_by_disease = betas.var()

    # calculate inverse variance to use as weight
    ad_wt = 1/variances_by_disease["AD_beta"]
    pd_wt = 1/variances_by_disease["PD_beta"]
    ftd_wt = 1/variances_by_disease["FTD_beta"]
    als_wt = 1/variances_by_disease["ALS_beta"]

    return ad_wt, pd_wt, ftd_wt, als_wt

loss_weights = get_loss_weights(final_features_df)

In [9]:

# Create UniProt to Gene Symbol Mapping 

# Load the Original GNPC Data again to get the mapping 
file_path = 'gnpc_supp.xlsx'
sheet_name = 'SuppTbl5'

# Use header=1 
original_df = pd.read_excel(file_path, sheet_name=sheet_name, header=1)
    

# Select only the two columns we need for mapping
# Drop rows where either identifier is missing
mapping_df = original_df[['UniProt', 'EntrezGeneSymbol']].dropna()

mapping_df['EntrezGeneSymbol'] = mapping_df['EntrezGeneSymbol'].astype(str).apply(lambda x: x.split(';')[0])

# Create the dictionary: {UniProt_ID: GeneSymbol}
mapping_df = mapping_df.drop_duplicates(subset='UniProt')
uniprot_to_gene_map = dict(zip(mapping_df['UniProt'], mapping_df['EntrezGeneSymbol']))

print(f"Created a mapping dictionary with {len(uniprot_to_gene_map)} entries.")
print("Example mappings:", list(uniprot_to_gene_map.items())[:5])

Created a mapping dictionary with 6387 entries.
Example mappings: [('P43320', 'CRYBB2'), ('P04049', 'RAF1'), ('P51814', 'ZNF41'), ('P19419', 'ELK1'), ('P43080', 'GUCA1A')]


In [10]:
# Translate, Filter, and Clean Edges 

# Get the master list of our 6,386 valid gene symbols
final_features_df = pd.read_csv('protein_features.csv', index_col='GeneSymbol')
valid_gene_symbols = set(final_features_df.index)
print(f"\nLoaded {len(valid_gene_symbols)} unique gene symbols to use as our node list.")

edges_file_path = 'ppi_edges_symbols.csv' # This file contains UniProt IDs


# Load the edges file, which has a header row like "protein1, protein2"
edges_df_uniprot = pd.read_csv(edges_file_path, header=0, names=['protein1_uniprot', 'protein2_uniprot'])

print(f"\nLoaded raw edges file with UniProt IDs. Shape: {edges_df_uniprot.shape}")

# Translation 
# Map the UniProt IDs in both columns to Gene Symbols using our dictionary
edges_df_uniprot['protein1'] = edges_df_uniprot['protein1_uniprot'].map(uniprot_to_gene_map)
edges_df_uniprot['protein2'] = edges_df_uniprot['protein2_uniprot'].map(uniprot_to_gene_map)

# Drop rows where a translation failed 
translated_edges_df = edges_df_uniprot.dropna(subset=['protein1', 'protein2'])

#FILTERING (on the translated Gene Symbols) 
original_edge_count = len(translated_edges_df)

print("\nFiltering translated edges to match the nodes in our feature matrix")
filtered_edges_df = translated_edges_df[
    translated_edges_df['protein1'].isin(valid_gene_symbols) & 
    translated_edges_df['protein2'].isin(valid_gene_symbols)
].copy()

print(f"Kept {len(filtered_edges_df)} out of {original_edge_count} edges.")

# Cleaning
pre_self_loop_count = len(filtered_edges_df)
filtered_edges_df = filtered_edges_df[filtered_edges_df['protein1'] != filtered_edges_df['protein2']]
print(f"Removed {pre_self_loop_count - len(filtered_edges_df)} self-loops.")

pre_dedupe_count = len(filtered_edges_df)
sorted_edges = np.sort(filtered_edges_df[['protein1', 'protein2']].values, axis=1)
unique_edges_df = pd.DataFrame(sorted_edges, columns=['protein1', 'protein2']).drop_duplicates()
print(f" Removed {pre_dedupe_count - len(unique_edges_df)} duplicate edges.")
print(f"Final unique edge count: {len(unique_edges_df)}")

# Save 
clean_edges_output_path = 'protein_edges_clean.csv'
unique_edges_df.to_csv(clean_edges_output_path, index=False, header=False)

print(f"\nSaved the cleaned and translated edge list to '{clean_edges_output_path}'")



Loaded 6386 unique gene symbols to use as our node list.

Loaded raw edges file with UniProt IDs. Shape: (78950, 2)

Filtering translated edges to match the nodes in our feature matrix
Kept 78682 out of 78682 edges.
Removed 0 self-loops.
 Removed 39301 duplicate edges.
Final unique edge count: 39381

Saved the cleaned and translated edge list to 'protein_edges_clean.csv'


In [None]:
import torch
from torch_geometric.data import Data

# Create PyTorch Geometric Data Object 

# Load the node features we created
features_df = pd.read_csv('protein_features.csv', index_col='GeneSymbol')

# Load the clean edge list we just created
edges_df = pd.read_csv('protein_edges_clean.csv', header=None, names=['protein1', 'protein2'])

print(f"Loaded {len(features_df)} node features.")
print(f"Loaded {len(edges_df)} clean edges.")

# Prepare Data for PyTorch 

# 1. Create the protein-to-integer mapping
# The order of proteins in features_df is our canonical order
gene_symbols = features_df.index.tolist()
gene_to_idx = {gene: i for i, gene in enumerate(gene_symbols)}
print(f"\nCreated mapping for {len(gene_to_idx)} genes to integer indices.")

# 2. Create the feature tensor (X)
# Convert the pandas DataFrame to a NumPy array, then to a PyTorch tensor
X = torch.tensor(features_df.values, dtype=torch.float)
print(f"Feature matrix X created with shape: {X.shape}")

# 3. Create the target tensor (Y)
# Our target is to predict the beta values
y_df = features_df[['AD_beta', 'PD_beta', 'FTD_beta', 'ALS_beta']]
Y = torch.tensor(y_df.values, dtype=torch.float)
print(f"Target matrix Y created with shape: {Y.shape}")

# 4. Create the edge_index tensor
# Map the gene symbols in the edges_df to their integer indices
edge_index = torch.tensor([
    [gene_to_idx[p1] for p1 in edges_df['protein1']],
    [gene_to_idx[p2] for p2 in edges_df['protein2']]
], dtype=torch.long)
print(f"Edge index tensor created with shape: {edge_index.shape}")

# Create Node Masks for Splitting Data 
num_nodes = len(features_df)
perm = torch.randperm(num_nodes) # Random permutation of indices

# Splitting 70% for training, 15% for validation, 15% for testing
train_end = int(0.7 * num_nodes)
val_end = int(0.85 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[perm[:train_end]] = True

val_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask[perm[train_end:val_end]] = True

test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[perm[val_end:]] = True

print("\nCreated train/validation/test masks.")
print(f"Training nodes: {train_mask.sum().item()}")
print(f"Validation nodes: {val_mask.sum().item()}")
print(f"Test nodes: {test_mask.sum().item()}")


# Assemble the Final Data Object
graph_data = Data(
    x=X,
    edge_index=edge_index,
    y=Y,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
    gene_symbols=gene_symbols, # Storing gene names for later interpretation
    loss_weights = loss_weights # so we can access for weighted loss function
)

print(f"\nAssembled final PyG Data object:\n{graph_data}")


# Save the Object to a File
output_file = 'processed_graph.pt'
torch.save(graph_data, output_file)


Loaded 6386 node features.
Loaded 39381 clean edges.

Created mapping for 6386 genes to integer indices.
Feature matrix X created with shape: torch.Size([6386, 8])
Target matrix Y created with shape: torch.Size([6386, 4])
Edge index tensor created with shape: torch.Size([2, 39381])

Created train/validation/test masks.
Training nodes: 4470
Validation nodes: 958
Test nodes: 958

Assembled final PyG Data object:
Data(x=[6386, 8], edge_index=[2, 39381], y=[6386, 4], train_mask=[6386], val_mask=[6386], test_mask=[6386], gene_symbols=[6386], loss_weights=[4])
