In [9]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data

In [10]:
# Read the two cleaned CSV tables: `data` (one row per protein, used below for
# features and labels) and `links` (protein-protein links, used below for edges).
data_file = 'data/cleaned/data.csv'
links_file = 'data/cleaned/links.csv'
data, links = (pd.read_csv(csv_path) for csv_path in (data_file, links_file))

Features

In [11]:
from sklearn.preprocessing import StandardScaler

# Every numeric column of `data`, in the order used to lay out the feature matrix.
# Each disease section lists its summary columns first, then StdBeta/p-value pairs.
numeric_cols = [
    # --- AD ---
    'Avg_StdBeta_weighted_AD', 'Meta_p_weighted_AD', 'Meta_pval_FDR_AD', 'Meta_pval_Bonf_AD',
    'Sig_pos_AD', 'Sig_neg_AD', 'max_sites_AD',
    'AD_StdBeta_A', 'AD_p_A',
    'AD_StdBeta_C', 'AD_p_C',
    'AD_StdBeta_D', 'AD_p_D',
    'AD_StdBeta_E', 'AD_p_E',
    'AD_StdBeta_F', 'AD_p_F',
    'AD_StdBeta_I', 'AD_p_I',
    'AD_StdBeta_J', 'AD_p_J',
    'AD_StdBeta_L', 'AD_p_L',
    'AD_StdBeta_G', 'AD_p_G',
    'AD_StdBeta_R', 'AD_p_R',
    # --- PD ---
    'Avg_StdBeta_weighted_PD', 'Meta_p_weighted_PD', 'Meta_pval_FDR_PD', 'Meta_pval_Bonf_PD',
    'Sig_pos_PD', 'Sig_neg_PD', 'max_sites_PD',
    'PD_StdBeta_C', 'PD_p_C',
    'PD_StdBeta_F', 'PD_p_F',
    'PD_StdBeta_J', 'PD_p_J',
    'PD_StdBeta_L', 'PD_p_L',
    'PD_StdBeta_Q', 'PD_p_Q',
    'PD_StdBeta_R', 'PD_p_R',
    'PD_StdBeta_T', 'PD_p_T',
    # --- FTD ---
    'Avg_StdBeta_weighted_FTD', 'Meta_p_weighted_FTD', 'Meta_pval_FDR_FTD', 'Meta_pval_Bonf_FTD',
    'Sig_pos_FTD', 'Sig_neg_FTD', 'max_sites_FTD',
    'FTD_StdBeta_C', 'FTD_p_C',
    'FTD_StdBeta_I', 'FTD_p_I',
    'FTD_StdBeta_N', 'FTD_p_N',
    'FTD_StdBeta_Q', 'FTD_p_Q',
]

# Columns the labels are derived from; they are removed from the feature matrix.
y_labels = [
    'Sig_pos_AD', 'Sig_neg_AD', 'Avg_StdBeta_weighted_AD', 'max_sites_AD',
    'Sig_pos_PD', 'Sig_neg_PD', 'Avg_StdBeta_weighted_PD', 'max_sites_PD',
    'Sig_pos_FTD', 'Sig_neg_FTD', 'Avg_StdBeta_weighted_FTD', 'max_sites_FTD',
]

# x: standardized feature matrix (numeric columns minus the label columns), as a NumPy array.
feature_frame = data[numeric_cols].drop(columns=y_labels)
x = StandardScaler().fit_transform(feature_frame)

# y: standardized label columns, kept as a DataFrame with the original column names.
scaled_targets = StandardScaler().fit_transform(data[y_labels])
y = pd.DataFrame(scaled_targets, columns=y_labels)

Task A: Disease significance (Classification)

In [12]:
# Task A labels, one integer Series per disease:
#   1 -> Sig_pos_<disease> is strictly greater than Sig_neg_<disease>
#   0 -> otherwise (ties included)
sig_AD = data['Sig_pos_AD'].gt(data['Sig_neg_AD']).astype(int)
sig_PD = data['Sig_pos_PD'].gt(data['Sig_neg_PD']).astype(int)
sig_FTD = data['Sig_pos_FTD'].gt(data['Sig_neg_FTD']).astype(int)

y_sig = dict(AD=sig_AD, PD=sig_PD, FTD=sig_FTD)

Task B: Mechanistic role (Classification)

In [13]:
# Task B labels: every protein gets exactly one of three roles per disease.
#   0 = driver    : strong effect AND sig_<disease> == 1
#   1 = mediator  : strong effect AND sig_<disease> == 0
#   2 = bystander : no strong effect
# "Strong effect" means the z-scored Avg_StdBeta_weighted_<disease> value has an
# absolute value above ROLE_EFFECT_THRESHOLD (NaN compares as False -> bystander).
ROLE_EFFECT_THRESHOLD = 1.0


def strong_effect_mask(column, threshold=ROLE_EFFECT_THRESHOLD):
    """Return an (n_rows, 1) boolean array: True where |z-score| of `data[column]` > `threshold`."""
    z_scores = StandardScaler().fit_transform(data[[column]])
    return np.abs(z_scores) > threshold


def assign_roles(sig, strong_effect):
    """Combine a 0/1 significance Series and a strong-effect mask into role codes.

    Parameters
    ----------
    sig : pandas.Series of 0/1 integers (1 = positive direction).
    strong_effect : boolean array with one entry per row of `sig` (any shape that
        flattens to that length).

    Returns
    -------
    pandas.Series of int role codes (0 driver, 1 mediator, 2 bystander) indexed like `sig`.
    """
    # Convert to real booleans first: `~` on an integer Series is a bitwise NOT
    # (1 -> -2, 0 -> -1), not a logical negation, so the previous `~sig & mask`
    # relied on integer bit arithmetic to give the intended result.
    is_positive = sig.to_numpy().astype(bool)
    is_strong = np.asarray(strong_effect, dtype=bool).ravel()
    codes = np.select(
        [is_strong & is_positive, is_strong & ~is_positive],
        [0, 1],
        default=2,
    )
    return pd.Series(codes, index=sig.index).astype(int)


beta_AD = strong_effect_mask('Avg_StdBeta_weighted_AD')
beta_PD = strong_effect_mask('Avg_StdBeta_weighted_PD')
beta_FTD = strong_effect_mask('Avg_StdBeta_weighted_FTD')

AD_roles = assign_roles(sig_AD, beta_AD)
PD_roles = assign_roles(sig_PD, beta_PD)
FTD_roles = assign_roles(sig_FTD, beta_FTD)
y_roles = {'AD':AD_roles, 'PD':PD_roles, 'FTD':FTD_roles}

Task C: Disease-specific abundance (Regression)

In [14]:
# Task C regression targets: the continuous, z-scored Avg_StdBeta_weighted_<disease>
# column, one (n_rows, 1) float array per disease.
# The boolean |z| > 1 masks (beta_AD / beta_PD / beta_FTD) are thresholds for the
# role labels, not abundance values, so they must not be used as regression targets.
# NOTE(review): StandardScaler passes NaN through; confirm the cleaned data has no
# missing Avg_StdBeta_weighted_* values, otherwise these targets will contain NaN.
y_abundance = {'AD': StandardScaler().fit_transform(data[['Avg_StdBeta_weighted_AD']]),
               'PD': StandardScaler().fit_transform(data[['Avg_StdBeta_weighted_PD']]),
               'FTD': StandardScaler().fit_transform(data[['Avg_StdBeta_weighted_FTD']])}

In [15]:
# Create edge data
# A protein's node id is its row position in `data`, so node ids line up with the
# rows of the feature matrix and of every label vector.
unique_proteins = data['UniProt']
uniprot_to_node = dict(zip(unique_proteins, range(len(unique_proteins))))

# Drop self-loops, then derive sources, destinations and weights from the SAME
# filtered table so edge_index and edge_weight stay aligned.
# NOTE: this changes the edge count if `links` contains self-loops.
links_no_self_loops = links.query('protein1 != protein2')
src_ids = links_no_self_loops['protein1'].map(uniprot_to_node)
dst_ids = links_no_self_loops['protein2'].map(uniprot_to_node)

# Fail with a clear message instead of an opaque NaN-to-int cast error.
unmapped = src_ids.isna() | dst_ids.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} link(s) reference a protein that is missing from data['UniProt']"
    )

src_nodes = src_ids.astype(int).values
dst_nodes = dst_ids.astype(int).values
edges = list(zip(src_nodes, dst_nodes))
# Scale scores by the largest remaining score so the maximum weight is 1.
edge_weight = links_no_self_loops['combined_score'] / links_no_self_loops['combined_score'].max()

# Combine labels
y = {}
for disease in ['AD', 'PD', 'FTD']:
    y[disease] = {'sig': torch.tensor(y_sig[disease].to_numpy().reshape(-1,1), dtype=torch.int),
                  'role': torch.tensor(y_roles[disease].to_numpy().reshape(-1,1), dtype=torch.int),
                  'abundance': torch.tensor(y_abundance[disease].reshape(-1,1), dtype=torch.float)}

x = torch.tensor(x, dtype=torch.float)
# edge_index must be [2, num_edges] with row 0 = sources and row 1 = destinations.
# Stack the two arrays directly: reshaping an (E, 2) pair array to (2, -1) does NOT
# transpose it and would scramble the source/destination pairing.
# torch.long is the index dtype documented for PyG's edge_index.
edge_index = torch.tensor(np.stack([src_nodes, dst_nodes]), dtype=torch.long)
edge_weight = torch.tensor(edge_weight.values, dtype=torch.float)
graph = Data(x=x, y=y, edge_index=edge_index, edge_weight=edge_weight)

# Save graph data
torch.save(graph, 'data/directed_graph.pt')

In [16]:
# Show a summary of the assembled Data object (tensor shapes per attribute).
graph

Data(
  x=[6383, 51],
  edge_index=[2, 2566974],
  y={
    AD={
      sig=[6383, 1],
      role=[6383, 1],
      abundance=[6383, 1],
    },
    PD={
      sig=[6383, 1],
      role=[6383, 1],
      abundance=[6383, 1],
    },
    FTD={
      sig=[6383, 1],
      role=[6383, 1],
      abundance=[6383, 1],
    },
  },
  edge_weight=[2566974]
)