3 Layer GCN
layer 1: input size num_features, output size 1st hidden size (try 128), ReLU activation
layer 2: input size 1st hidden size (try 128), output size 2nd hidden size (try 64), ReLU activation
layer 3: input size 2nd hidden size (try 64), output size = node embedding size (try 64; this is the input to the linear prediction layers below, so it is not num_classes)
layers 4, 5, 6, 7: linear layers for predictions (input size is 64, output size different for each layer)

Model is being trained to predict:
1) Avg_StdBeta_weighted_AD (protein assoc w disease progression/severity, for regression)
2) Sig_pos_AD and Sig_neg_AD COMBINED into one binary column (1 if either is 1, else 0) (sig pos/neg assoc w disease status, for classification)
3) Combine Sig_pos_AD, Sig_neg_AD, Avg_StdBeta_weighted_AD to identify protein mechanistic role (classification)

Start w AD for now, then expand to PD/FTD later

In [2]:
# imports
import random
import numpy as np
from torch_geometric.data import Data
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Step 1: build the protein-protein interaction (PPI) graph

# read the node table and the edge table from disk
nodes = pd.read_csv("nodes.csv")
edges = pd.read_csv("edges.csv")

# pull the 'src' and 'dst' columns out as one (num_edges x 2) numpy array
edge_indices_np = edges[['src', 'dst']].to_numpy()

# torch_geometric wants edge indices laid out as (2 x num_edges), so swap the axes
edge_indices_transposed = edge_indices_np.T

# copy into a pytorch tensor of dtype long
edge_index_tensor = torch.tensor(edge_indices_transposed, dtype=torch.long)


In [None]:
# visualize graph using NetworkX (TO DO)

In [15]:
# load in GNPC data
data = pd.read_csv("Final_Project_Data.csv", header=1)  # header=1: column names are read from the second row
data_no_na = data.fillna(0) # convert NAs to 0
# NOTE(review): fillna(0) is applied to every column, so a missing UniProt ID
# would become the key 0 and form its own group below - confirm none are missing.
uniprot_col = data_no_na.iloc[:, 4]  # this column must be named 'UniProt' (it is grouped on by name below)
numerical_data = data_no_na.iloc[:, 8:]
numerical_and_uniprot = pd.concat([uniprot_col, numerical_data], axis=1)
data_no_duplicates = numerical_and_uniprot.groupby('UniProt', as_index=False).mean() # calculate mean of all duplicates (same UniProt value)

# merge nodes and data
proteins = nodes[['index', 'UniProt']]
# left merge keeps every graph node, including nodes that have no GNPC row
data_nodes_merge = pd.merge(proteins, data_no_duplicates, on = 'UniProt', how = 'left')
all_data = data_nodes_merge.sort_values(by='index').reset_index(drop=True) # sort by index

# The left merge re-introduces NaN for nodes without a GNPC row. StandardScaler
# passes NaN through, so those values would end up in the feature/target tensors.
# Fill them with 0, the same policy used for NAs in the raw table above.
value_cols = [col for col in all_data.columns if col not in ('index', 'UniProt')]
n_unmatched = int(all_data[value_cols].isna().all(axis=1).sum())
all_data[value_cols] = all_data[value_cols].fillna(0)
print(f"{n_unmatched} of {len(all_data)} nodes had no GNPC data (values filled with 0)")

all_data.shape

(5505, 69)

In [None]:
# Step 2: Create X and Y

# select columns to be used for X and Y (make sure to scale!)
AD_cols = [col for col in all_data.columns if col.endswith("_AD") or col.startswith("AD")]
y_cols = ["Avg_StdBeta_weighted_AD", "Sig_pos_AD", "Sig_neg_AD"]
X_cols = [col for col in AD_cols if col not in y_cols]  # every AD column that is not a target
X_np = all_data[X_cols].values # features

# NOTE(review): both scalers below are fit on all nodes, before any train/test
# split, so held-out nodes influence the scaling statistics - confirm this is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_np) # scale X_np

x = torch.tensor(X_scaled, dtype=torch.float) # convert to tensor(dtype=torch.float)

# Y1: turn Avg_StdBeta_weighted_AD into tensor
beta_np = all_data[y_cols[0]].values # Avg_StdBeta as np array
beta_re = beta_np.reshape(-1, 1) # reshape to 2D for scaling
# Use a dedicated scaler for the target: refitting `scaler` here would overwrite
# the feature-scaling parameters and make them unrecoverable later.
beta_scaler = StandardScaler()
beta_scaled = beta_scaler.fit_transform(beta_re) # scale beta (being used for regression)
y1 = torch.tensor(beta_scaled, dtype=torch.float) # convert to tensor(dtype=torch.float), shape (N, 1)

# Y2: combine Sig_pos_AD and Sig_neg_AD and turn into tensor (dtype=long)
all_data['combined_sig'] = ((all_data['Sig_pos_AD'] == 1) | (all_data['Sig_neg_AD'] == 1)).astype(int) # if either are 1, value in combined col is 1, else 0
sig_np = all_data['combined_sig'].values
y2 = torch.tensor(sig_np, dtype=torch.long) # convert to tensor(dtype=torch.long)

# Y3: classify proteins as drivers, mediators, and bystanders
    # driver: Sig_pos_AD or Sig_neg_AD = 1 and |scaled StdBeta| > 1 (class 0)
    # mediator: Sig_pos_AD and Sig_neg_AD both != 1 and |scaled StdBeta| > 1 (class 1)
    # bystanders: |scaled StdBeta| <= 1 (class 2)

all_data['beta_scaled'] = beta_scaled.ravel() # add scaled avg std beta values to df (flattened to 1D for the column)
all_data['high_beta'] = all_data['beta_scaled'].abs() > 1.0 # 1 is stdev cutoff for Avg_StdBeta bc mean is 0 for scaled data, so 1 is 1 stdev

# Work with real boolean masks: `~` on the 0/1 integer column is a bitwise NOT
# (giving -1 / -2), which only produces the intended result by accident.
is_sig = all_data['combined_sig'].astype(bool)
high_beta = all_data['high_beta']

# define three classes (drivers, mediators, bystanders)
class_defs = [
    # class 0 (driver)
    is_sig & high_beta,

    # class 1 (mediator)
    ~is_sig & high_beta,

    # class 2 (bystander)
    ~high_beta,
]

possible_classes = [0,1,2]
all_data['mech_role'] = np.select(class_defs, possible_classes) # add col to specify mechanistic role of each protein
beta_sig_np = all_data['mech_role'].values
y3 = torch.tensor(beta_sig_np, dtype=torch.long) # convert to tensor (dtype=torch.long)

In [None]:
# Step 3: Generate torch_geometric.data.data Data object (IN PROGRESS) - this is from Graph_III.ipynb

# create training mask (seeded so the split is the same on every run)
SPLIT_SEED = 42
N = all_data.shape[0]
rng = np.random.default_rng(SPLIT_SEED)
# each node is independently assigned to training with probability 0.3
training_mask = torch.tensor(rng.choice([True, False], N, p = [0.3, 0.7]), dtype=torch.bool)
# also need validation and test masks?

# `y` and `edge_index` were never defined in this notebook; the tensors built in
# the earlier cells are y1 / y2 / y3 and edge_index_tensor.
# y must hold integer class labels because training uses nll_loss, so the binary
# significance target (y2) is used here; swap in y3 to train on mechanistic role.
# The other targets are attached under their own names so they stay available.
my_data = Data(
    x = x,
    y = y2,
    y_beta = y1,
    y_role = y3,
    training_mask = training_mask,
    edge_index = edge_index_tensor,
) # edge_attr = edge_weight
print(my_data)

In [None]:
# 3 layer GCN with softmax layer for classification (IN PROGRESS)- this is from Graph_III.ipynb

class My_GCN(torch.nn.Module):
    """Graph convolutional network that outputs per-node log-probabilities.

    NOTE: this class currently stacks two GCNConv layers (conv1 -> ReLU -> conv2);
    the 3-layer architecture with linear prediction layers described in the
    notebook outline is not implemented here yet.
    """

    def __init__(self, n_node_features: int, n_neuron: int, n_classes: int):
        """Create the two convolution layers.

        n_node_features: input size of the first layer
        n_neuron: output size of the first layer / input size of the second
        n_classes: output size of the second layer
        """
        super().__init__()

        self.conv1 = GCNConv(n_node_features, n_neuron)
        self.conv2 = GCNConv(n_neuron, n_classes)

    def forward(self, x, edge_index, edge_weight=None):
        """Run both layers and return log_softmax over dim 1 of the final output.

        edge_weight is optional and defaults to None so the model can be called
        on graphs that carry no edge weights; it is forwarded unchanged to both
        GCNConv layers.
        """
        x1 = self.conv1(x, edge_index, edge_weight = edge_weight)
        x2 = F.relu(x1)
        x3 = self.conv2(x2, edge_index, edge_weight = edge_weight)

        # keep the intermediate activations of the latest call for inspection
        self.x1 = x1
        self.x2 = x2
        self.x3 = x3

        return F.log_softmax(x3, dim = 1)
    
class FitModel():
    """Trains a node-classification model with Adam and negative log-likelihood loss."""

    def __init__(self, my_model, learning_rate: float = 0.01):
        """Store the model and build an Adam optimizer over its parameters."""
        self.optimizer = torch.optim.Adam(my_model.parameters(), lr = learning_rate)
        self.model = my_model

    def Run(self, data, N_epochs: int = 200):
        """Train for N_epochs full-graph steps and store the final output in self.out.

        data must provide x, edge_index, edge_attr, y and training_mask. The loss
        is computed on the nodes selected by training_mask only; the accuracy
        printed every 10 epochs is computed over all nodes in data.y.

        After training, self.out holds the model output recomputed with the
        final weights, in eval mode and without gradient tracking. This also
        makes N_epochs=0 valid (self.out is then the untrained model's output).
        The model is left in eval mode when this method returns.
        """
        n_nodes = len(data.y)
        mask = data.training_mask

        for n in range(N_epochs):

            self.model.train()
            self.optimizer.zero_grad()

            out = self.model(data.x, data.edge_index, data.edge_attr)
            loss = F.nll_loss(out[mask], data.y[mask])
            loss.backward()

            self.optimizer.step()

            # accuracy is only needed on the epochs that are reported
            if n % 10 == 0:
                with torch.no_grad():
                    acc = (out.argmax(dim=1) == data.y).sum()/n_nodes
                print(f'epoch: {n:>3} | loss: {loss:.2f} | accuracy: {acc*100:.2f}%' )

        # The `out` from the loop was computed before the last optimizer step and
        # still references the autograd graph, so run one clean forward pass instead.
        self.model.eval()
        with torch.no_grad():
            self.out = self.model(data.x, data.edge_index, data.edge_attr)

# `my_model` was never created in this notebook; build it from the data object.
# Hidden size 128 follows the "try 128" note in the outline; the number of
# classes is taken from the largest label present in my_data.y.
my_model = My_GCN(
    n_node_features=my_data.x.shape[1],
    n_neuron=128,
    n_classes=int(my_data.y.max()) + 1,
)

My_Fit = FitModel(my_model, 0.0001)
My_Fit.Run(my_data)

# the model returns log-probabilities (log softmax), so exponentiate to get probabilities;
# torch.exp keeps the result a tensor instead of round-tripping through numpy
Probs = torch.exp(My_Fit.out.detach())
Y_pred = Probs.argmax(dim=1)

# NOTE: this accuracy is over all nodes, including the ones used for training
acc    = (Y_pred == my_data.y).sum()/len(my_data.y)
print(acc)