3 Layer GCN
layer 1: input size num_features, output size 1st hidden size (try 128), ReLU activation
layer 2: input size 1st hidden size (try 128), output size 2nd hidden size (try 64), ReLU activation
layer 3: input size 2nd hidden size (try 64), output size num_classes (try 64)
layers 4, 5, 6, 7: linear layers for predictions (input size is 64, output size different for each layer)

Why 128 and 64? We want first hidden size to be 4-8x that of num_features, and we want 2nd hidden size to be about half of 1st hidden size

Model is being trained to predict:
1) Avg_StdBeta_weighted_AD (protein assoc w disease progression/severity, for regression)
2) Sig_pos_AD, Sig_neg_AD COMBINED into one binary column (both =1 or 0) (sig pos/neg assoc w disease status, for clasification)
3) Combine Sig_pos_AD, Sig_neg_AD, Avg_StdBeta_weighted_AD to identify protein mechanistic role - driver, mediator, or bystander (classification)

Start w AD for now, then expand to PD/FTD later


In [1]:
# imports
import random
import numpy as np
from torch_geometric.data import Data
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.nn import Linear

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: create the PPI graph

# load in nodes and edges
nodes = pd.read_csv("nodes.csv")
edges = pd.read_csv("edges.csv")

# get edge indices as np array
edge_indices_np = edges[['src', 'dst']].values

# transpose array so it's in correct format for torch_geometric (2 x num_edges)
edge_indices_transposed = edge_indices_np.transpose()

# convert to pytorch tensor
edge_index_tensor = torch.tensor(edge_indices_transposed, dtype = torch.long)

# get edge weights 
edge_weights = torch.tensor(edges['weight'].values, dtype=torch.float)

In [3]:
# visualize graph using NetworkX (TO DO)

In [3]:
# load in GNPC data
data = pd.read_csv("Final_Project_Data.csv", header=1)
data_no_na = data.fillna(0) # convert NAs to 0
uniprot_col = data_no_na.iloc[:, 4]
numerical_data = data_no_na.iloc[:, 8:]
numerical_and_uniprot = pd.concat([uniprot_col, numerical_data], axis=1)
data_no_duplicates = numerical_and_uniprot.groupby('UniProt', as_index=False).mean() # calculate mean of all duplicates (same UniProt value)

# merge nodes and data
proteins = nodes[['index', 'UniProt']]
data_nodes_merge = pd.merge(proteins, data_no_duplicates, on = 'UniProt', how = 'left')
all_data = data_nodes_merge.sort_values(by='index').reset_index(drop=True) # sort by index
all_data.shape

(5505, 69)

In [4]:
# Step 2: Create X and Y

# select columns to be used for X and Y (make sure to scale!)
AD_cols = [col for col in all_data.columns if col.endswith("_AD") or col.startswith("AD")]
y_cols = ["Avg_StdBeta_weighted_AD", "Sig_pos_AD", "Sig_neg_AD"]
X_cols = [col for col in AD_cols if col not in y_cols]
X_np = all_data[X_cols].values # features

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_np) # scale X_np

x = torch.tensor(X_scaled, dtype=torch.float) # convert to tensor(dtype=torch.float)

# Y1: turn Avg_StdBeta_weighted_AD into tensor
beta_np = all_data[y_cols].iloc[:, 0].values # Avg_StdBeta as np array
beta_re = beta_np.reshape(-1, 1) # reshape to 2D for scaling
beta_scaled = scaler.fit_transform(beta_re) # scale beta (being used for regression)
y1 = torch.tensor(beta_scaled, dtype=torch.float) # convert to tensor(dtype=torch.float)

# Y2: combine Sig_pos_AD and Sig_neg_AD and turn into tensor (dtype=long)
sig_cols = all_data[y_cols].iloc[:, 1:]
all_data['combined_sig'] = ((all_data['Sig_pos_AD'] == 1) | (all_data['Sig_neg_AD'] == 1)).astype(int) # if either are 1, value in combined col is 1, else 0
sig_np = all_data['combined_sig'].values
y2 = torch.tensor(sig_np, dtype=torch.long) # convert to tensor(dtype=torch.long)

# Y3: classify proteins as drivers, mediators, and bystanders
    # driver: Sig_pos_AD/Sig_neg_AD = 1 and StdBeta > 1 (class 0)
    # mediator: Sig_pos_AD/Sig_neg_AD = 0 and StdBeta > 1 (class 1)
    # bystanders: StdBeta < 1 (class 2)

all_data['beta_scaled'] = beta_scaled # add scaled avg std beta values to df
all_data['high_beta'] = all_data['beta_scaled'].abs() > 1.0 # 1 is stdev cutoff for Avg_StdBeta bc mean is 0 for scaled data, so 1 is 1 stdev

# define three classes (drivers, mediators, bystanders)
class_defs = [
    # class 0 (driver)
    (all_data['combined_sig']) & (all_data['high_beta']),
    
    # class 1 (mediator)
    (~all_data['combined_sig']) & (all_data['high_beta']),

    # class 2 (bystander)
    (~all_data['high_beta'])
]

possible_classes = [0,1,2]
all_data['mech_role'] = np.select(class_defs, possible_classes) # add col to specify mechanistic role of each protein
beta_sig_np = all_data['mech_role'].values
y3 = torch.tensor(beta_sig_np, dtype=torch.long) # convert to tensor (dtype=torch.long)

In [5]:
# Step 3: Generate torch_geometric.data.data Data object

# create training, validation, and testing masks
N = all_data.shape[0]
split_labels = np.random.choice([0, 1, 2], N, p = [0.6, 0.2, 0.2]) # 0 is train, 1 is val, 2 is test

train_mask = torch.tensor(split_labels==0, dtype=torch.bool)
val_mask = torch.tensor(split_labels==1, dtype=torch.bool)
test_mask = torch.tensor(split_labels==2, dtype=torch.bool)

my_data = Data(x=x, y1=y1, y2=y2, y3=y3, edge_index=edge_index_tensor, edge_weights=edge_weights, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

In [None]:
# 3 layer multi-task GCN: not yet working, not sure why. Single models will probably be easier

class My_GCN(torch.nn.Module):
    
    def __init__(self, n_node_features, n_neuron1, n_neuron2, n_classes1, n_classes2):
        super(My_GCN, self).__init__()
        
        self.conv1 = GCNConv(n_node_features, n_neuron1)
        self.conv2 = GCNConv(n_neuron1, n_neuron2)
        self.conv3 = GCNConv(n_neuron2, n_neuron2)
        self.task1_out = Linear(n_neuron2, 1) # regression, predicting AvgStdBeta
        self.task2_out = Linear(n_neuron2, n_classes1) # binary classification
        self.task3_out = Linear(n_neuron2, n_classes2) # multi classification
        self.task4_out = Linear(n_neuron2, n_node_features) # so network can learn from topology

    def forward(self, data):
        # should we add dropouts?
        
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_weights

        x1 = self.conv1(x, edge_index, edge_weight = edge_weight)
        x2 = F.relu(x1)
        x3 = F.dropout(x2, p=0.5, training=self.training)

        x4 = self.conv2(x3, edge_index, edge_weight = edge_weight)
        x5 = F.relu(x4)
        x6 = F.dropout(x5, p=0.5, training=self.training)

        x7 = self.conv3(x6, edge_index, edge_weight = edge_weight)
        out1 = self.task1_out(x7)
        out2 = self.task2_out(x7)
        out3 = self.task3_out(x7)
        out4 = self.task4_out(x7)
        
        return out1, out2, out3, out4
    
class FitModel():
    
    def __init__(self, my_model, learning_rate: float = 0.01):
        
        self.optimizer = torch.optim.Adam(my_model.parameters(), lr = learning_rate)
        self.model = my_model

    def get_accuracies(self, data, mask, out1, out2, out3):

        # use MAE for task 1
        MAE_task1 = F.l1_loss(out1[mask], data.y1[mask])

        # use sigmoid for task 2 (binary classification)    
        YProb_task2 = torch.sigmoid(out2[mask])
        YPred_task2 = torch.round(YProb_task2) # all values greater than 0.5 get rounded to 1
        YTrue_task2 = data.y2[mask].float().unsqueeze(1) # reshapes to [num_nodes, 1]
        acc_task2 = (YPred_task2 == YTrue_task2).sum().item() / YTrue_task2.shape[0]

        # use argmax for task 3 (multi classification)
        YPred_task3 = out3[mask].argmax(dim=1)
        YTrue_task3 = data.y3[mask]  
        acc_task3 = (YPred_task3 == YTrue_task3).sum().item() / YTrue_task3.shape[0]

        return MAE_task1, acc_task2, acc_task3
        
    def Run(self, data, N_epochs: int = 200):
        
        for n in range(N_epochs):
            
            self.model.train()
            self.optimizer.zero_grad()
            
            out1, out2, out3, out4 = self.model(data)
            loss1 = F.mse_loss(out1[data.train_mask], data.y1[data.train_mask])
            loss2 = F.cross_entropy(out2[data.train_mask], data.y2[data.train_mask]) # reshapes to [num_nodes, 2] matrix and converts to float
            loss3 = F.cross_entropy(out3[data.train_mask], data.y3[data.train_mask])
            loss4 = F.mse_loss(out4[data.train_mask], data.x[data.train_mask])

            total_loss = loss1 + 1.5*loss2 + 3*loss3 + 0.05*loss4 # TO DO: figure out what weights should be (weight*loss) - which tasks should be weighted more?

            total_loss.backward()
            
            self.optimizer.step()

            if n % 10 == 0:
                # validation accuracy
                self.model.eval()
                with torch.no_grad():
                    val_out1, val_out2, val_out3, val_out4 = self.model(data)
                MAE1_val, acc2_val, acc3_val = self.get_accuracies(data, data.val_mask, val_out1, val_out2, val_out3)
                print(f'Epoch: {n}, MAE for task 1: {MAE1_val}, Acc for task 2: {acc2_val}, Acc for task 3: {acc3_val}')
                print(f"Epoch {n}, Loss: {total_loss.item():.4f}")
        
        # testing accuracy
        self.model.eval()
        with torch.no_grad():
            test_out1, test_out2, test_out3, test_out4 = self.model(data)
        MAE1_test, acc2_test, acc3_test = self.get_accuracies(data, data.test_mask, test_out1, test_out2, test_out3)
        print("Testing:")
        print(f'MAE for task 1: {MAE1_test}, Acc for task 2: {acc2_test}, Acc for task 3: {acc3_test}')

        return total_loss.item(), loss1.item(), loss2.item(), loss3.item() 
            
my_model = My_GCN(n_node_features=24, n_neuron1=128, n_neuron2=64, n_classes1=2, n_classes2=3)
My_Fit = FitModel(my_model, 0.0001)
My_Fit.Run(my_data)

Epoch: 0, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 0, Loss: nan
Epoch: 10, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 10, Loss: nan
Epoch: 20, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 20, Loss: nan
Epoch: 30, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 30, Loss: nan
Epoch: 40, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 40, Loss: nan
Epoch: 50, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 50, Loss: nan
Epoch: 60, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 60, Loss: nan
Epoch: 70, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 70, Loss: nan
Epoch: 80, MAE for task 1: nan, Acc for task 2: 0.0, Acc for task 3: 0.06768953068592058
Epoch 80, Loss: nan
Epoch: 90, MAE for ta

(nan, nan, nan, nan)