## GATOmics Reproduction Test
### This notebook reproduces the performance of the GATOmics model using pre-trained weights.

#### Import libraries and configure settings

In [None]:
import torch
import numpy as np
import pandas as pd
import scipy.sparse as sp
import os
import pickle
from sklearn import metrics, linear_model
from torch_geometric.utils import remove_self_loops, add_self_loops

from model_all import Net

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Seed every random number generator used in this notebook.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
if device.type == 'cuda':
    # `device` is 'cuda' exactly when CUDA is available, so this mirrors that check.
    torch.cuda.manual_seed(seed)

Using device: cuda


In [8]:
def load_data(path):
    """Load the networks, node features, edges and labels consumed by the model.

    Args:
        path: Directory containing ``PP.adj.npz``, ``PO.adj.npz``,
            ``O.adj_loop.npz``, ``O.N_all.npz``, ``P.feat-final.csv``,
            ``O.feat-final.csv``, ``PP_pos.txt`` and ``label_file.txt``.
            A trailing path separator is optional.

    Returns:
        A 7-tuple ``(network1, network2, l_feature, r_feature, pos_edge,
        pos_edge1, Y)`` where

        * ``network1`` is a list of four CSC sparse matrices
          (``PP``, ``PO``, ``go``, ``exp``),
        * ``network2`` is a list of two CSC sparse matrices
          (``O.adj_loop``, ``O.N_all``),
        * ``l_feature`` is a list of four feature tensors on ``device``,
        * ``r_feature`` is a list of three feature tensors on ``device``,
        * ``pos_edge`` is a long tensor built from the transposed contents
          of ``PP_pos.txt`` (left on the CPU),
        * ``pos_edge1`` is ``pos_edge`` with its self loops removed and then
          re-added through ``add_self_loops``,
        * ``Y`` is a float tensor of labels on ``device`` with a trailing
          dimension of size 1.
    """

    def _in_dir(name):
        # os.path.join tolerates `path` with or without a trailing separator.
        return os.path.join(path, name)

    def _read_features(name):
        # The first CSV column is dropped; the remaining columns are the features.
        return pd.read_csv(_in_dir(name), sep=",").values[:, 1:]

    def _as_tensor(values):
        # A fresh tensor per call, so no two list entries share storage.
        return torch.Tensor(values).to(device)

    # load network
    network1 = [
        sp.load_npz(_in_dir("PP.adj.npz")).tocsc(),      # gene-gene network
        sp.load_npz(_in_dir("PO.adj.npz")).tocsc(),      # gene-outlying gene network
        # NOTE: these two files are read from a fixed location, independent of `path`.
        sp.load_npz("./Data/go.npz").tocsc(),
        sp.load_npz("./Data/exp.npz").tocsc(),
    ]

    network2 = [
        sp.load_npz(_in_dir("O.adj_loop.npz")).tocsc(),
        sp.load_npz(_in_dir("O.N_all.npz")).tocsc(),
    ]

    # load node features: each CSV is parsed once and reused for every slot
    # that is filled from it.
    gene_values = _read_features("P.feat-final.csv")          # gene
    outlying_values = _read_features("O.feat-final.csv")      # outlying gene

    l_feature = [_as_tensor(gene_values) for _ in range(4)]

    r_feature = [
        _as_tensor(gene_values),          # gene
        _as_tensor(outlying_values),      # outlying gene
        # NOTE(review): the original comment labels this slot "miRNA", but it is
        # filled from the gene feature file P.feat-final.csv -- confirm intent.
        _as_tensor(gene_values),
    ]

    # load edge
    pos_edge = np.array(np.loadtxt(_in_dir("PP_pos.txt")).transpose())
    pos_edge = torch.from_numpy(pos_edge).long()

    # Strip any existing self loops first, then let add_self_loops add them back.
    pb, _ = remove_self_loops(pos_edge)
    pos_edge1, _ = add_self_loops(pb)

    label = np.loadtxt(_in_dir("label_file.txt"))
    Y = torch.tensor(label).type(torch.FloatTensor).to(device).unsqueeze(1)

    return network1, network2, l_feature, r_feature, pos_edge, pos_edge1, Y

def LR(train_x, train_y, test_x):
    """Fit a logistic regression on the training set and score the test set.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels; flattened with ``ravel()`` before fitting.
        test_x: Feature matrix to score.

    Returns:
        Column 1 of ``predict_proba(test_x)``, one value per row of ``test_x``.
    """
    classifier = linear_model.LogisticRegression(max_iter=10000)
    classifier.fit(train_x, train_y.ravel())
    # Keep only the second probability column.
    return classifier.predict_proba(test_x)[:, 1]

#### Setting up and loading data

In [9]:
# Configuration: which cross-validation fold to evaluate and where the
# checkpoint and dataset live.
FOLD_TO_LOAD = 2
MODEL_PATH = "./Data/GATOmics.pt"
DATA_PATH = "./Data/pan-cancer/"

print(">>> Loading Data...")

network1, network2, l_feature, r_feature, pos_edge, pos_edge1, Y = load_data(DATA_PATH)

# Each line of the gene file is comma-separated; the second field is kept as
# the gene name. Blank lines are skipped instead of raising IndexError.
gene_file = "./Data/gene_names.txt"
with open(gene_file, "r") as f:
    gene_names = [line.strip().split(',')[1] for line in f if line.strip()]

print("Data Loaded Successfully.")

>>> Loading Data...
Data Loaded Successfully.


#### Load fold masks

In [None]:
print(">>> Loading Masks ...")

# Locate the train/test mask files of the selected fold.
fold_path = f'./Data/10fold_index_mapped/fold_{FOLD_TO_LOAD}'
train_mask_file = os.path.join(fold_path, 'train_mask.txt')
test_mask_file = os.path.join(fold_path, 'test_mask.txt')

# Masks are read as numbers and cast to booleans (non-zero -> True).
train_mask = np.loadtxt(train_mask_file).astype(bool)
test_mask = np.loadtxt(test_mask_file).astype(bool)

# Both masks must have one entry per label row.
assert len(Y) == len(train_mask), "Train_mask size is different from Y."
assert len(Y) == len(test_mask), "Test_mask size is different from Y."

# Show which label values occur in the test split.
print(np.unique(Y[test_mask].cpu().numpy()))

print("Fold Masks Created.")

>>> Loading Masks ...
[0. 1.]
Fold Masks Created.


#### Initialize model and load weights

In [11]:
print(">>> Initializing Model...")

# Build the network from the loaded features, networks and edges, then move it
# to the selected device.
model = Net(
    l_feature, r_feature, network1, network2,
    1, 64, 256, 128,
    pos_edge, pos_edge1,
).to(device)

# Fail loudly when the checkpoint is missing.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Checkpoint not found: {MODEL_PATH}")

# map_location lets the checkpoint load on whichever device was selected.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
print(f"Successfully loaded weights from {MODEL_PATH}")

>>> Initializing Model...
Successfully loaded weights from ./Data/GATOmics.pt


#### Inference and Performance Evaluation

In [14]:
print(">>> Running Inference...")

# Forward pass only: evaluation mode, no gradient tracking.
model.eval()
with torch.no_grad():
    # The model returns four values; only the last one is used below.
    _, _, _, x = model()

    # Sigmoid of the model output for the train/test rows becomes the
    # input of the logistic regression.
    train_x = torch.sigmoid(x[train_mask]).detach().cpu().numpy()
    test_x = torch.sigmoid(x[test_mask]).detach().cpu().numpy()

# Matching labels, flattened to 1-D arrays.
train_y = Y[train_mask].cpu().numpy().ravel()
Yn = Y[test_mask].cpu().numpy().ravel()

print("Training Logistic Regression on samples...")

pred = LR(train_x, train_y, test_x)

# AUROC, area under the precision-recall curve, and F1 at a 0.5 cut-off.
auc = metrics.roc_auc_score(Yn, pred)
precision, recall, _ = metrics.precision_recall_curve(Yn, pred)
auprc = metrics.auc(recall, precision)
f1 = metrics.f1_score(Yn, (pred >= 0.5).astype(int))

print(
    "\n" + "="*40,
    "GATOmics Reproduction Results",
    "="*40,
    f"AUROC    : {auc:.4f}",
    f"AUPRC    : {auprc:.4f}",
    f"F1-Score : {f1:.4f}",
    "="*40,
    sep="\n",
)

>>> Running Inference...
Training Logistic Regression on samples...

GATOmics Reproduction Results
AUROC    : 0.8898
AUPRC    : 0.8378
F1-Score : 0.6970
