## DISHyper Reproduction Test
### This notebook demonstrates how to reproduce the DISHyper model's performance using pre-trained weights.

#### Importing libraries and functions

In [1]:
import torch
import numpy as np
import pandas as pd
import random
import os

from reproduction_utils import (
    DISHyperNet, 
    processingIncidenceMatrix, 
    getData, 
    _generate_G_from_H_weight, 
    cal_metrics,
    load_fold_data,
    get_train_test_indices_from_gene_names
)

# Probe CUDA once and reuse the answer for both device selection and seeding.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"Using device: {device}")

# Seed every RNG used by the notebook (Python, NumPy, PyTorch CPU) with the same value.
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

# Seed the CUDA generators as well when a GPU is present.
if use_cuda:
    for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        cuda_seed_fn(seed)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


#### Setting up environment and loading data

In [5]:
class Args:
    # Hyperparameter container. n_hid and dropout are read when the model is
    # built; the optimiser-related fields (lr, weight_decay, epochs) look unused
    # for pure inference -- TODO confirm.
    lr = 5e-3
    dropout = 0.2
    weight_decay = 5e-6
    epochs = 300
    n_hid = 256


args = Args()

# ---- Paths and run configuration -------------------------------------------
dataPath = '../../Data/STRING'
FOLD_TO_LOAD = 8
MODEL_PATH = f"./Data/DISHyper.pt"

positiveGenePath = f'{dataPath}/dataset/pan-cancer/715true.txt'
negativeGenePath = f'{dataPath}/dataset/pan-cancer/1231false.txt'
feature_genename_file = f'{dataPath}/feature_genename.txt'

print(">>> Loading Basic Data...")

# 1. Gene list: first column of a header-less CSV.
gene_table = pd.read_csv(r'../../Data/msigdb/geneList.csv', header=None)
geneList = list(gene_table[0].values)

# 2. Incidence matrix built by the project helper from the gene list and data directory.
incidenceMatrix = processingIncidenceMatrix(geneList, dataPath)

# 3. Labels: gene names come from the first column of the feature gene-name file.
feature_gene_table = pd.read_csv(feature_genename_file, header=None)
filtered_geneList = feature_gene_table.iloc[:, 0].tolist()
sampleIndex, label, labelFrame = getData(
    positiveGenePath,
    negativeGenePath,
    filtered_geneList,
)

print("Data loaded successfully.")

>>> Loading Basic Data...
Original geneList size: 17442 → Filtered size: 10251
Final incidenceMatrix shape: (10251, 20647)
Data loaded successfully.


#### Load fold data

In [11]:
print(f">>> Loading Indices for Fold {FOLD_TO_LOAD} using Exact Training Logic...")

fold_dir = f'{dataPath}/10fold'
fold_path = os.path.join(fold_dir, f'fold_{FOLD_TO_LOAD}')

# load_fold_data returns seven items for the fold; only the train and test
# indices are consumed by the mapping step below.
fold_data = load_fold_data(fold_path)
(
    train_idx,
    valid_idx,
    test_idx,
    train_mask,
    valid_mask,
    test_mask,
    labels,
) = fold_data

# Call get_train_test_indices_from_gene_names (a utils helper). It is given the
# feature gene-name file, sampleIndex and labelFrame, and returns the indices
# that are ultimately fed to the model.
feature_genename_file = f'{dataPath}/feature_genename.txt'

trainIndex, testIndex = get_train_test_indices_from_gene_names(
    feature_genename_file,
    train_idx,
    test_idx,
    sampleIndex,
    labelFrame,
)

print(f" - Mapped Train Size: {len(trainIndex)}")
print(f" - Mapped Test Size : {len(testIndex)}")

>>> Loading Indices for Fold 8 using Exact Training Logic...
 - Mapped Train Size: 1752
 - Mapped Test Size : 97


#### Graph reconstruction

In [7]:
print(">>> Reconstructing Hypergraph based on Training Data...")

# 1. Positive training genes that are also rows of the incidence matrix.
trainFrame = labelFrame.iloc[trainIndex]
trainPositiveGene = set(trainFrame.where(trainFrame == 1).dropna().index)
train_positive_common_genes = trainPositiveGene.intersection(incidenceMatrix.index)

# 2. Per-hyperedge count of positive training genes.
# pandas >= 2.0 rejects a set as a .loc indexer, so a list is passed instead;
# the column sums do not depend on the row order.
positiveMatrixSum = incidenceMatrix.loc[list(train_positive_common_genes)].sum()

# Disease-specific hyperedge selection: keep hyperedges containing at least
# 3 positive training genes. The indices are column POSITIONS.
selHyperedgeIndex = np.where(positiveMatrixSum >= 3)[0]
if selHyperedgeIndex.size == 0:
    raise ValueError(
        "No hyperedge contains at least 3 positive training genes; "
        "cannot build the hypergraph."
    )
selHyperedge = incidenceMatrix.iloc[:, selHyperedgeIndex]

# Hyperedge weight = positive-gene count / total member count of the hyperedge.
# .iloc keeps this positional, consistent with the column selection above
# (plain [] with integer keys relies on a deprecated positional fallback).
hyperedgeWeight = positiveMatrixSum.iloc[selHyperedgeIndex].values
selHyperedgeWeightSum = selHyperedge.values.sum(0)
hyperedgeWeight = hyperedgeWeight / selHyperedgeWeightSum

# 3. Incidence matrix (H) and weighted vertex degree (DV).
H = np.array(selHyperedge).astype('float')
DV = np.sum(H * hyperedgeWeight, axis=1)

# Isolated node handling: a node with zero weighted degree gets a tiny
# membership in one randomly chosen hyperedge. The choice comes from the
# global `random` state, so the result depends on the seed and on how many
# draws happened before this cell.
for i in np.flatnonzero(DV == 0):
    t = random.randint(0, H.shape[1] - 1)
    H[i, t] = 0.0001

# 4. Adjacency matrix (G) computed by the project helper from H and the weights.
G = _generate_G_from_H_weight(H, hyperedgeWeight)
N = H.shape[0]

adj = torch.Tensor(G).float().to(device)
# Identity matrix as node features: one indicator column per node.
features = torch.eye(N).float().to(device)
theLabels = torch.from_numpy(labelFrame.values.reshape(-1,)).long().to(device)

print(f"Graph Constructed. Nodes: {N}")

>>> Reconstructing Hypergraph based on Training Data...
Graph Constructed. Nodes: 10251


#### Inference and Performance Evaluation

In [10]:
print(">>> Running Inference...")

# Build the network with one input channel per graph node and two output classes.
model = DISHyperNet(in_ch=N, n_hid=args.n_hid, n_class=2, dropout=args.dropout).to(device)

# Guard clause: stop immediately when the pre-trained checkpoint is absent.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Checkpoint not found: {MODEL_PATH}")
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

# Evaluation mode, no gradient tracking: forward pass plus metrics on the test rows only.
model.eval()
with torch.no_grad():
    output = model(features, adj)

    test_output = output[testIndex]
    test_labels = theLabels[testIndex]

    auc, auprc, f1 = cal_metrics(test_output, test_labels)

# Report the three metrics between two 40-character rules.
banner = "=" * 40
print("\n" + banner)
print(f"DISHyper Reproduction Results")
print(banner)
print(f"AUROC    : {auc:.4f}")
print(f"AUPRC    : {auprc:.4f}")
print(f"F1-Score : {f1:.4f}")
print(banner)

>>> Running Inference...

DISHyper Reproduction Results
AUROC    : 0.9161
AUPRC    : 0.8806
F1-Score : 0.8312
