In [1]:
import torch
import numpy as np
import pandas as pd
import math
import random
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HGTConv, Linear
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
import os
import shutil

ImportError: cannot import name 'Tensor' from 'torch' (unknown location)

In [2]:
def seed_everything(seed = 42):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Random seed set as {seed}")

seed_everything(42)
device = 'cpu'

Random seed set as 42


In [3]:
config = dict()
config["lr"] = 0.001
config["weight_decay"] = 5e-3
config["epochs"] = 200
config['train_ratio'] = 0.8
config['val_ratio'] = 0.1
config['test_ratio'] = 0.1
config['hidden_channels'] = 64
config['num_heads'] = 4
config['num_layers'] = 2
config

{'lr': 0.001,
 'weight_decay': 0.005,
 'epochs': 200,
 'train_ratio': 0.8,
 'val_ratio': 0.1,
 'test_ratio': 0.1,
 'hidden_channels': 64,
 'num_heads': 4,
 'num_layers': 2}

In [4]:
patient_features = torch.tensor(np.load('data/Processed/patient_features.npy')).to(torch.float32)
procedure_features = torch.tensor(np.load('data/Processed/procedure_features.npy')).to(torch.float32)
medication_features = torch.tensor(np.load('data/Processed/medication_features.npy')).to(torch.float32)
lab_features = torch.tensor(np.load('data/Processed/lab_features.npy')).to(torch.float32)

patient_edges = torch.tensor(np.load('data/Processed/patient_edges.npy'))
procedure_edges = torch.tensor(np.load('data/Processed/procedures_edges.npy'))
medication_edges = torch.tensor(np.load('data/Processed/medication_edges.npy'))
lab_edges = torch.tensor(np.load('data/Processed/lab_edges.npy'))

In [5]:
num_patient_nodes = len(patient_features)
num_train_nodes = int(config['train_ratio'] * num_patient_nodes)
num_val_nodes = int(config['val_ratio'] * num_patient_nodes)
num_test_nodes = num_patient_nodes - num_train_nodes - num_val_nodes

# patient_nodes_indices = torch.arange(num_patient_nodes)
# shuffled_indices = torch.randperm(num_patient_nodes)

patient_nodes_indices = np.arange(num_patient_nodes)
np.random.shuffle(patient_nodes_indices)
shuffled_indices = torch.tensor(patient_nodes_indices)

train_indices = shuffled_indices[:num_train_nodes]
val_indices = shuffled_indices[num_train_nodes:num_train_nodes + num_val_nodes]
test_indices = shuffled_indices[num_train_nodes + num_val_nodes:]

train_mask = torch.zeros(num_patient_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_patient_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_patient_nodes, dtype=torch.bool)

train_mask[patient_nodes_indices[train_indices]] = True
val_mask[patient_nodes_indices[val_indices]] = True
test_mask[patient_nodes_indices[test_indices]] = True

print(len(train_mask), train_mask[:10])
print(len(val_mask), val_mask[:10])
print(len(test_mask), test_mask[:10])

1483 tensor([False, False,  True,  True,  True,  True,  True,  True,  True, False])
1483 tensor([ True,  True, False, False, False, False, False, False, False,  True])
1483 tensor([False, False, False, False, False, False, False, False, False, False])


In [6]:
class HGT(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_heads, num_layers, num_pat, num_proc, num_med, num_lab, data):
        super().__init__()

        self.lin_dict = torch.nn.ModuleDict()
        self.lin_dict["patient"] = Linear(num_pat, hidden_channels)
        self.lin_dict["procedure"] = Linear(num_proc, hidden_channels)
        self.lin_dict["medication"] = Linear(num_med, hidden_channels)
        self.lin_dict["lab"] = Linear(num_lab, hidden_channels)

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HGTConv(hidden_channels, hidden_channels, data.metadata(), num_heads, group='sum')
            self.convs.append(conv)

        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        for node_type, x in x_dict.items():
            x_dict[node_type] = self.lin_dict[node_type](x).relu_()

        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)

        out = self.lin(x_dict['patient'])
        out = F.sigmoid(out)
        return out

In [7]:
def get_results(feature_index, config, directory, label_name):
    
    labels = torch.tensor(np.load('data/Processed/MIMIC_y.npy')[:, feature_index]).to(torch.float32)
    
    data = HeteroData()
    data['patient'].x = patient_features
    data['patient'].y = labels
    data['patient'].train_mask = train_mask
    data['patient'].val_mask = val_mask
    data['patient'].test_mask = test_mask
    data['procedure'].x = procedure_features
    data['medication'].x = medication_features
    data['lab'].x = lab_features
    data['patient', 'same_patient', 'patient'].edge_index = patient_edges
    data['patient', 'proc_values', 'procedure'].edge_index = procedure_edges
    data['patient', 'med_values', 'medication'].edge_index = medication_edges
    data['patient', 'lab_values', 'lab'].edge_index = lab_edges
    data = T.ToUndirected()(data)
    data = data.to(device)
    
    model = HGT(hidden_channels=config['hidden_channels'], out_channels=1, num_heads=config['num_heads'], 
            num_layers=config['num_layers'], num_pat=3, num_proc=len(procedure_features), 
            num_med=len(medication_features), num_lab=len(lab_features), data=data).to(device)
    
    losses = []
    auprc_val = []
    auprc_test = []
    auprc_train = []
    acc_val = []
    acc_test = []
    acc_train = []

    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)

    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])

    def train():
        model.train()
        optimizer.zero_grad()
        out = model(data.x_dict, data.edge_index_dict)
        mask = data['patient'].train_mask
        loss = F.cross_entropy(out[mask].squeeze(), data['patient'].y[mask])
        loss.backward()
        optimizer.step()
        train_acc = ((out[mask].detach().cpu().numpy().squeeze()>0.5).astype(int) == data['patient'].y[mask].detach().cpu().numpy()).sum()/len(out[mask])
        precision, recall, thresholds = precision_recall_curve(data["patient"].y[mask].cpu().numpy(), out[mask].squeeze().detach().cpu().numpy())
        train_auprc = auc(recall, precision)
        return train_acc, train_auprc, float(loss)

    @torch.no_grad()
    def test():
        model.eval()
        out = model(data.x_dict, data.edge_index_dict)
        val_mask = data['patient'].val_mask
        precision, recall, thresholds = precision_recall_curve(data["patient"].y[val_mask].cpu().numpy(), out[val_mask].squeeze().detach().cpu().numpy())
        val_auprc = auc(recall, precision)
        test_mask = data['patient'].test_mask
        precision, recall, thresholds = precision_recall_curve(data["patient"].y[test_mask].cpu().numpy(), out[test_mask].squeeze().detach().cpu().numpy())
        test_auprc = auc(recall, precision)
        val_acc = ((out[val_mask].detach().cpu().numpy().squeeze()>0.5).astype(int) == data['patient'].y[val_mask].detach().cpu().numpy()).sum()/len(out[val_mask])
        test_acc = ((out[test_mask].detach().cpu().numpy().squeeze()>0.5).astype(int) == data['patient'].y[test_mask].detach().cpu().numpy()).sum()/len(out[test_mask])

        return (val_auprc, test_auprc, val_acc, test_acc)

        
    logpath = directory + "Log.txt"
    modelpath = directory + "model.pth"
    
    with open(logpath, 'w') as file:
        print("LABEL:", feature_index, label_name, file=file)
        print("\n", file=file)
                
    for epoch in range(1, config["epochs"]):
        train_acc, train_auprc, loss = train()
        val_auprc, test_auprc, val_acc, test_acc = test()
        losses.append(loss)
        auprc_train.append(train_auprc)
        acc_train.append(train_acc)
        auprc_val.append(val_auprc)
        acc_val.append(val_acc)
        auprc_test.append(test_auprc)
        acc_test.append(test_acc)
        with open(logpath, 'a') as file:
            print(f'{epoch:03d}, Loss: {loss:.3f}, TrPRC: {train_auprc:.3f}, TrAcc: {train_acc:.3f}, VaPRC: {val_auprc:.3f}, VaAcc: {val_acc:.3f}, TePRC: {test_auprc:.3f}, TeAcc: {test_acc:.3f}', file=file)
        
    torch.save(model.state_dict(), modelpath)
    np.save(directory + "Train_AUPRC.npy", auprc_train)
    np.save(directory + "Val_AUPRC.npy", auprc_val)
    np.save(directory + "Test_AUPRC.npy", auprc_test)
    np.save(directory + "Train_Acc.npy", acc_train)
    np.save(directory + "Val_Acc.npy", acc_val)
    np.save(directory + "Test_Acc.npy", acc_test)
    np.save(directory + "Loss.npy", losses)
    
    best_auprc_val = np.max(auprc_val)
    best_auprc_train = auprc_train[np.argmax(auprc_val)]
    best_auprc_test = auprc_test[np.argmax(auprc_val)]
    best_acc_train = acc_train[np.argmax(auprc_val)]
    best_acc_val = acc_val[np.argmax(auprc_val)]
    best_acc_test = acc_test[np.argmax(auprc_val)]
    
    print(f'Task: {feature_index}, TrPRC: {best_auprc_train:.3f}, TrAcc: {best_acc_train:.3f}, VaPRC: {best_auprc_val:.3f}, VaAcc: {best_acc_val:.3f}, TePRC: {best_auprc_test:.3f}, TeAcc: {best_acc_test:.3f}')
        
    with open(logpath, 'a') as file:
        print("\n", file=file)
        print(f'Task: {feature_index}, TrPRC: {best_auprc_train:.3f}, TrAcc: {best_acc_train:.3f}, VaPRC: {best_auprc_val:.3f}, VaAcc: {best_acc_val:.3f}, TePRC: {best_auprc_test:.3f}, TeAcc: {best_acc_test:.3f}', file=file)
    
    plt.figure(figsize=(5,3))
    plt.title("Loss")
    plt.xlabel("Epochs")
    plt.plot(losses)
    plt.savefig(directory+"Loss.png")
    plt.close()
    plt.figure(figsize=(5,3))
    plt.title("Train AUPRC")
    plt.plot(auprc_train)
    plt.xlabel("Epochs")
    plt.savefig(directory+"Train_AUPRC.png")
    plt.close()
    plt.figure(figsize=(5,3))
    plt.title("Validation AUPRC")
    plt.xlabel("Epochs")
    plt.plot(auprc_val)
    plt.savefig(directory+"Val_AUPRC.png")
    plt.close()
    plt.figure(figsize=(5,3))
    plt.title("Test AUPRC")
    plt.xlabel("Epochs")
    plt.plot(auprc_test)
    plt.savefig(directory+"Test_AUPRC.png")
    plt.close()
    plt.figure(figsize=(5,3))
    plt.title("Train Acc")
    plt.plot(acc_train)
    plt.xlabel("Epochs")
    plt.savefig(directory+"Train_Acc.png")
    plt.close()
    plt.figure(figsize=(5,3))
    plt.title("Val Acc")
    plt.plot(acc_val)
    plt.xlabel("Epochs")
    plt.savefig(directory+"Val_Acc.png")
    plt.close()
    plt.figure(figsize=(5,3))
    plt.title("Test Acc")
    plt.plot(acc_test)
    plt.xlabel("Epochs")
    plt.savefig(directory+"Test_Acc.png")
    plt.close()
    
    return (best_auprc_train, best_auprc_val, best_auprc_test, best_acc_train, best_acc_val, best_acc_test)

### Run everything once

In [8]:
label_names = ["y_Expired", "y_Non.Adherence", "y_Developmental.Delay.Retardation", "y_Advanced.Heart.Disease",
          "y_Advanced.Lung.Disease", "y_Schizophrenia.and.other.Psychiatric.Disorders", "y_Alcohol.Abuse",
          "y_Other.Substance.Abuse", "y_Chronic.Pain.Fibromyalgia", "y_Chronic.Neurological.Dystrophies",
          "y_Advanced.Cancer", "y_Depression", "y_Dementia"]

In [9]:
# for i in range(13):
#     directory = "GNN_Results/" + str(i) + "/"
#     label_name = label_names[i]
#     get_results(i, config, directory, label_name)

In [10]:
for i in range(13):
    
    auprc_best = 0
    auprc_index = -1
    
    print("LABEL:", i, label_names[i])
    
    avg_auprc_train = 0
    avg_auprc_val = 0
    avg_auprc_test = 0
    
    avg_acc_train = 0
    avg_acc_val = 0
    avg_acc_test = 0
    
    for j in range(3):
        directory_name = "GNN_Results/" + str(i) + "_" + str(j) + "/"
        
        if os.path.exists(directory_name):
            shutil.rmtree(directory_name)
            
        os.makedirs(directory_name)
        
        label_name = label_names[i]
        auprc_train, auprc_val, auprc_test, acc_train, acc_val, acc_test = get_results(i, config, directory_name, label_name)
        avg_auprc_train += auprc_train
        avg_auprc_val += auprc_val
        avg_auprc_test += auprc_test
        avg_acc_train += acc_train
        avg_acc_val += acc_val
        avg_acc_test += acc_test
        if(auprc_val > auprc_best):
            auprc_best = auprc_val
            auprc_index = j
            
    print("Train AVERAGES:", avg_auprc_train/3, avg_acc_train/3)
    print("Val AVERAGES:", avg_auprc_val/3, avg_acc_val/3)
    print("Test AVERAGES:", avg_auprc_test/3, avg_acc_test/3)
    print("BEST MODEL:", auprc_index, auprc_best)
    print()

LABEL: 0 y_Expired




IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)