# For Colab

In [1]:
# Mount Google Drive at /content/drive so files stored there are visible to this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Show the current working directory.
# NOTE(review): this cell's execution count (5) is higher than the following `cd` cell's (4),
# so the printed path reflects a run made after that `cd` -- re-run top to bottom to confirm.
!pwd

/content/drive/MyDrive/PythonProjects/skgnmda


In [4]:
cd /content/drive/MyDrive/PythonProjects/skgnmda

/content/drive/MyDrive/PythonProjects/skgnmda


# Prerequisites

In [6]:
# Dataset tag; passed as `dataset=` to format_filename in later cells to select the matching files.
dataset = 'mdkg_hmdad'

In [7]:
from tensorflow.python.client import device_lib

# Print the local devices reported by TensorFlow's device_lib (useful to check for a GPU/TPU).
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14388704342630274988
xla_global_id: -1
]


# Load Data

In [8]:
from src.config import DISEASE_MICROBE_EXAMPLE, PROCESSED_DATA_DIR
from src.utils import format_filename
import numpy as np

# Build the path of the example file for the selected dataset, then load the array from it.
examples_file = format_filename(PROCESSED_DATA_DIR, DISEASE_MICROBE_EXAMPLE, dataset=dataset)
examples = np.load(examples_file)

In [9]:
# Sanity check: (number of rows, number of columns) of the loaded example array.
examples.shape

(898, 3)

In [10]:
# Peek at the first three example rows.
examples[:3]

array([[50863, 33211,     1],
       [43621, 40832,     1],
       [33293, 47880,     1]])

In [11]:
from src.data import MicrobeDiseaseData

# Columns 0 and 1 are passed as two separate single-column inputs and column 2 as a flat
# 1-D array (presumably the two entity ids and the label -- confirm against MicrobeDiseaseData).
data = MicrobeDiseaseData(
    [examples[:, 0:1], examples[:, 1:2]],
    examples[:, 2],
)

In [12]:
from keras import backend as K
from src.config import MICROBE_SIMILARITY_FILE, DISEASE_SIMILARITY_FILE, PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE
import pandas as pd
from src.utils import pickle_load
import tensorflow as tf

# Similarity tables; the first CSV column becomes each frame's row index.
microbe_similarity_df = pd.read_csv(MICROBE_SIMILARITY_FILE, index_col=0)
disease_similarity_df = pd.read_csv(DISEASE_SIMILARITY_FILE, index_col=0)

# Number of entries in the pickled entity vocabulary for the selected dataset.
entity_vocab_size = len(
    pickle_load(format_filename(PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE, dataset=dataset))
)

# Dense lookup tables with one row per entity id; ids that do not appear in a similarity
# frame's index keep an all-zero row.
microbe_similarity_matrix = np.zeros((entity_vocab_size, microbe_similarity_df.shape[1]), dtype="float64")
disease_similarity_matrix = np.zeros((entity_vocab_size, disease_similarity_df.shape[1]), dtype="float64")

# Copy every frame row into the matrix row named by its index label. Whole rows are taken
# from the positional NumPy view of the frame rather than read element by element with
# `row[j]`: an integer key on a string-labelled Series is pandas' deprecated positional
# fallback, and the per-element Python loop was needlessly slow. Rows are still written in
# frame order, so a repeated index label keeps its last occurrence, as before.
for entity_id, similarity_row in zip(microbe_similarity_df.index, microbe_similarity_df.to_numpy()):
    microbe_similarity_matrix[entity_id] = similarity_row

for entity_id, similarity_row in zip(disease_similarity_df.index, disease_similarity_df.to_numpy()):
    disease_similarity_matrix[entity_id] = similarity_row

# Rebind both NumPy tables as frozen (non-trainable) float32 TensorFlow variables; the
# embedding lookup functions gather rows from these.
microbe_similarity_matrix = tf.Variable(
    microbe_similarity_matrix,
    trainable=False,
    dtype='float32',
    name='pre_term_microbe_embedding',
)
disease_similarity_matrix = tf.Variable(
    disease_similarity_matrix,
    trainable=False,
    dtype='float32',
    name='pre_term_disease_embedding',
)

Logging Info - Loaded: /content/drive/MyDrive/PythonProjects/skgnmda/data_repository/processed/mdkg_hmdad_entity_vocab.pkl


In [13]:
def get_first_term_embedding(x):
    """Gather rows of ``microbe_similarity_matrix`` for the ids in ``x``.

    ``x`` is cast to int64 before it is used as the row index.
    """
    row_ids = K.cast(x, dtype='int64')
    return K.gather(microbe_similarity_matrix, row_ids)


def get_second_term_embedding(x):
    """Gather rows of ``disease_similarity_matrix`` for the ids in ``x``.

    ``x`` is cast to int64 before it is used as the row index.
    """
    row_ids = K.cast(x, dtype='int64')
    return K.gather(disease_similarity_matrix, row_ids)

# Configure Model

In [14]:
from src.config import KGCNModelConfig

# Model hyper-parameters for this run, set as attributes on a fresh KGCNModelConfig.
kgcn_config = KGCNModelConfig()

kgcn_config.model_name = 'Previous 1'  # label for this configuration
kgcn_config.embed_dim = 32  # presumably the embedding width -- confirm in KGCNModelConfig
kgcn_config.neighbor_sample_size = 8  # presumably neighbours sampled per entity -- confirm
kgcn_config.n_depth = 2  # presumably the number of aggregation hops -- confirm
kgcn_config.l2_weight = 0.01  # presumably the L2 regularisation weight -- confirm
kgcn_config.aggregator_type = 'sum'

In [15]:
# Display the model configuration as a dict to confirm the values set above.
kgcn_config.get_summary()

{'model_name': 'Previous 1',
 'embed_dim': 32,
 'neighbor_sample_size': 8,
 'n_depth': 2,
 'l2_weight': 0.01,
 'aggregator_type': 'sum'}

# Configure Data

In [16]:
from src.config import DataConfig, PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE, \
    RELATION_VOCAB_TEMPLATE, ADJ_ENTITY_TEMPLATE, ADJ_RELATION_TEMPLATE
from src.utils import pickle_load, format_filename
import numpy as np

def _processed_path(template):
    """Return the processed-data path built from ``template`` for the selected dataset."""
    return format_filename(PROCESSED_DATA_DIR, template, dataset=dataset)


data_config = DataConfig()

# Vocabulary sizes are the lengths of the pickled vocab objects.
data_config.entity_vocab_size = len(pickle_load(_processed_path(ENTITY_VOCAB_TEMPLATE)))
data_config.relation_vocab_size = len(pickle_load(_processed_path(RELATION_VOCAB_TEMPLATE)))

# Adjacency arrays (entity and relation) loaded from the processed-data directory.
data_config.adj_entity = np.load(_processed_path(ADJ_ENTITY_TEMPLATE))
data_config.adj_relation = np.load(_processed_path(ADJ_RELATION_TEMPLATE))


Logging Info - Loaded: /content/drive/MyDrive/PythonProjects/skgnmda/data_repository/processed/mdkg_hmdad_entity_vocab.pkl
Logging Info - Loaded: /content/drive/MyDrive/PythonProjects/skgnmda/data_repository/processed/mdkg_hmdad_relation_vocab.pkl


In [17]:
# Display the data configuration summary (the vocabulary sizes) as a dict.
data_config.get_summary()

{'entity_vocab_size': 66911, 'relation_vocab_size': 39}

# Build Model

In [18]:
from src.models.graph_models import PairKGCN

# Instantiate the pair model from the model and data configurations defined above.
model = PairKGCN(kgcn_config=kgcn_config, data_config=data_config)

KerasTensor(type_spec=TensorSpec(shape=(None, 32), dtype=tf.float32, name=None), name='lambda_1/Squeeze:0', description="created by layer 'lambda_1'")


In [19]:
# Print the layer-by-layer summary of the built model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 second_input (InputLayer)   [(None, 1)]                  0         []                            
                                                                                                  
 receptive_filed_for_second  [(None, 1),                  0         ['second_input[0][0]']        
 _ent (Lambda)                (None, 8),                                                          
                              (None, 64)]                                                         
                                                                                                  
 receptive_filed_for_second  [(None, 8),                  0         ['second_input[0][0]']        
 _rel (Lambda)                (None, 64)]                                                     

# Configure Optimizer

In [20]:
from base.config import OptimizerConfig
from src.config import MODEL_SAVED_DIR

In [21]:
# Training settings, set as attributes on a fresh OptimizerConfig.
optimizer_config = OptimizerConfig()
optimizer_config.optimizer = 'adam'
optimizer_config.lr = 1e-3  # learning rate
optimizer_config.batch_size = 32
optimizer_config.n_epoch = 50  # number of training epochs
optimizer_config.checkpoint_dir = MODEL_SAVED_DIR
optimizer_config.callbacks_to_add = []  # no extra callbacks requested

# Train Model

In [22]:
from src.optimization.optimization import KGCNTrainer

In [23]:
# Train the model on `data` with the optimizer settings above; the last argument is an empty list.
# NOTE(review): no random seed is set in the visible cells, so the reported metrics may vary
# between runs -- confirm whether the project code seeds internally.
trainer = KGCNTrainer()
result = trainer.train(model, data, optimizer_config, [])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Logging Info - Training time: 00:02:25


In [24]:
# Display the metrics collected by the training run as a dict.
# NOTE(review): which split these metrics are computed on is not visible here -- confirm in KGCNTrainer.
result.get_result()

{'AUC': 0.9669098863596907,
 'ACC': 0.8830734966592427,
 'F1 Score': 0.8757396449704142,
 'AUPR': 0.9551894831429586}

# Cross Validation

In [25]:
from src.data import MicrobeDiseaseTrainTestSplit

# Splitter built over the full example array, with the Gaussian-similarity option enabled.
train_test_spliter = MicrobeDiseaseTrainTestSplit(
    examples=examples,
    with_gaussian_similarity=True,
)

In [26]:
from src.optimization.optimization import KGCNTrainer, KGCNTester
from src.models.graph_models import PairKGCNFactory

# Fresh trainer and tester, plus a factory that builds models from the two configs.
trainer = KGCNTrainer()
tester = KGCNTester()
# NOTE(review): 291 and 39 are hard-coded sizes -- presumably tied to the dimensions of the
# microbe and disease similarity data; confirm and derive them from the data if so.
factory = PairKGCNFactory(
    kgcn_config,
    data_config,
    first_term_size=291,
    second_term_size=39,
)

In [27]:
from base.optimization import cross_validation

# Run 5-fold cross-validation over all examples, using the splitter, factory, trainer,
# tester and optimizer settings defined in the earlier cells.
cross_validation(
    k=5,
    data_size=len(examples),
    train_test_spliter=train_test_spliter,
    model_factory=factory,
    trainer=trainer,
    tester=tester,
    optimization_config=optimizer_config,
)


Logging Info - Fold 1 >>>>>>>>>>>>>>

test_indices: [60, 761, 57, 392, 248, 779, 21, 431, 534, 90, 280, 241, 346, 552, 818, 81, 332, 745, 434, 778, 161, 37, 484, 593, 376, 763, 314, 605, 7, 370, 676, 213, 400, 237, 306, 330, 138, 44, 569, 98, 497, 20, 182, 447, 120, 542, 742, 626, 720, 118, 167, 566, 653, 724, 522, 523, 322, 114, 710, 638, 505, 609, 184, 576, 290, 718, 380, 147, 283, 25, 345, 639, 652, 851, 517, 32, 405, 168, 13, 868, 895, 830, 374, 845, 728, 268, 102, 389, 748, 515, 344, 737, 631, 881, 775, 425, 46, 4, 140, 341, 71, 637, 256, 224, 511, 632, 689, 893, 877, 562, 402, 105, 730, 793, 478, 379, 356, 412, 874, 564, 791, 244, 87, 767, 870, 599, 139, 654, 747, 72, 734, 879, 660, 38, 776, 433, 641, 124, 633, 99, 132, 440, 189, 894, 570, 469, 262, 122, 123, 69, 350, 453, 712, 528, 640, 665, 499, 790, 766, 255, 174, 325, 305, 886, 136, 485, 636, 764, 557, 835, 130, 855, 247, 159, 483, 340, 259, 642, 427]
train_indices: [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19

<base.evaluation.Result at 0x787532dcd510>