This notebook is used to generate the meta-paths of the `DTI-Net` dataset.

Requirements:

- Python version 3.9.10
- dgl@1.1.1
- torch@2.0.1

# Pre-process the data and get the meta-paths


Section 1. Import the libraries.

In [1]:
from math import*
import pandas as pd
import numpy as np
import torch as th
import dgl
from pathlib import Path
import pickle

Define the dataset directory and the directory where the generated meta-path files are saved.

In [6]:
# Directory prefix of the DTINet input matrices; file names are appended with `+`.
ROOT_PATH = './DTINet/'
# Directory prefix under which the generated .idx / .adjlist files are written.
SAVE_PREFIX = './metapathes/'
# Create the output directory up front; no error if it already exists.
Path(SAVE_PREFIX).mkdir(parents=True, exist_ok=True)

Prepare the data and generate the graph of the `DTI-Net` dataset.

In [7]:
def load_data(sim_threshold=0.7):
    """Load the DTINet matrices and build the heterogeneous DGL graph.

    Every matrix is read from ``ROOT_PATH`` with ``np.loadtxt``. Association
    matrices contribute one edge per cell equal to 1. Similarity matrices
    contribute one edge per cell ``>= sim_threshold``; protein similarities
    are divided by 100 before thresholding.

    The number of nodes of each type is taken from the matrix shapes and
    passed to DGL explicitly. Otherwise DGL infers it from the highest node id
    that has an edge, and trailing nodes without any edge would silently
    shrink ``dim`` and shift the ``type_mask`` boundaries.

    ``mat_drug_protein_remove_homo.txt`` and ``mat_protein_drug.txt`` are not
    needed to build the graph and are therefore no longer read.

    Args:
        sim_threshold: Minimum similarity for a ``drug_sim`` / ``protein_sim``
            edge. Defaults to 0.7, the value that was previously hard-coded.

    Returns:
        A tuple ``(g, dim, type_mask)`` where ``g`` is the DGL heterograph,
        ``dim`` is the total number of nodes over all types, and
        ``type_mask`` is an int array of length ``dim`` holding 0 for drugs,
        1 for proteins, 2 for side effects and 3 for diseases, laid out in
        that order.
    """
    def read(name):
        # ndmin=2 keeps a single-row file two-dimensional so .shape[1] exists.
        return np.loadtxt(ROOT_PATH + name, ndmin=2)

    drug_sim = read('Similarity_Matrix_Drugs.txt') >= sim_threshold
    # Same debug output as before: the thresholded drug-similarity edge list.
    print(pd.DataFrame(np.argwhere(drug_sim), columns=['Drug1', 'Drug2']))
    protein_sim = read('Similarity_Matrix_Proteins.txt') / 100 >= sim_threshold

    # One boolean adjacency mask per relation, keyed by DGL's canonical
    # (source type, edge type, destination type) triple.
    masks = {
        ('drug', 'drug_drug', 'drug'):
            read('mat_drug_drug.txt') == 1,
        ('drug', 'drug_disease', 'disease'):
            read('mat_drug_disease.txt') == 1,
        ('drug', 'drug_protein', 'protein'):
            read('mat_drug_protein.txt') == 1,
        ('drug', 'drug_se', 'se'):
            read('mat_drug_se.txt') == 1,
        ('drug', 'drug_sim', 'drug'):
            drug_sim,
        ('protein', 'protein_disease', 'disease'):
            read('mat_protein_disease.txt') == 1,
        ('protein', 'protein_protein', 'protein'):
            read('mat_protein_protein.txt') == 1,
        ('protein', 'protein_sim', 'protein'):
            protein_sim,
    }

    graph_data = {}
    num_nodes = {}
    for (src_type, etype, dst_type), mask in masks.items():
        # Row index = source node id, column index = destination node id.
        rows, cols = np.nonzero(mask)
        graph_data[(src_type, etype, dst_type)] = (
            th.tensor(rows), th.tensor(cols))
        # The largest extent seen for a node type is its node count.
        num_nodes[src_type] = max(num_nodes.get(src_type, 0), mask.shape[0])
        num_nodes[dst_type] = max(num_nodes.get(dst_type, 0), mask.shape[1])

    g = dgl.heterograph(graph_data, num_nodes_dict=num_nodes)
    d_len = g.num_nodes('drug')
    p_len = g.num_nodes('protein')
    se_len = g.num_nodes('se')
    di_len = g.num_nodes('disease')

    dim = d_len + p_len + di_len + se_len

    # Global layout: [drugs | proteins | side effects | diseases].
    type_mask = np.zeros((dim), dtype=int)
    type_mask[d_len:d_len + p_len] = 1
    type_mask[d_len + p_len:d_len + p_len + se_len] = 2
    type_mask[d_len + p_len + se_len:] = 3
    return g, dim, type_mask


Call the `load_data` function defined above.

In [8]:
# Build the heterogeneous graph once. `adjMat` is read as a global by the
# helper functions below; `dim` is the total node count and `type_mask` labels
# each position of the concatenated node index with its node type.
adjMat, dim, type_mask = load_data()


      Drug1  Drug2
0         0      0
1         1      1
2         2      2
3         3      3
4         3     40
...     ...    ...
1949    705    705
1950    706    706
1951    707    163
1952    707    538
1953    707    707

[1954 rows x 2 columns]


# Helper function definitions.

In [9]:
def getMetaPathSemetric3(edgeKey, innerNodeKey, outerKey, rev=False,
                         graph=None):
    """Group the endpoints of one edge type by the metapath's middle node.

    For a symmetric metapath ``outer - inner - outer`` this returns, for every
    inner node, the list of outer nodes connected to it through ``edgeKey``.

    Args:
        edgeKey: Edge type whose edges are grouped.
        innerNodeKey: Node type of the middle (inner) node; the returned dict
            has one entry per node of this type.
        outerKey: Node type of the end (outer) nodes. Kept for interface
            compatibility; the grouping itself does not need it.
        rev: ``False`` when the inner node is the edge's destination and the
            outer node its source. ``True`` when the edge is stored the other
            way round (inner node is the source), e.g. using the
            drug -> protein edges with the drug as the middle node.
        graph: Graph to read from. Defaults to the module-level ``adjMat``.

    Returns:
        dict mapping each inner node id (including ids without any edge) to
        the list of outer node ids, in edge order.
    """
    g = adjMat if graph is None else graph
    src_ids, dst_ids = g.edges(etype=edgeKey)
    # Previously `rev` only changed which node type sized the dict, while the
    # edges were still grouped by destination. The result was then keyed by
    # the outer type and held inner-type ids. Pick the endpoints by direction.
    if rev:
        inner_ids, outer_ids = src_ids, dst_ids
    else:
        inner_ids, outer_ids = dst_ids, src_ids
    res = {i: [] for i in range(len(g.nodes(innerNodeKey)))}
    # One bulk conversion instead of indexing the tensors element by element.
    for inner, outer in zip(inner_ids.tolist(), outer_ids.tolist()):
        res[inner].append(outer)
    return res


In [10]:
def check(m_dict, i, j):
    """Find the first key whose member collection contains both i and j.

    Keys are visited in the dict's iteration order.

    Returns:
        ``(True, key)`` for the first matching key, otherwise
        ``(False, None)``.
    """
    for key, members in m_dict.items():
        if i not in members:
            continue
        if j in members:
            return True, key
    return False, None


In [11]:
def generate_metapath_adj(ename, innerNode, outerNode, filename, rev=False):
    """Write the ``outer - inner - outer`` metapath instances of one edge type.

    For every ordered pair ``(i, j)`` of outer nodes that share at least one
    inner node, a single row ``[i, k, j]`` is produced, where ``k`` is the
    first inner node (in the order returned by ``getMetaPathSemetric3``) that
    is connected to both. ``i == j`` pairs are included.

    Two files are written under ``SAVE_PREFIX``:

    * ``<filename>.idx``: a pickled dict mapping every outer node id to the
      array of rows ``[i, k, j]`` with ``j`` equal to that id, ordered by
      ``(j, k, i)``.
    * ``<filename>.adjlist``: written by ``generate_neighbour`` from the rows
      ordered by ``(i, j, k)``.

    Args:
        ename: Edge type connecting the outer and inner node types.
        innerNode: Node type of the middle node.
        outerNode: Node type of the two end nodes.
        filename: Base name (without extension) of the output files.
        rev: Forwarded to ``getMetaPathSemetric3``.
    """
    dict_oio = getMetaPathSemetric3(ename, innerNode, outerNode, rev)
    n_outer = len(adjMat.nodes(outerNode))

    # Enumerate pairs per inner node instead of testing every (i, j) pair
    # against every inner node. setdefault keeps the first inner node found
    # for a pair, which is what the exhaustive search returned.
    first_via = {}
    for k, members in dict_oio.items():
        # Only ids inside the outer node range were ever considered.
        members = [m for m in members if 0 <= m < n_outer]
        for i in members:
            for j in members:
                first_via.setdefault((i, j), k)
    rows = [[i, k, j] for (i, j), k in sorted(first_via.items())]
    # reshape keeps the array two-dimensional when no pair exists.
    oio = np.array(rows, dtype=int).reshape(-1, 3)

    # np.lexsort treats its LAST key as the primary one.
    by_target = np.lexsort((oio[:, 0], oio[:, 1], oio[:, 2]))
    by_source = np.lexsort((oio[:, 1], oio[:, 2], oio[:, 0]))

    temp = oio[by_target]
    # bounds[n] is the first row whose target id is >= n, so rows
    # bounds[n]:bounds[n + 1] are exactly those ending in node n.
    bounds = np.searchsorted(temp[:, 2], np.arange(n_outer + 1))
    target = {
        node_id: temp[bounds[node_id]:bounds[node_id + 1], :]
        for node_id in range(n_outer)
    }
    with open(SAVE_PREFIX + filename + ".idx", 'wb') as out_file:
        pickle.dump(target, out_file)
    generate_neighbour(oio, by_source, filename, n_outer)


def generate_path_adj(source, edgeName, filename):
    """Write the direct (length-one) paths of one edge type.

    Every edge ``src -> dst`` of ``edgeName`` becomes a row ``[src, dst]``.
    Two files are written under ``SAVE_PREFIX``:

    * ``<filename>.idx``: a pickled dict mapping every node id of type
      ``source`` to the array of rows whose ``dst`` equals that id, ordered
      by ``(dst, src)``.
    * ``<filename>.adjlist``: written by ``generate_neighbour`` from the rows
      ordered by ``(src, dst)``.

    Args:
        source: Node type used to size both outputs. The grouping by ``dst``
            assumes the edge type connects this node type to itself, as all
            callers in this notebook do.
        edgeName: Edge type to export.
        filename: Base name (without extension) of the output files.
    """
    src_ids, dst_ids = adjMat.edges(etype=edgeName)
    n_nodes = len(adjMat.nodes(source))

    # Convert the id tensors in bulk; reshape keeps the array two-dimensional
    # when the edge type has no edges.
    pairs = list(zip(src_ids.tolist(), dst_ids.tolist()))
    oio = np.array(pairs, dtype=int).reshape(-1, 2)

    # np.lexsort treats its LAST key as the primary one.
    by_target = np.lexsort((oio[:, 0], oio[:, 1]))
    by_source = np.lexsort((oio[:, 1], oio[:, 0]))

    temp = oio[by_target]
    # bounds[n] is the first row whose dst id is >= n, so rows
    # bounds[n]:bounds[n + 1] are exactly those ending in node n.
    bounds = np.searchsorted(temp[:, 1], np.arange(n_nodes + 1))
    target = {
        node_id: temp[bounds[node_id]:bounds[node_id + 1], :]
        for node_id in range(n_nodes)
    }
    with open(SAVE_PREFIX + filename + '.idx', 'wb') as out_file:
        pickle.dump(target, out_file)
    generate_neighbour(oio, by_source, filename, n_nodes)

def generate_neighbour(oio, sortedIndex, fileName, l):
    """Write ``<fileName>.adjlist`` under ``SAVE_PREFIX``.

    The rows of ``oio`` are reordered with ``sortedIndex``. One line is then
    written per node id in ``range(l)``: the id followed by the last column
    of the consecutive rows whose first column equals that id,
    space-separated. A node without such rows gets a line holding only its
    id.

    Args:
        oio: 2-D array of path rows; column 0 is the start node and the last
            column the end node.
        sortedIndex: Row order to apply to ``oio``; rows with the same start
            node must end up adjacent and in ascending start-node order.
        fileName: Base name (without extension) of the output file.
        l: Number of node ids to emit.
    """
    ordered = oio[sortedIndex]
    total = len(ordered)
    stop = 0
    with open(SAVE_PREFIX + fileName + '.adjlist', 'w') as out_file:
        for node in range(l):
            # The run for this node starts where the previous one ended.
            start = stop
            while stop < total and ordered[stop, 0] == node:
                stop += 1
            tokens = [str(node)]
            tokens.extend(map(str, ordered[start:stop, -1]))
            out_file.write(' '.join(tokens) + '\n')


# Drug Drug (sim) meta-path generation

This cell computes the drug–drug meta-paths based on drug similarity.

Each output row has the format $[d_i, d_j]$.

In [13]:
# Writes d_d_sim.idx and d_d_sim.adjlist under SAVE_PREFIX from the 'drug_sim' edges.
generate_path_adj('drug','drug_sim','d_d_sim')

# Drug Drug meta-path generation

In [14]:
# Writes d_d.idx and d_d.adjlist under SAVE_PREFIX from the 'drug_drug' edges.
generate_path_adj('drug','drug_drug','d_d')

# Protein Protein meta-path generation

This cell computes the protein–protein meta-paths based on protein–protein interactions.

Each output row has the format $[p_i, p_j]$.

In [15]:
# Writes p_p.idx and p_p.adjlist under SAVE_PREFIX from the 'protein_protein' edges.
generate_path_adj('protein','protein_protein', "p_p")

# Protein Protein (sim) meta-path generation

This cell computes the protein–protein meta-paths based on protein similarity.

Each output row has the format $[p_i, p_j]$.

In [16]:
# Writes p_p_sim.idx and p_p_sim.adjlist under SAVE_PREFIX from the 'protein_sim' edges.
generate_path_adj('protein','protein_sim', "p_p_sim")

# Drug Protein meta-path generation

This cell computes the drug–drug meta-paths that pass through a shared protein.

Each output row has the format $[d_i, p_k, d_j]$.

In [17]:
# Drug - protein - drug: the protein is the middle node. Writes d_p_d.idx and d_p_d.adjlist.
generate_metapath_adj('drug_protein', 'protein', 'drug','d_p_d')

# Drug Side effect meta-path generation

This cell computes the drug–drug meta-paths that pass through a shared side effect.

Each output row has the format $[d_i, se_k, d_j]$.

In [18]:
# Drug - side effect - drug: the side effect is the middle node. Writes d_se_d.idx and d_se_d.adjlist.
generate_metapath_adj('drug_se', 'se', 'drug','d_se_d')

# Drug Disease meta-path generation

This cell computes the drug–drug meta-paths that pass through a shared disease.

Each output row has the format $[d_i, di_k, d_j]$.

In [19]:
# Drug - disease - drug: the disease is the middle node. Writes d_di_d.idx and d_di_d.adjlist.
generate_metapath_adj('drug_disease', 'disease', 'drug','d_di_d')

# Protein Drug meta-path generation

This cell computes the protein–protein meta-paths that pass through a shared drug.

Each output row has the format $[p_i, d_k, p_j]$.

In [20]:
# Protein - drug - protein. rev=True because the 'drug_protein' edges are stored
# drug -> protein while the drug is the middle node here. Writes p_dr_p.idx and p_dr_p.adjlist.
generate_metapath_adj('drug_protein', 'drug', 'protein','p_dr_p',True)

# Protein Disease meta-path generation

This cell computes the protein–protein meta-paths that pass through a shared disease.

Each output row has the format $[p_i, di_k, p_j]$.

In [21]:
# Protein - disease - protein: the disease is the middle node. Writes p_di_p.idx and p_di_p.adjlist.
generate_metapath_adj('protein_disease', 'disease', 'protein','p_di_p')