In [57]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [58]:
!ls

base  data_repository  notebooks  requirements.txt  src


In [59]:
cd /content/drive/MyDrive/Academic/Topics/AI/Machine\ Learning\ Dr.\ Montazeri/Project/ml_mda

/content/drive/MyDrive/Academic/Topics/AI/Machine Learning Dr. Montazeri/Project/ml_mda


In [60]:
!pip install torch_geometric



In [61]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu121.html


# Requirements

In [62]:
import torch
from torch_geometric.data import Data, HeteroData
from torch_geometric.nn import SAGEConv, to_hetero, GATConv, Linear
import torch_geometric.transforms as T
from torch import nn

# Raw Data

In [63]:
from src.features import get_relations, get_entities, get_relation_types, get_associations

## Entities

In [64]:
raw_entities = get_entities()
raw_entities.head()

Unnamed: 0,type,name,id
0,Compound,CHEBI:74510,0
1,Gene,10584,1
2,Metabolite,HMDB0075603,2
3,Gene,79134,3
4,Disease,DOID:12306,4


## Relations

In [65]:
raw_relations = get_relations()
raw_relations.head()

Unnamed: 0,head,relation,tail
0,37448,17,41929
1,41929,17,37448
2,45065,17,41929
3,41929,17,45065
4,50858,17,41929


## Relation Types

In [66]:
raw_relation_types = get_relation_types()
raw_relation_types.head()

Unnamed: 0,type,name,id
0,VMH:metabolite-disease:Decreased,metabolite:disease,0
1,GNBR:Ud,Gene:Disease,1
2,GNBR:L,Gene:Disease,2
3,GNBR:Pa,Compound:Disease,3
4,VMH:metabolite-disease:Increased,metabolite:disease,4


## Associations

In [67]:
raw_associations = get_associations()
raw_associations.tail()

Unnamed: 0,disease,microbe,increased
893,64642,53920,0
894,25026,60601,0
895,25026,44316,0
896,31069,60226,0
897,64642,4251,0


In [68]:
y = torch.tensor(raw_associations['increased'].tolist(), dtype=torch.float32).reshape(-1, 1)
y.shape

torch.Size([898, 1])

# Heterogeneous Data

In [32]:
data = HeteroData()

In [33]:
entity_types = list(set(raw_entities['type'].tolist()))

In [34]:
for e in entity_types:
    data[e].x = torch.tensor(raw_entities.loc[raw_entities['type'] == e]['id'].tolist()).reshape(-1, 1)

In [35]:
data

HeteroData(
  Symptom={ x=[415, 1] },
  Anatomy={ x=[398, 1] },
  Organ={ x=[55, 1] },
  Disease={ x=[5645, 1] },
  Gene={ x=[22536, 1] },
  Division={ x=[111, 1] },
  Compound={ x=[9572, 1] },
  Metabolite={ x=[23000, 1] },
  Microbe={ x=[5179, 1] }
)

In [36]:
relation_types = raw_relation_types['type'].tolist()
relation_ids = raw_relation_types['id'].tolist()

In [37]:
for i in range(len(relation_types)):
    a = raw_relations.loc[raw_relations['relation'] == relation_ids[i]]['head'].tolist()
    b = raw_relations.loc[raw_relations['relation'] == relation_ids[i]]['tail'].tolist()
    head_type = raw_entities.loc[raw_entities['id'] == a[0]]['type'].item()
    tail_type = raw_entities.loc[raw_entities['id'] == b[0]]['type'].item()
    data[head_type, 't' + str(relation_ids[i]), tail_type].edge_index = torch.tensor([a, b])

In [38]:
import torch_geometric.transforms as T

data = T.ToUndirected()(data)

In [39]:
data

HeteroData(
  Symptom={ x=[415, 1] },
  Anatomy={ x=[398, 1] },
  Organ={ x=[55, 1] },
  Disease={ x=[5645, 1] },
  Gene={ x=[22536, 1] },
  Division={ x=[111, 1] },
  Compound={ x=[9572, 1] },
  Metabolite={ x=[23000, 1] },
  Microbe={ x=[5179, 1] },
  (Metabolite, t0, Disease)={ edge_index=[2, 38] },
  (Gene, t1, Disease)={ edge_index=[2, 814] },
  (Gene, t2, Disease)={ edge_index=[2, 96768] },
  (Compound, t3, Disease)={ edge_index=[2, 5238] },
  (Metabolite, t4, Disease)={ edge_index=[2, 192] },
  (Disease, t5, Disease)={ edge_index=[2, 1086] },
  (Gene, t6, Disease)={ edge_index=[2, 60468] },
  (Microbe, t7, Microbe)={ edge_index=[2, 23562] },
  (Gene, t8, Disease)={ edge_index=[2, 3896] },
  (Disease, t9, Gene)={ edge_index=[2, 15462] },
  (Disease, t10, Gene)={ edge_index=[2, 15246] },
  (Gene, t11, Disease)={ edge_index=[2, 2558] },
  (Gene, t12, Disease)={ edge_index=[2, 5672] },
  (Microbe, t13, Metabolite)={ edge_index=[2, 470] },
  (Compound, t14, Disease)={ edge_index=[2, 

# Heterogeneous GNN

In [40]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


model = GNN(hidden_channels=64, out_channels=2)
model = to_hetero(model, data.metadata(), aggr='sum')

In [41]:
model(torch.tensor([20]), )

TypeError: forward() missing 1 required positional argument: 'edge_index'

 # TransE

In [42]:
import argparse
import os.path as osp

import torch
import torch.optim as optim

from torch_geometric.datasets import FB15k_237
from torch_geometric.nn import ComplEx, DistMult, RotatE, TransE

In [45]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [53]:
model = TransE(num_nodes=data.num_nodes,
               num_relations=len(data.metadata()[1]),
               hidden_channels=50
               ).to(device)

In [None]:
loader = model.loader(
    head_index=data.edge_index[0],
    rel_type=data.edge_type,
    tail_index=data.edge_index[1],
    batch_size=1000,
    shuffle=True,
)

optimizer = optim.Adam(model.parameters(), lr=0.01)


def train():
    model.train()
    total_loss = total_examples = 0
    for head_index, rel_type, tail_index in loader:
        optimizer.zero_grad()
        loss = model.loss(head_index, rel_type, tail_index)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * head_index.numel()
        total_examples += head_index.numel()
    return total_loss / total_examples


@torch.no_grad()
def test(data):
    model.eval()
    return model.test(
        head_index=data.edge_index[0],
        rel_type=data.edge_type,
        tail_index=data.edge_index[1],
        batch_size=20000,
        k=10,
    )


for epoch in range(1, 501):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    if epoch % 25 == 0:
        rank, mrr, hits = test(val_data)
        print(f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}, '
              f'Val MRR: {mrr:.4f}, Val Hits@10: {hits:.4f}')

rank, mrr, hits_at_10 = test(test_data)
print(f'Test Mean Rank: {rank:.2f}, Test MRR: {mrr:.4f}, '
      f'Test Hits@10: {hits_at_10:.4f}')

# Test

In [71]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import PairNorm
from torch_geometric.data import Data
from torch.optim import Adam

In [72]:
class TransE(nn.Module):
    def __init__(self, num_entities, num_relations, embedding_dim):
        super(TransE, self).__init__()
        self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)
        self.pair_norm = PairNorm()

    def forward(self, edges):
        src, dst, rel = edges.src, edges.dst, edges.data['rel']
        src_embedding = self.entity_embeddings(src)
        dst_embedding = self.entity_embeddings(dst)
        rel_embedding = self.relation_embeddings(rel)
        score = torch.norm(src_embedding + rel_embedding - dst_embedding, p=1, dim=-1)
        return score

In [73]:
num_entities = 3
num_relations = 3
embedding_dim = 64
model = TransE(num_entities, num_relations, embedding_dim)

In [76]:
model.relation_embeddings.weight.shape

torch.Size([3, 64])

In [None]:
# Example data for a simple knowledge graph
edges = torch.tensor([[0, 1, 2], [1, 2, 0]])  # (source, destination, relation)
data = Data(edge_index=edges[:2], edge_type=edges[2])

# Instantiate the model


# Define loss function and optimizer
criterion = nn.MarginRankingLoss(margin=1.0)
optimizer = Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    scores = model(data)
    pos_scores = scores[:len(data.edge_index[0])]
    neg_scores = scores[len(data.edge_index[0]):]
    target = torch.tensor([-1], dtype=torch.float).expand_as(pos_scores).to(pos_scores.device)
    loss = criterion(pos_scores, neg_scores, target)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

# Example querying
entity_embeddings = model.entity_embeddings.weight.detach()
relation_embeddings = model.relation_embeddings.weight.detach()
entity_id = 0
relation_id = 2
target_entity_embedding = entity_embeddings[entity_id]
target_relation_embedding = relation_embeddings[relation_id]
target_embedding = target_entity_embedding + target_relation_embedding
distances = torch.norm(entity_embeddings - target_embedding, dim=1, p=1)
closest_entities = distances.argsort()[:5]  # Get indices of closest entities
print("Closest entities to (entity 0 + relation 2):", closest_entities.tolist())


In [78]:
from torch_geometric.datasets import FB15k_237


path = '/content/sample_data/'
train_data = FB15k_237(path, split='train')[0].to(device)
val_data = FB15k_237(path, split='val')[0].to(device)
test_data = FB15k_237(path, split='test')[0].to(device)

Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237/train.txt
Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237/valid.txt
Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237/test.txt
Processing...
Done!


In [79]:
train_data

Data(edge_index=[2, 272115], edge_type=[272115], num_nodes=14541)