In [4]:
pip install dgl

Collecting dgl
  Downloading dgl-1.1.3-cp310-cp310-manylinux1_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-1.1.3


In [5]:
from dgl.data import FB15k237Dataset

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [6]:
# Instantiate the FB15k-237 dataset. On this run the archive was downloaded,
# extracted and cached, as the cell output shows.
dataset = FB15k237Dataset()

Downloading /root/.dgl/FB15k-237.tgz from https://data.dgl.ai/dataset/FB15k-237.tgz...
Extracting file to /root/.dgl/FB15k-237_40695531
# entities: 14541
# relations: 237
# training edges: 272115
# validation edges: 17535
# testing edges: 20466
Done saving data into cached files.


In [18]:
# Re-create the dataset object and take its first item. The code below reads
# that item's edge endpoints via data.edges() and its edge types via
# data.edata['etype'].
dataset = FB15k237Dataset()
data = dataset[0]

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset exposing a graph's edges as (head, label, tail) triples.

    Args:
        data: Object providing ``edges()`` (returning a ``(head, tail)`` pair)
            and ``edata['etype']`` (the per-edge label). All three are assumed
            to be tensors of the same length.
    """

    def __init__(self, data):
        self.head, self.tail = data.edges()
        self.labels = data.edata['etype']

    def __len__(self):
        """Return the number of edges (triples)."""
        return len(self.head)

    def __getitem__(self, idx):
        """Return the ``idx``-th triple as ``(head, label, tail)``.

        The stored tensors are indexed directly. Wrapping an existing tensor
        in ``torch.tensor(...)`` would copy it and trigger PyTorch's
        copy-construct ``UserWarning`` for every single item.
        """
        return self.head[idx], self.labels[idx], self.tail[idx]

# Wrap the graph's triples and serve them in shuffled mini-batches of 64.
dataset = CustomDataset(data)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [14]:
# Pull the edge labels and the head/tail endpoints out of data.
label = data.edata['etype']
head, tail = data.edges()


In [None]:
import numpy as np
import torch.nn.init as init
import math
import torch.nn.functional as F
import torch.optim as optim

class TransE(nn.Module):
    """TransE knowledge-graph embedding model.

    Every entity and every relation type gets a ``k``-dimensional vector. A
    triple ``(h, l, t)`` is scored with the squared L2 distance
    ``||h + l - t||^2``, and training minimises a margin ranking loss between
    observed triples and corrupted ones (head or tail replaced).

    Args:
        k: Embedding dimension.
        gamma: Margin of the ranking loss.
        dataloader: Iterable of batches. A batch is either a
            ``(head, label, tail)`` sequence of index tensors or a dict with
            the keys ``'head'``, ``'label'`` and ``'tail'``.
        graphs: Graph object exposing ``edges()``, ``edata['etype']`` and
            ``num_nodes()``.
        epochs: Number of passes over ``dataloader`` made by :meth:`fit`.
        learning_rate: Step size of the Adam optimiser used by :meth:`fit`.
    """

    def __init__(self, k = 50, gamma = 1, dataloader = dataloader, graphs = data, epochs = 1000, learning_rate = 0.01):
        super(TransE, self).__init__()

        # Read from the graph that was passed in, not from the notebook-global
        # ``data``, so that the ``graphs`` argument is actually honoured.
        self.head, self.tail = graphs.edges()
        self.labels = graphs.edata['etype']
        self.k = k
        self.gamma = gamma
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.dataloader = dataloader

        self.num_entity = graphs.num_nodes()
        # One embedding row per relation *type*, not per edge. Relation ids
        # are assumed to be 0-based integers.
        self.num_label = int(self.labels.max().item()) + 1 if self.labels.numel() > 0 else 0

        bound = 6 / math.sqrt(self.k)
        entity_init = init.uniform_(torch.empty(self.num_entity, self.k), -bound, bound)
        label_init = init.uniform_(torch.empty(self.num_label, self.k), -bound, bound)

        # Entities start un-normalised; fit() re-normalises them before every
        # update. Relation vectors are normalised once, here.
        self.entity = nn.Parameter(entity_init, requires_grad=True)
        self.label = nn.Parameter(F.normalize(label_init, p=2, dim=1), requires_grad=True)

        # Negative-sampling pools: entities that occur as a head / as a tail
        # somewhere in the graph. Corrupting with entities that do take part
        # in relations is more likely to give hard negatives than corrupting
        # with arbitrary ids. Stored as non-persistent buffers so they follow
        # the module across devices without being written to checkpoints.
        self.register_buffer('candidate_heads', torch.unique(self.head).long(), persistent=False)
        self.register_buffer('candidate_tails', torch.unique(self.tail).long(), persistent=False)

    def dissimiarity(self, h, l, t):
        """Return ``||h + l - t||^2`` reduced over the last dimension.

        Works for single vectors of shape ``(k,)`` (returns a scalar tensor)
        and for batches of shape ``(batch, k)`` (returns ``(batch,)``).
        """
        return torch.sum(torch.pow(h + l - t, 2), dim=-1)

    # Correctly spelled alias; the original name is kept for existing callers.
    dissimilarity = dissimiarity

    def _corrupt(self, candidates, true_ids):
        """Draw one replacement id from ``candidates`` per entry of ``true_ids``.

        If a draw equals the true id, the next candidate (cyclically) is used
        instead. Candidates are unique, so the result differs from the true id
        whenever more than one candidate exists. With an empty pool the true
        ids are returned unchanged.
        """
        true_ids = torch.as_tensor(true_ids, dtype=torch.long, device=candidates.device)
        num_candidates = candidates.numel()
        if num_candidates == 0:
            return true_ids
        draw = torch.randint(num_candidates, true_ids.shape, device=candidates.device)
        clash = candidates[draw] == true_ids
        draw = torch.where(clash, (draw + 1) % num_candidates, draw)
        return candidates[draw]

    def corrupted_tail(self, h, t):
        """Return embeddings of randomly chosen replacement tails.

        Args:
            h: Head entity ids (unused; kept for interface compatibility).
            t: True tail entity ids that must be replaced.
        """
        return self.entity[self._corrupt(self.candidate_tails, t)]

    def corrupted_head(self, h, t):
        """Return embeddings of randomly chosen replacement heads.

        Args:
            h: True head entity ids that must be replaced.
            t: Tail entity ids (unused; kept for interface compatibility).
        """
        return self.entity[self._corrupt(self.candidate_heads, h)]

    def forward(self, batch):
        """Compute the margin ranking loss for one batch of triples.

        Args:
            batch: ``(head, label, tail)`` index tensors, or a dict with the
                keys ``'head'``, ``'label'`` and ``'tail'``.

        Returns:
            Scalar tensor: the mean of the head-corrupted and tail-corrupted
            hinge losses, each summed over the batch.
        """
        if isinstance(batch, dict):
            head, label, tail = batch['head'], batch['label'], batch['tail']
        else:
            head, label, tail = batch

        device = self.entity.device
        head = torch.as_tensor(head, dtype=torch.long, device=device)
        label = torch.as_tensor(label, dtype=torch.long, device=device)
        tail = torch.as_tensor(tail, dtype=torch.long, device=device)

        # The embeddings are plain parameter matrices, so a lookup is an index
        # operation, not a call.
        h = self.entity[head]  # (batch_size, k)
        t = self.entity[tail]  # (batch_size, k)
        l = self.label[label]  # (batch_size, k)

        # Whole-batch tensor ops; no per-sample Python loop is needed.
        neg_t = self.corrupted_tail(head, tail)
        neg_h = self.corrupted_head(head, tail)

        pos_d = self.dissimiarity(h, l, t)
        neg_d_h = self.dissimiarity(neg_h, l, t)  # head replaced
        neg_d_t = self.dissimiarity(h, l, neg_t)  # tail replaced

        # Both corruptions are used for every triple instead of choosing one
        # at random. The hinge (relu) keeps the loss bounded below: triples
        # that already satisfy the margin contribute nothing.
        loss1 = torch.sum(F.relu(self.gamma + pos_d - neg_d_h))
        loss2 = torch.sum(F.relu(self.gamma + pos_d - neg_d_t))

        return (loss1 + loss2) / 2

    def fit(self):
        """Train the embeddings with Adam for ``self.epochs`` epochs.

        Entity embeddings are L2-normalised before every mini-batch update.
        The average batch loss is printed after each epoch.

        Returns:
            List with the average batch loss of every epoch.
        """
        train_loss = []
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        # Guard against an empty dataloader when averaging.
        num_batches = max(len(self.dataloader), 1)

        for epoch in range(self.epochs):
            total_loss = 0.0
            for batch_data in self.dataloader:
                with torch.no_grad():
                    self.entity.copy_(F.normalize(self.entity, p=2, dim=1))

                optimizer.zero_grad()
                loss = self(batch_data)
                loss.backward()
                # The step has to happen once per batch; otherwise the
                # gradients are zeroed again before they are ever applied.
                optimizer.step()
                total_loss += loss.item()

            average_loss = total_loss / num_batches
            print(f'Epoch {epoch + 1}/{self.epochs}, Average Loss: {average_loss:.4f}')
            train_loss.append(average_loss)

        return train_loss

