In [1]:
# Mount Google Drive under /content/drive (uses the Colab-specific google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch
!pip install transformers
!pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [4]:
from datasets import load_dataset

# Load the 'release_v1' configuration of the "web_nlg" dataset.
# NOTE(review): `data` is not referenced by any later visible cell — confirm it is still needed.
data = load_dataset("web_nlg", 'release_v1')

Downloading data:   0%|          | 0.00/2.30M [00:00<?, ?B/s]

Generating full split:   0%|          | 0/14237 [00:00<?, ? examples/s]

In [None]:
# dataset -> WN18RR, FB15k-237

# Embedding

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [None]:
class TransE(Model):
    """Translation-based embedding model: a triple is scored by ``||h + r - t||_p``.

    NOTE(review): the ``Model`` base class is not defined in the visible
    notebook; this class relies on it to expose ``self.ent_tot`` and
    ``self.rel_tot`` after ``super().__init__(ent_tot, rel_tot)`` — confirm.

    Args:
        ent_tot: number of entities (rows of the entity embedding table).
        rel_tot: number of relations (rows of the relation embedding table).
        dim: embedding dimension.
        p_norm: order of the norm applied to ``h + r - t``.
        norm_flag: when true, L2-normalise h, r and t before scoring.
        margin: optional margin; when given, ``forward`` returns ``margin - score``.
        epsilon: optional constant; together with ``margin`` it selects the
            uniform initialisation range ``(margin + epsilon) / dim``.
    """

    def __init__(self, ent_tot, rel_tot, dim = 100, p_norm = 1, norm_flag = True, margin = None, epsilon = None):
        super(TransE, self).__init__(ent_tot, rel_tot)

        self.dim = dim
        self.margin = margin
        self.epsilon = epsilon
        self.norm_flag = norm_flag
        self.p_norm = p_norm

        self.ent_embeddings = nn.Embedding(self.ent_tot, self.dim)
        self.rel_embeddings = nn.Embedding(self.rel_tot, self.dim)

        if margin is None or epsilon is None:
            # Without both hyper-parameters fall back to Xavier initialisation.
            nn.init.xavier_uniform_(self.ent_embeddings.weight.data)
            nn.init.xavier_uniform_(self.rel_embeddings.weight.data)
        else:
            # Kept as a frozen parameter so it is stored with the state dict.
            self.embedding_range = nn.Parameter(
                torch.Tensor([(self.margin + self.epsilon) / self.dim]), requires_grad=False
            )
            bound = self.embedding_range.item()
            nn.init.uniform_(tensor=self.ent_embeddings.weight.data, a=-bound, b=bound)
            nn.init.uniform_(tensor=self.rel_embeddings.weight.data, a=-bound, b=bound)

        if margin is not None:
            # Replace the plain number by a frozen parameter of the module.
            self.margin = nn.Parameter(torch.Tensor([margin]))
            self.margin.requires_grad = False
            self.margin_flag = True
        else:
            self.margin_flag = False

    def _calc(self, h, t, r, mode):
        """Return the flattened p-norm of ``h + r - t`` taken over the last axis.

        For any ``mode`` other than ``'normal'`` the three inputs are first
        reshaped to ``(-1, r.shape[0], last_dim)`` so that they broadcast
        against each other.
        """
        if self.norm_flag:
            h = F.normalize(h, 2, -1)
            r = F.normalize(r, 2, -1)
            t = F.normalize(t, 2, -1)
        if mode != 'normal':
            h = h.view(-1, r.shape[0], h.shape[-1])
            t = t.view(-1, r.shape[0], t.shape[-1])
            r = r.view(-1, r.shape[0], r.shape[-1])
        if mode == 'head_batch':
            # Same quantity as below, associated so that (r - t) is formed first.
            score = h + (r - t)
        else:
            score = (h + r) - t
        return torch.norm(score, self.p_norm, -1).flatten()

    def forward(self, data):
        """Score the triples described by ``data``.

        Args:
            data: mapping with index tensors ``'batch_h'``, ``'batch_t'``,
                ``'batch_r'`` and a ``'mode'`` string.

        Returns:
            ``margin - score`` when a margin was configured, else ``score``.
        """
        h = self.ent_embeddings(data['batch_h'])
        t = self.ent_embeddings(data['batch_t'])
        r = self.rel_embeddings(data['batch_r'])
        score = self._calc(h, t, r, data['mode'])
        if self.margin_flag:
            return self.margin - score
        return score

    def regularization(self, data):
        """Return the average of the mean squared h, t and r embeddings of the batch."""
        h = self.ent_embeddings(data['batch_h'])
        t = self.ent_embeddings(data['batch_t'])
        r = self.rel_embeddings(data['batch_r'])
        return (torch.mean(h ** 2) +
                torch.mean(t ** 2) +
                torch.mean(r ** 2)) / 3

    def predict(self, data):
        """Return the raw norm score of each triple as a NumPy array."""
        score = self.forward(data)
        if self.margin_flag:
            # forward() returned margin - score; subtracting again recovers score.
            score = self.margin - score
        return score.detach().cpu().numpy()

In [None]:
# embedding lookup -> triple score calculation (model-specific) -> return; the embedding vectors are learned through this process.

In [None]:
import torch
import torch.nn as nn
from .Model import Model # ?

In [None]:
class DistMult(Model):
    """Bilinear-diagonal embedding model: a triple is scored by ``sum(h * r * t)``.

    NOTE(review): the ``Model`` base class is not defined in the visible
    notebook; this class relies on it to expose ``self.ent_tot`` and
    ``self.rel_tot`` after ``super().__init__(ent_tot, rel_tot)`` — confirm.

    Args:
        ent_tot: number of entities (rows of the entity embedding table).
        rel_tot: number of relations (rows of the relation embedding table).
        dim: embedding dimension.
        margin: optional constant, only used to pick the initialisation range.
        epsilon: optional constant; with ``margin`` it selects the uniform
            initialisation range ``(margin + epsilon) / dim``.
    """

    def __init__(self, ent_tot, rel_tot, dim = 100, margin = None, epsilon = None):
        super(DistMult, self).__init__(ent_tot, rel_tot)

        self.dim = dim
        self.margin = margin
        self.epsilon = epsilon
        self.ent_embeddings = nn.Embedding(self.ent_tot, self.dim)
        self.rel_embeddings = nn.Embedding(self.rel_tot, self.dim)

        if margin is None or epsilon is None:
            # Without both hyper-parameters fall back to Xavier initialisation.
            nn.init.xavier_uniform_(self.ent_embeddings.weight.data)
            nn.init.xavier_uniform_(self.rel_embeddings.weight.data)
        else:
            # Kept as a frozen parameter so it is stored with the state dict.
            self.embedding_range = nn.Parameter(
                torch.Tensor([(self.margin + self.epsilon) / self.dim]), requires_grad=False
            )
            bound = self.embedding_range.item()
            nn.init.uniform_(tensor=self.ent_embeddings.weight.data, a=-bound, b=bound)
            nn.init.uniform_(tensor=self.rel_embeddings.weight.data, a=-bound, b=bound)

    def _calc(self, h, t, r, mode):
        """Return the flattened sum over the last axis of the product ``h * r * t``.

        For any ``mode`` other than ``'normal'`` the three inputs are first
        reshaped to ``(-1, r.shape[0], last_dim)`` so that they broadcast
        against each other.
        """
        if mode != 'normal':
            h = h.view(-1, r.shape[0], h.shape[-1])
            t = t.view(-1, r.shape[0], t.shape[-1])
            r = r.view(-1, r.shape[0], r.shape[-1])
        if mode == 'head_batch':
            # Same product as below, associated so that (r * t) is formed first.
            score = h * (r * t)
        else:
            score = (h * r) * t
        return torch.sum(score, -1).flatten()

    def forward(self, data):
        """Score the triples described by ``data``.

        Args:
            data: mapping with index tensors ``'batch_h'``, ``'batch_t'``,
                ``'batch_r'`` and a ``'mode'`` string.
        """
        h = self.ent_embeddings(data['batch_h'])
        t = self.ent_embeddings(data['batch_t'])
        r = self.rel_embeddings(data['batch_r'])
        return self._calc(h, t, r, data['mode'])

    def regularization(self, data):
        """Return the average of the mean squared h, t and r embeddings of the batch."""
        h = self.ent_embeddings(data['batch_h'])
        t = self.ent_embeddings(data['batch_t'])
        r = self.rel_embeddings(data['batch_r'])
        return (torch.mean(h ** 2) + torch.mean(t ** 2) + torch.mean(r ** 2)) / 3

    def l3_regularization(self):
        """Return the sum of the cubed 3-norms of both full embedding tables."""
        return (self.ent_embeddings.weight.norm(p = 3)**3 + self.rel_embeddings.weight.norm(p = 3)**3)

    def predict(self, data):
        """Return the negated forward score of each triple as a NumPy array."""
        score = -self.forward(data)
        return score.detach().cpu().numpy()

In [None]:
import torch
import torch.nn as nn
from .Model import Model

In [None]:
# complex space -> real dim, imaginary dim
class ComplEx(Model):
    """Complex-valued embedding model with separate real and imaginary tables.

    A triple is scored by the real part of ``h * r * conj(t)`` summed over
    the embedding dimension.

    NOTE(review): the ``Model`` base class is not defined in the visible
    notebook; this class relies on it to expose ``self.ent_tot`` and
    ``self.rel_tot`` after ``super().__init__(ent_tot, rel_tot)`` — confirm.

    Args:
        ent_tot: number of entities.
        rel_tot: number of relations.
        dim: size of each of the real and imaginary embedding parts.
    """

    def __init__(self, ent_tot, rel_tot, dim = 100):
        super(ComplEx, self).__init__(ent_tot, rel_tot)

        self.dim = dim
        self.ent_re_embeddings = nn.Embedding(self.ent_tot, self.dim)
        self.ent_im_embeddings = nn.Embedding(self.ent_tot, self.dim)
        self.rel_re_embeddings = nn.Embedding(self.rel_tot, self.dim)
        self.rel_im_embeddings = nn.Embedding(self.rel_tot, self.dim)

        for table in (self.ent_re_embeddings, self.ent_im_embeddings,
                      self.rel_re_embeddings, self.rel_im_embeddings):
            nn.init.xavier_uniform_(table.weight.data)

    def _entity(self, index):
        """Return the (real, imaginary) embeddings of the given entity indices."""
        return self.ent_re_embeddings(index), self.ent_im_embeddings(index)

    def _relation(self, index):
        """Return the (real, imaginary) embeddings of the given relation indices."""
        return self.rel_re_embeddings(index), self.rel_im_embeddings(index)

    def _calc(self, h_re, h_im, t_re, t_im, r_re, r_im):
        """Return Re(h * r * conj(t)) summed over the last axis.

        Expanding (h_re + i*h_im)(r_re + i*r_im)(t_re - i*t_im) and keeping
        the real part gives exactly the four terms below.
        """
        return torch.sum(
            h_re * t_re * r_re
            + h_im * t_im * r_re
            + h_re * t_im * r_im
            - h_im * t_re * r_im,
            -1
        )

    def forward(self, h, r, t, n):
        """Score a batch of triples and their two corrupted variants.

        Args:
            h: head entity indices.
            r: relation indices.
            t: tail entity indices.
            n: entity indices used as the corrupting (negative) entity.

        Returns:
            ``(pos_score, neg_score, neg_score_tail)`` — the scores of
            ``(h, r, t)``, ``(n, r, t)`` and ``(h, r, n)`` respectively.
        """
        h_re, h_im = self._entity(h)
        t_re, t_im = self._entity(t)
        r_re, r_im = self._relation(r)
        n_re, n_im = self._entity(n)
        pos_score = self._calc(h_re, h_im, t_re, t_im, r_re, r_im)
        # Head replaced by the negative entity.
        neg_score = self._calc(n_re, n_im, t_re, t_im, r_re, r_im)
        # Tail replaced by the negative entity; the negative is an entity, so
        # it belongs in the tail slot and never in the relation slot.
        neg_score_tail = self._calc(h_re, h_im, n_re, n_im, r_re, r_im)
        return pos_score, neg_score, neg_score_tail

    def regularization(self, data):
        """Return the average of the mean squared real/imaginary h, t and r embeddings."""
        h_re, h_im = self._entity(data['batch_h'])
        t_re, t_im = self._entity(data['batch_t'])
        r_re, r_im = self._relation(data['batch_r'])
        parts = (h_re, h_im, t_re, t_im, r_re, r_im)
        return sum(torch.mean(part ** 2) for part in parts) / 6

    def predict(self, data):
        """Return the negated score of each triple in ``data`` as a NumPy array.

        Args:
            data: mapping with index tensors ``'batch_h'``, ``'batch_t'`` and
                ``'batch_r'`` (the same keys ``regularization`` reads).
        """
        # forward() takes four index tensors and returns a tuple, so the
        # single-triple score is computed directly from the lookup tables.
        h_re, h_im = self._entity(data['batch_h'])
        t_re, t_im = self._entity(data['batch_t'])
        r_re, r_im = self._relation(data['batch_r'])
        score = -self._calc(h_re, h_im, t_re, t_im, r_re, r_im)
        return score.detach().cpu().numpy()

In [None]:
import torch, os
import json

# Build the entity/relation vocabularies from the three tab-separated split files.
# NOTE(review): the DataSet class in this notebook reads 'ent2id.pt', 'id2ent.pt',
# 'rel2id.pt' and 'id2rel.pt' with torch.load, and integer-encoded '*2id.txt' files;
# no visible cell writes them — confirm where those artefacts are produced.
SPLIT_FILES = ('train.txt', 'valid.txt', 'test.txt')


def _collect_symbols(split_paths):
    """Return ``(entities, relations)`` sets gathered from the given triple files.

    Each non-blank line must hold at least three tab-separated fields; the
    first and third are recorded as entities and the second as a relation.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    entities, relations = set(), set()
    for path in split_paths:
        with open(path, 'r', encoding='utf-8') as f:
            for line_no, raw in enumerate(f, start=1):
                raw = raw.strip()
                if not raw:
                    # Blank lines carry no triple; skipping them avoids an IndexError.
                    continue
                fields = raw.split('\t')
                if len(fields) < 3:
                    raise ValueError(
                        '{}:{}: expected 3 tab-separated fields, got {}'.format(
                            path, line_no, len(fields)))
                entities.add(fields[0])
                relations.add(fields[1])
                entities.add(fields[2])
    return entities, relations


_entities, _relations = _collect_symbols(SPLIT_FILES)

# Sorting makes the id assignment deterministic across runs.
id2ent = sorted(_entities)
id2rel = sorted(_relations)

ent2id = {name: idx for idx, name in enumerate(id2ent)}
rel2id = {name: idx for idx, name in enumerate(id2rel)}

In [None]:
from torch.utils.data import Dataset
import torch
import json
import numpy as np

class DataSet(Dataset):
    """Integer-encoded triples read from a tab-separated file, with one negative each.

    Every non-blank line of ``file_path`` supplies ``head``, ``rel`` and
    ``tail`` ids (fields 0, 1 and 2). For each triple one entity id is drawn
    uniformly at random when the file is read and kept as its negative.

    Args:
        file_path: path of the tab-separated file of integer ids.
    """

    def __init__(self, file_path):
        self.len = 0
        self.head = []
        self.rel = []
        self.tail = []
        self.triple = []
        self.negative = []
        # NOTE(review): torch.load unpickles its input; only load vocabulary
        # files from a trusted source. Paths are relative to the working directory.
        self.ent2id = torch.load('ent2id.pt')
        self.id2ent = torch.load('id2ent.pt')
        self.rel2id = torch.load('rel2id.pt')
        self.id2rel = torch.load('id2rel.pt')
        self.ent_tot = len(self.id2ent)
        self.rel_tot = len(self.id2rel)
        with open(file_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line holds no triple; int('') would raise ValueError.
                    continue
                fields = line.split('\t')
                self.head.append(int(fields[0]))
                self.rel.append(int(fields[1]))
                self.tail.append(int(fields[2]))
                # The negative is sampled once here, not per epoch.
                self.negative.append(np.random.randint(0, self.ent_tot))
                self.len += 1

    def __len__(self):
        """Return the number of triples that were read."""
        return self.len

    def __getitem__(self, idx):
        """Return ``(head, rel, tail, negative)`` for the triple at ``idx``."""
        return self.head[idx], self.rel[idx], self.tail[idx], self.negative[idx]

In [None]:
# 0 23 17000 -> positive (true) triple: its score should be high
# 0 23 74000 -> negative (corrupted) triple: its score should be low

In [None]:
def main():
    """Train the TransE model defined in this notebook and save its weights.

    Reads the triples from 'data/train2id.txt' and 'data/valid2id.txt',
    trains with a margin ranking loss, appends the mean loss of every epoch
    to 'transe_log.txt' and writes the state dict to 'transe.pt'.

    NOTE(review): `args` and `save_embeddings` are not defined in the visible
    notebook — confirm they exist before this cell is run. `opts` must provide
    batch_size, optimizer, lr, weight_decay, margin, epochs and save_step.

    Raises:
        ValueError: if ``opts.optimizer`` is neither 'Adam' nor 'SGD'.
    """
    # Local imports: no visible cell imports these names.
    import time
    from torch import optim
    from torch.utils.data import DataLoader

    opts = args
    # Fall back to the CPU so the cell also runs on a runtime without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("load data ...")
    train_data = DataSet('data/train2id.txt')
    train_loader = DataLoader(train_data, shuffle=True, batch_size=opts.batch_size)
    valid_data = DataSet('data/valid2id.txt')
    # Built from the validation triples; not consumed yet (no evaluation loop).
    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=opts.batch_size)

    print("load model ...")
    # TransE takes (ent_tot, rel_tot, ...); the margin stays in the loss below,
    # so forward() returns the plain norm score.
    model = TransE(train_data.ent_tot, train_data.rel_tot)
    if opts.optimizer == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=opts.lr, weight_decay=opts.weight_decay)
    elif opts.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=opts.lr)
    else:
        raise ValueError("unsupported optimizer: {!r}".format(opts.optimizer))
    model.to(device)
    criterion = nn.MarginRankingLoss(margin=opts.margin).to(device)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=500, gamma=0.5, last_epoch=-1)

    print("start training")
    for epoch in range(1, opts.epochs + 1):
        print("epoch : " + str(epoch))
        model.train()
        epoch_start = time.time()
        epoch_loss = []
        tot = 0
        for batch_data in train_loader:
            optimizer.zero_grad()
            batch_h, batch_r, batch_t, batch_n = (
                torch.as_tensor(part, dtype=torch.long).to(device) for part in batch_data
            )
            pos_score = model({'batch_h': batch_h, 'batch_t': batch_t,
                               'batch_r': batch_r, 'mode': 'normal'})
            # NOTE(review): the sampled entity replaces the head here — confirm
            # whether tail corruption is wanted as well.
            neg_score = model({'batch_h': batch_n, 'batch_t': batch_t,
                               'batch_r': batch_r, 'mode': 'normal'})
            # Target -1 asks for pos_score to be smaller than neg_score by the margin.
            target = -torch.ones_like(pos_score)
            train_loss = criterion(pos_score, neg_score, target)
            train_loss.backward()
            optimizer.step()
            batch_loss = train_loss.item()
            epoch_loss.append(batch_loss)
            tot += batch_h.size(0)
            print('\r{:>10} epoch {} progress {} loss: {}\n'.format('', epoch, tot / len(train_data),
                                                                    batch_loss), end='')
        scheduler.step()
        time_used = time.time() - epoch_start
        # max(..., 1) keeps an empty loader from dividing by zero.
        mean_loss = sum(epoch_loss) / max(len(epoch_loss), 1)
        print('one epoch time: {} minutes'.format(time_used / 60))
        print('{} epochs'.format(epoch))
        print('epoch {} loss: {}'.format(epoch, mean_loss))

        with open('transe_log.txt', 'a') as f:
            f.write('loss : ' + str(mean_loss) + '\n')

        if epoch % opts.save_step == 0:
            print("save model...")
            torch.save(model.state_dict(), 'transe.pt')

    # Saving happens only here, once the model exists and has been trained.
    print("save model...")
    torch.save(model.state_dict(), 'transe.pt')
    print("[Saving embeddings of whole entities & relations...]")
    save_embeddings(model, opts, train_data.id2ent, train_data.id2rel)
    print("[Embedding results are saved successfully.]")

if __name__ == '__main__':
    main()