In [1]:
import math
import random
from collections import Counter
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [2]:
train_file = 'WN18/train.txt'
test_file = 'WN18/train.txt'

In [3]:
def load_data(path):
    data = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            r = line.split('\t')
            if len(r) == 3:
                data.append(r)
    return data

In [4]:
train = load_data(train_file)

In [5]:
test = load_data(test_file)

In [7]:
# words实际上是实体+关系的字符串
words = Counter()
for x in train:
    words.update(x)
for x in test:
    words.update(x)

In [8]:
print(len(words))

40961


In [9]:
# 大约是words数量的10倍
n_token = 50 * 10000

In [10]:
class Tokenizer(tf.keras.models.Model):
    
    def __init__(self, num_buckets = 10 * 10000):
        super(Tokenizer, self).__init__()
        self.num_buckets = num_buckets

    def call(self, inputs):
        x = tf.strings.to_hash_bucket_fast(inputs, self.num_buckets)
        x = tf.cast(x, dtype=tf.float32)
        return x

In [12]:
input_real = tf.keras.Input(shape=(None,), dtype='string')
input_fake = tf.keras.Input(shape=(None,), dtype='string')

tok = Tokenizer(n_token)
emb = tf.keras.Sequential([
    tf.keras.Input(shape=(None,), dtype=tf.float32),
    tf.keras.layers.Embedding(n_token, 64)
])

real = input_real
fake = input_fake

real = tok(real)
fake = tok(fake)


head = real[:, 0]
rel = real[:, 1]
tail = real[:, 2]
head = emb(head)
rel = emb(rel)
tail = emb(tail)

distance = head + rel - tail
dis_real = tf.linalg.normalize(distance, axis=1)[1]

head = fake[:, 0]
rel = fake[:, 1]
tail = fake[:, 2]
head = emb(head)
rel = emb(rel)
tail = emb(tail)

distance = head + rel - tail
dis_fake = tf.linalg.normalize(distance, axis=1)[1]

x = dis_real - dis_fake

model = tf.keras.models.Model(
    inputs=[input_real, input_fake],
    outputs=x
)

model.compile(
    optimizer='adam',
    # 自定义loss函数
    # loss(x, y) = max(0, -y * (x1 - x2) + margin)
    # y = -1
    # margin = 1.0
    loss=lambda true, pred: tf.math.maximum(0.0, pred + true)
)

In [16]:
def data_generate(data, batch_size=32):
    """数据生成，每个迭代返回pos和neg两个sample集合，还有margin"""
    entities = {}
    for x in data:
        if x[0] not in entities:
            entities[x[0]] = {}
        if x[2] not in entities:
            entities[x[2]] = {}
        if x[1] not in entities[x[0]]:
            entities[x[0]][x[1]] = []
        if x[1] not in entities[x[2]]:
            entities[x[2]][x[1]] = []
        entities[x[0]][x[1]].append(x[2])
        entities[x[2]][x[1]].append(x[0])
    words = list(entities.keys())
    n_batch = math.ceil(len(data) / batch_size)
    def _get_random(x0, r, x1):
        while True:
            neg = random.choice(words)
            if neg != x0 and neg != x1:
                if neg not in entities[x0][r]:
                    if neg not in entities[x1][r]:
                        return neg
    for i in range(n_batch):
        batch = data[i * batch_size: (i + 1) * batch_size]
        neg = [
            [x0, r, _get_random(x0, r, x1)]
            for x0, r, x1 in batch
        ]
        y = np.array([[1.]] * len(batch))  # margin
        yield [np.array(batch), np.array(neg)], y

In [20]:
batch_size = 1024
n_batch = math.ceil(len(train) / batch_size)
for epoch in range(30):
    pbar = tqdm(data_generate(train, batch_size), total=n_batch)
    losses = []
    for [pos, neg], y in pbar:
        loss = model.train_on_batch([pos, neg], y)
        losses.append(loss)
        pbar.set_description(f'epoch: {epoch} loss: {np.mean(losses):.4f}')

epoch: 0 loss: 0.9707: 100%|██████████| 139/139 [01:03<00:00,  2.17it/s]
epoch: 1 loss: 0.8725: 100%|██████████| 139/139 [01:03<00:00,  2.18it/s]
epoch: 2 loss: 0.7729: 100%|██████████| 139/139 [01:03<00:00,  2.19it/s]
epoch: 3 loss: 0.6872: 100%|██████████| 139/139 [01:04<00:00,  2.16it/s]
epoch: 4 loss: 0.6057: 100%|██████████| 139/139 [01:04<00:00,  2.16it/s]
epoch: 5 loss: 0.5311: 100%|██████████| 139/139 [01:04<00:00,  2.15it/s]
epoch: 6 loss: 0.4607: 100%|██████████| 139/139 [01:05<00:00,  2.14it/s]
epoch: 7 loss: 0.3963: 100%|██████████| 139/139 [01:04<00:00,  2.15it/s]
epoch: 8 loss: 0.3413: 100%|██████████| 139/139 [01:04<00:00,  2.14it/s]
epoch: 9 loss: 0.2956: 100%|██████████| 139/139 [01:04<00:00,  2.16it/s]
epoch: 10 loss: 0.2563: 100%|██████████| 139/139 [01:04<00:00,  2.15it/s]
epoch: 11 loss: 0.2239: 100%|██████████| 139/139 [01:04<00:00,  2.14it/s]
epoch: 12 loss: 0.1976: 100%|██████████| 139/139 [01:04<00:00,  2.15it/s]
epoch: 13 loss: 0.1754: 100%|██████████| 139/139

In [21]:
# Save model
tok.save('/tmp/tok')
emb.save('/tmp/emb')

INFO:tensorflow:Assets written to: /tmp/tok/assets


In [11]:
entities = sorted(words.keys())
rel = [x for x in entities if x.startswith('_')]
entities = [x for x in entities if not x.startswith('_')]

In [12]:
tok = tf.keras.models.load_model('/tmp/tok')
emb = tf.keras.models.load_model('/tmp/emb')

In [13]:
print(len(rel), len(entities))

18 40943


In [14]:
ent_vecs = emb(tok(tf.constant([entities]))).numpy()[0]
rel_vecs = emb(tok(tf.constant([rel]))).numpy()[0]

In [15]:
print(ent_vecs.shape, rel_vecs.shape)

(40943, 64) (18, 64)


In [None]:
# 计算 Hit@10
ret = []
good, bad = 0, 0
pbar = tqdm(test)
for a, b, c in pbar:
    target = emb(tok(tf.constant([[a]]))) \
            + emb(tok(tf.constant([[b]])))
    diss = tf.linalg.normalize(target - ent_vecs, axis=2)[1].numpy().flatten()
    if c in [entities[i] for i in np.argsort(diss)[:10]]:
        good += 1
    else:
        bad += 1
    pbar.set_description(
        f'good: {good}, bad: {bad}, total: {good + bad}, hit@10: {good / (good + bad):.4f}'
    )

good: 7136, bad: 1756, total: 8892, hit@10: 0.8025:   6%|▋         | 8891/141442 [20:27<5:08:28,  7.16it/s]

In [None]:
# https://github.com/thunlp/KB2E
# Hit@10(raw) transE = 75.4 or 78.9
print(good, bad, len(test), good / len(test))