In [46]:
%load_ext autoreload
%autoreload 2

In [6]:
import numpy as np

In [7]:
a = np.array([[2,1,3],[4,5,6]])

In [8]:
a[:, 2]

array([3, 6])

In [11]:
6 // 2

3

Load Dataset

In [12]:
from KGEmb.datasets.kg_dataset import KGDataset


In [13]:
dataset_path = './KGEmb/data/FB15K'

In [36]:
dataset_path = './KGEmb/data/large_dummy_data'

In [37]:
dataset = KGDataset(dataset_path, debug=False)

In [38]:
dataset.get_shape()


(100, 40, 100)

In [None]:
dataset.get_filters()['lhs'].keys()

In [None]:

args.sizes = dataset.get_shape()

# load data
logging.info("\t " + str(dataset.get_shape()))
train_examples = dataset.get_examples("train")
valid_examples = dataset.get_examples("valid")
test_examples = dataset.get_examples("test")
filters = 

# Dummy Dataset

In [23]:
import numpy as np
import pickle as pkl
import os

# Create dummy directory
os.makedirs("KGEmb/data/dummy_data", exist_ok=True)

# Sample triples (head, relation, tail)
dummy_triples = {
    "train": np.array([
        [0, 0, 1],  # Paris (0) - capital_of (0) - France (1)
        [2, 0, 3],  # Berlin (2) - capital_of - Germany (3)
    ], dtype=np.int64),
    
    "valid": np.array([
        [4, 0, 5]   # Rome (4) - capital_of - Italy (5)
    ], dtype=np.int64),
    
    "test": np.array([
        [6, 0, 7]   # Madrid (6) - capital_of - Spain (7)
    ], dtype=np.int64)
}

# Create filters (using process.py logic)
all_triples = np.concatenate(list(dummy_triples.values()))
n_relations = 1  # Only capital_of relation

def create_filters(triples, n_rels):
    filters = {"lhs": {}, "rhs": {}}
    for h, r, t in triples:
        # Right-hand filters (h, r) -> [t]
        key = (h, r)
        filters["rhs"].setdefault(key, []).append(t)
        
        # Left-hand filters (t, r + n_rels) -> [h]
        inv_r = r + n_rels
        inv_key = (t, inv_r)
        filters["lhs"].setdefault(inv_key, []).append(h)
    return filters

filters = create_filters(all_triples, n_relations)

# Save files
for split in ["train", "valid", "test"]:
    with open(f"KGEmb/data/dummy_data/{split}.pickle", "wb") as f:
        pkl.dump(dummy_triples[split], f)

with open("KGEmb/data/dummy_data/to_skip.pickle", "wb") as f:
    pkl.dump(filters, f)

In [34]:
import numpy as np
import pickle as pkl
import os
from collections import defaultdict

# Configuration
ENTITIES = 100
RELATIONS = 20
SPLIT_SIZES = {
    'train': 500,
    'valid': 100,
    'test': 100
}
DATA_PATH = "KGEmb/data/large_dummy_data"

def generate_dummy_dataset():
    os.makedirs(DATA_PATH, exist_ok=True)
    
    # Generate random triples
    dataset = {}
    for split, size in SPLIT_SIZES.items():
        heads = np.random.randint(0, ENTITIES, size)
        tails = np.random.randint(0, ENTITIES, size)
        rels = np.random.randint(0, RELATIONS, size)
        dataset[split] = np.stack([heads, rels, tails], axis=1).astype(np.int64)
    
    # Create filters
    all_triples = np.concatenate(list(dataset.values()))
    filters = {'lhs': defaultdict(list), 'rhs': defaultdict(list)}
    
    for h, r, t in all_triples:
        # Right-hand filters
        filters['rhs'][(h, r)].append(t)
        # Left-hand filters (inverse relations)
        filters['lhs'][(t, r + RELATIONS)].append(h)
    
    # Save files
    for split in SPLIT_SIZES:
        with open(f"{DATA_PATH}/{split}.pickle", "wb") as f:
            pkl.dump(dataset[split], f)
    
    with open(f"{DATA_PATH}/to_skip.pickle", "wb") as f:
        pkl.dump(filters, f)

generate_dummy_dataset()

# Load saved KGEmb model

In [73]:
import torch
from KGEmb.models.hyperbolic import AttH

In [71]:
class ModelArgs:
    def __init__(self, dataset):
        """Knowledge Graph Embedding Model Configuration"""
        self.rank = 1000                # Embedding dimension
        self.learning_rate = 1e-1       # Initial learning rate
        self.batch_size = 1000          # Training batch size
        self.reg = 0.1                  # Regularization strength
        self.max_epochs = 10            # Maximum training epochs
        self.patience = 20              # Early stopping patience
        self.debug = False              # Debug mode flag
        self.dtype = 'double'           # Data type (float32/double)
        self.neg_sample_size = 50       # Negative samples per positive
        self.double_neg = True          # Use double negative sampling
        self.bias = 'constant'          # Bias type in model
        self.init_size = 1e-3           # Embedding initialization scale
        self.multi_c = True             # Multiple curvatures (hyperbolic)
        self.dropout = 0                # Dropout rate
        self.sizes = dataset.get_shape()# (n_entities, n_relations) from dataset
        self.gamma = 0

    def __repr__(self):
        return f"ModelArgs({vars(self)})"
model_args = ModelArgs(dataset)

In [74]:
model = AttH(model_args)
model_path = '/Users/nathan/Documents/projects/curve_rag/KGEmb/logs/04_20/large_dummy_data/AttH_19_42_15/model.pt'
model.load_state_dict(torch.load(model_path, weights_only=True))
model.eval()

AttH(
  (entity): Embedding(100, 1000)
  (rel): Embedding(40, 1000)
  (bh): Embedding(100, 1)
  (bt): Embedding(100, 1)
  (rel_diag): Embedding(40, 2000)
  (context_vec): Embedding(40, 1000)
  (act): Softmax(dim=1)
)

In [82]:
# Get all entity embeddings (n_entities x embedding_dim)
entity_embeddings = model.entity.weight.data.cpu().numpy()

# Get relation embeddings (n_relations x embedding_dim)
relation_embeddings = model.rel.weight.data.cpu().numpy()

# Save to file
import numpy as np
os.makedirs("./embeddings", exist_ok=True)

np.save("./embeddings/entity_emb.npy", entity_embeddings)
np.save("./embeddings/relation_emb.npy", relation_embeddings)

In [76]:
entity_embeddings

array([[-0.10112191,  0.10021676,  0.10050033, ...,  0.08976474,
         0.09802748, -0.10050702],
       [-0.09734707, -0.09873435,  0.09664864, ..., -0.09972362,
         0.10046091,  0.09921627],
       [ 0.09772456, -0.10011804,  0.09926715, ...,  0.09629074,
         0.09508758, -0.10133961],
       ...,
       [-0.09776815, -0.09312213, -0.09745788, ...,  0.07169162,
        -0.08136066,  0.09823062],
       [ 0.10166223, -0.0786465 , -0.10073881, ..., -0.09493197,
        -0.05466219,  0.09033944],
       [ 0.10023003, -0.09891245,  0.09755384, ..., -0.10133544,
        -0.09676388,  0.0957422 ]], shape=(100, 1000))

In [97]:
import torch
model.eval()

# Example prediction: (h, r) -> predict t
def predict_tail(h, r, top_k=5):
    with torch.no_grad():
        scores, _ = model.get_queries(torch.tensor([[h, r, 0]]))
        #print(type(scores[0]), scores)
        scores = scores[0]
        values, indices = torch.topk(scores, k=top_k)
        return [(idx, score.item()) 
                for idx, score in zip(indices[0], values[0])]

# Usage
predictions = predict_tail(h=0, r=0)  # (Paris, capital_of) -> top 5 countries

In [99]:
predictions

[(tensor(74), 0.05480053575637172),
 (tensor(50), 0.054452233924053406),
 (tensor(21), 0.05377454100519527),
 (tensor(584), 0.05350505764806299),
 (tensor(541), 0.053469977380110126)]