In [1]:
import os
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Convert the raw temporal knowledge-graph edge list into three artefacts:
#   * ml_<DATA_NAME>.csv       - one row per edge: u, i, ts, label, idx
#   * ml_<DATA_NAME>.npy       - one-hot relation features, one row per edge
#   * ml_<DATA_NAME>_node.npy  - all-zero node features
# Node ids and edge indices start at 1; row 0 of both feature matrices is a
# padding row.
# NOTE(review): the file naming looks like the "ml_<name>" layout expected by
# TGN-style loaders - confirm against the consumer of these files.
# ---------------------------------------------------------------------------
DATA_NAME = "smallpedia"
in_csv = "tkgl-smallpedia_edgelist.csv"
out_dir = "processed"
os.makedirs(out_dir, exist_ok=True)

df = pd.read_csv(in_csv)

# Fail early, with a readable message, on a malformed input file. A missing
# head/tail/relation would otherwise be mapped like a regular entity (or blow
# up later in an integer cast), and a missing timestamp has no defined place
# in the temporal order.
required_cols = ["ts", "head", "tail", "relation_type"]
missing_cols = [c for c in required_cols if c not in df.columns]
if missing_cols:
    raise KeyError(f"{in_csv} is missing required column(s): {missing_cols}")

null_counts = df[required_cols].isna().sum()
if null_counts.any():
    raise ValueError(
        f"{in_csv} contains missing values: "
        f"{null_counts[null_counts > 0].to_dict()}"
    )

# sort by time for temporal consistency
# A *stable* sort is used on purpose: edges that share a timestamp keep the
# order they have in the input file. The default (quicksort) gives no
# guarantee for ties, and everything below (edge idx, feature row order,
# first-appearance node/relation ids) depends on the resulting row order.
df = df.sort_values("ts", kind="mergesort").reset_index(drop=True)

# --- node mapping (Qxxx -> 1..N) ---
# Ids follow first appearance in the concatenation of all heads, then all tails.
nodes = pd.Index(pd.concat([df["head"], df["tail"]]).unique())
node2id = {q: i + 1 for i, q in enumerate(nodes)}  # start from 1

df["u"] = df["head"].map(node2id).astype(np.int64)
df["i"] = df["tail"].map(node2id).astype(np.int64)

# --- relation mapping (Pxxx -> 0..R-1) ---
rels = pd.Index(df["relation_type"].unique())
rel2id = {p: j for j, p in enumerate(rels)}
df["rel_id"] = df["relation_type"].map(rel2id).astype(np.int64)

# --- required columns ---
df["label"] = 1
# edge index starts from 1; explicit int64 instead of the default int dtype
df["idx"] = np.arange(1, len(df) + 1, dtype=np.int64)

out_csv = os.path.join(out_dir, f"ml_{DATA_NAME}.csv")
df_out = df[["u", "i", "ts", "label", "idx"]]
df_out.to_csv(out_csv, index=False)

# --- edge features: one-hot relation ---
R = len(rels)
E = len(df)
edge_feat = np.zeros((E + 1, R), dtype=np.float32)  # +1 for padding row 0
# Row `idx` of the matrix describes edge `idx`; exactly one column is set per edge.
edge_feat[df["idx"].to_numpy(), df["rel_id"].to_numpy()] = 1.0

out_edge = os.path.join(out_dir, f"ml_{DATA_NAME}.npy")
np.save(out_edge, edge_feat)

# --- node features: all zeros (choose dimension) ---
node_dim = 1
N = len(nodes)
# N + 1 rows because node ids start from 1, leaving row 0 unused.
node_feat = np.zeros((N + 1, node_dim), dtype=np.float32)

out_node = os.path.join(out_dir, f"ml_{DATA_NAME}_node.npy")
np.save(out_node, node_feat)

print("Saved:", out_csv, out_edge, out_node)
print("Nodes:", N, "Edges:", E, "Rel types:", R)


Saved: processed/ml_smallpedia.csv processed/ml_smallpedia.npy processed/ml_smallpedia_node.npy
Nodes: 47433 Edges: 550376 Rel types: 283
