In [1]:
import pandas as pd
import numpy as np


In [2]:
# Edge list of the graph. Per the preview below, the columns are
# ts, head, tail, relation_type plus an "Unnamed: 0" column.
edgelist_csv = "tkgl-smallpedia_edgelist.csv"
df = pd.read_csv(edgelist_csv)
df.head()

Unnamed: 0,ts,head,tail,relation_type
0,1900,Q648,Q163700,P166
1,1900,Q28003,Q18425,P1346
2,1900,Q44949,Q458,P17
3,1900,Q48246,Q756994,P166
4,1900,Q77203,Q61769,P185


In [4]:
# Every id that occurs as an edge endpoint: heads first, then tails,
# de-duplicated in order of first appearance.
edge_endpoints = pd.concat([df["head"], df["tail"]])
unique_labels = edge_endpoints.unique()

In [6]:
# Number of distinct ids appearing in the head/tail columns
len(unique_labels)

47433

In [8]:
# Distinct values of the relation_type column, and how many there are
relation_column = df["relation_type"]
unique_relations = relation_column.unique()
len(unique_relations)

283

In [9]:
# Display the distinct relation_type values collected above
unique_relations

array(['P166', 'P1346', 'P17', 'P185', 'P793', 'P937', 'P608', 'P61',
       'P1343', 'P106', 'P770', 'P27', 'P611', 'P131', 'P417', 'P1376',
       'P122', 'P841', 'P36', 'P361', 'P138', 'P366', 'P150', 'P1001',
       'P37', 'P119', 'P38', 'P47', 'P85', 'P195', 'P26', 'P2505', 'P137',
       'P53', 'P159', 'P1308', 'P97', 'P463', 'P1056', 'P108', 'P3460',
       'P127', 'P609', 'P511', 'P488', 'P35', 'P551', 'P123', 'P39',
       'P241', 'P3320', 'P735', 'P466', 'P837', 'P1037', 'P612', 'P140',
       'P98', 'P197', 'P176', 'P527', 'P3300', 'P618', 'P6', 'P516',
       'P169', 'P3919', 'P102', 'P2541', 'P54', 'P5769', 'P69', 'P1142',
       'P734', 'P641', 'P1532', 'P1454', 'P4330', 'P115', 'P276', 'P81',
       'P1029', 'P4791', 'P2868', 'P8047', 'P84', 'P1411', 'P807',
       'P1891', 'P184', 'P1366', 'P112', 'P512', 'P607', 'P7779', 'P1416',
       'P410', 'P126', 'P3938', 'P118', 'P1066', 'P2828', 'P355', 'P170',
       'P180', 'P4345', 'P800', 'P1050', 'P411', 'P451', 'P5096', '

In [10]:
import requests
from tqdm import tqdm

# SPARQL endpoint that fetch_wikidata_labels sends its queries to
WDQS_URL = "https://query.wikidata.org/sparql"

def chunk_list(lst, chunk_size=500):
    """
    Yield consecutive slices of `lst`, each holding at most `chunk_size` items.

    The final slice is shorter when len(lst) is not a multiple of chunk_size;
    an empty `lst` yields nothing.
    """
    starts = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in starts)

def fetch_wikidata_labels(ids, lang="en", chunk_size=500, timeout=60):
    """
    Fetch labels and descriptions for Wikidata ids via SPARQL.

    ids: iterable of id strings such as ["Q648", "Q458", ...] or ["P166", ...]
    lang: language code passed to the wikibase:label service
    chunk_size: number of ids sent per SPARQL query
    timeout: seconds to wait for each HTTP request; without it a stalled
             connection would block the whole loop indefinitely

    Returns DataFrame: id, label, description
    (label/description are "" when the response has no such binding;
    an empty `ids` gives an empty DataFrame with the same columns).

    Raises requests.HTTPError when a request returns an error status and
    requests.Timeout when a request exceeds `timeout`.
    """

    rows = []
    chunks = list(chunk_list(list(ids), chunk_size))

    # One session for all chunks so the connection to the endpoint is reused.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "TKGC-RAG/1.0 (local project)"})

        for chunk in tqdm(chunks):
            values_str = " ".join(f"wd:{x}" for x in chunk)

            query = f"""
            SELECT ?id ?idLabel ?idDescription WHERE {{
              VALUES ?id {{ {values_str} }}
              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{lang}". }}
            }}
            """

            r = session.get(
                WDQS_URL,
                params={"format": "json", "query": query},
                timeout=timeout,
            )
            r.raise_for_status()
            data = r.json()

            for b in data["results"]["bindings"]:
                uri = b["id"]["value"]  # e.g. http://www.wikidata.org/entity/Q648
                qid = uri.rsplit("/", 1)[-1]  # keep only the trailing id
                label = b.get("idLabel", {}).get("value", "")
                desc = b.get("idDescription", {}).get("value", "")
                rows.append((qid, label, desc))

    return pd.DataFrame(rows, columns=["id", "label", "description"])


In [11]:
# Look up English labels/descriptions for all entity and relation ids,
# then persist both tables as CSV (no index column).
fetch_kwargs = {"lang": "en", "chunk_size": 400}
entities_df = fetch_wikidata_labels(unique_labels, **fetch_kwargs)
relations_df = fetch_wikidata_labels(unique_relations, **fetch_kwargs)

for frame, csv_name in (
    (entities_df, "entities_wikidata.csv"),
    (relations_df, "relations_wikidata.csv"),
):
    frame.to_csv(csv_name, index=False)


100%|██████████| 119/119 [07:43<00:00,  3.89s/it]
100%|██████████| 1/1 [00:00<00:00,  1.25it/s]


In [12]:
# Ids that were requested but never came back from the label lookup
missing_entities = set(unique_labels).difference(entities_df["id"])
missing_relations = set(unique_relations).difference(relations_df["id"])
print("Entities missing:", len(missing_entities))
print("Relations missing:", len(missing_relations))


Entities missing: 0
Relations missing: 0


In [14]:
import json
import re
from collections import defaultdict
def normalize_label(s: str) -> str:
    """
    Normalization of labels to text use (RAG/LLM):
    - None / NaN -> ""
    - lower-case
    - trim, and collapse every run of whitespace to a single space
    - convert curly quotation marks (double and single, left and right) to straight ones
    - drop one trailing dot
    """
    # NaN is the only float that is not equal to itself; treat it like None
    # so a missing value never turns into the label "nan".
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s).strip().lower()
    s = (
        s.replace("“", '"')
        .replace("”", '"')
        .replace("‘", "'")  # left single quote, so both sides of a quoted span match
        .replace("’", "'")
    )
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"\.$", "", s)
    return s


def df_to_maps(df: pd.DataFrame, id_col="id", label_col="label", desc_col="description"):
    """
    Creates maps from DataFrame (id, label, description).

    Missing cells (None / NaN, e.g. empty CSV fields loaded by pd.read_csv)
    are treated as empty strings instead of being stringified to "nan".
    Rows whose id is empty or missing are skipped.

    Returns a 4-tuple of plain dicts:
      id_to_label   : id -> label
      id_to_desc    : id -> description
      label_to_ids  : label -> [ids]
      nlabel_to_ids : normalize_label(label) -> [ids]
    The id lists contain no duplicates and keep first-seen order.
    """
    def cell_text(row, col):
        """Return the cell as a stripped string; missing values become ''."""
        value = row.get(col, "")
        if not isinstance(value, str) and pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value).strip()

    id_to_label = {}
    id_to_desc = {}

    label_to_ids = defaultdict(list)
    nlabel_to_ids = defaultdict(list)

    for _, row in df.iterrows():
        _id = cell_text(row, id_col)
        if not _id:
            continue

        label = cell_text(row, label_col)
        desc = cell_text(row, desc_col)

        # ID -> label/desc
        # (If ID duplicates, leave first non-empty label/desc)
        if _id not in id_to_label or (not id_to_label[_id] and label):
            id_to_label[_id] = label
        if _id not in id_to_desc or (not id_to_desc[_id] and desc):
            id_to_desc[_id] = desc

        # label -> [ids] (empty labels are not indexed)
        if label:
            label_to_ids[label].append(_id)
            nlabel = normalize_label(label)
            if nlabel:
                nlabel_to_ids[nlabel].append(_id)

    # delete duplicates (order retained): dict keys are unique and insertion-ordered
    label_to_ids = {k: list(dict.fromkeys(v)) for k, v in label_to_ids.items()}
    nlabel_to_ids = {k: list(dict.fromkeys(v)) for k, v in nlabel_to_ids.items()}

    return id_to_label, id_to_desc, label_to_ids, nlabel_to_ids


# === 1) Loading CSV ===
# dtype=str + keep_default_na=False: empty labels/descriptions stay "" instead of
# becoming NaN, and literal labels such as "NA" or "null" are not parsed as missing.
entities_df = pd.read_csv("entities_wikidata.csv", dtype=str, keep_default_na=False)
relations_df = pd.read_csv("relations_wikidata.csv", dtype=str, keep_default_na=False)

# Check column names are correct: id, label, description
# (the loop variable is deliberately not named `df`, so the edge-list frame `df`
# loaded at the top of the notebook is not overwritten)
for csv_name, frame in [("entities_wikidata.csv", entities_df), ("relations_wikidata.csv", relations_df)]:
    missing = [c for c in ["id", "label", "description"] if c not in frame.columns]
    if missing:
        raise ValueError(f"{csv_name} is missing columns: {missing}. Found: {list(frame.columns)}")

# === 2) Entity maps (Q-ids) ===
qid_to_label, qid_to_desc, label_to_qids, nlabel_to_qids = df_to_maps(entities_df)

# === 3) Relation maps (P-ids) ===
pid_to_label, pid_to_desc, label_to_pids, nlabel_to_pids = df_to_maps(relations_df)

# === 4) Save to JSON ===
# One file per dictionary, written in the order listed here.
json_outputs = {
    # ID -> label/desc
    "qid_to_label.json": qid_to_label,
    "qid_to_desc.json": qid_to_desc,
    "pid_to_label.json": pid_to_label,
    "pid_to_desc.json": pid_to_desc,
    # label -> [ids]
    "label_to_qids.json": label_to_qids,
    "label_to_pids.json": label_to_pids,
    # normalized_label -> [ids]
    "norm_label_to_qids.json": nlabel_to_qids,
    "norm_label_to_pids.json": nlabel_to_pids,
}
for json_name, mapping in json_outputs.items():
    with open(json_name, "w", encoding="utf-8") as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)

# === 5) Report ===
print("Saved JSON dictionaries:")
print(f"  qid_to_label: {len(qid_to_label)}")
print(f"  qid_to_desc : {len(qid_to_desc)}")
print(f"  pid_to_label: {len(pid_to_label)}")
print(f"  pid_to_desc : {len(pid_to_desc)}")
print(f"  label_to_qids (distinct labels): {len(label_to_qids)}")
print(f"  label_to_pids (distinct labels): {len(label_to_pids)}")
print(f"  norm_label_to_qids (distinct normalized labels): {len(nlabel_to_qids)}")
print(f"  norm_label_to_pids (distinct normalized labels): {len(nlabel_to_pids)}")

# Ambiguities check (label -> many ID)
amb_q = sum(1 for v in label_to_qids.values() if len(v) > 1)
amb_p = sum(1 for v in label_to_pids.values() if len(v) > 1)
print("\nAmbiguity check:")
print(f"  entity labels mapping to >1 Q-id: {amb_q}")
print(f"  relation labels mapping to >1 P-id: {amb_p}")


Saved JSON dictionaries:
  qid_to_label: 47433
  qid_to_desc : 47433
  pid_to_label: 283
  pid_to_desc : 283
  label_to_qids (distinct labels): 47178
  label_to_pids (distinct labels): 283
  norm_label_to_qids (distinct normalized labels): 47168
  norm_label_to_pids (distinct normalized labels): 283

Ambiguity check:
  entity labels mapping to >1 Q-id: 238
  relation labels mapping to >1 P-id: 0
