In [2]:
import json
from collections import Counter

from SPARQLWrapper import SPARQLWrapper, JSON
import time

from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

import faiss

  from .autonotebook import tqdm as notebook_tqdm


## Collecting names and constraints

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON

PROP_2_LABEL = {}
PROP_2_DATA_TYPE = {}

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# SPARQL query for properties with data types: Item, Quantity, Point in time
query = """
SELECT ?property ?propertyLabel ?typeLabel WHERE {
  ?property a wikibase:Property .
  ?property wikibase:propertyType ?type .
  
  VALUES ?type { wikibase:WikibaseItem wikibase:Quantity wikibase:Time }
  
  BIND(
    IF(?type = wikibase:WikibaseItem, "Item",
      IF(?type = wikibase:Quantity, "Quantity",
        IF(?type = wikibase:Time, "Point in time", "Unknown")
      )
    ) AS ?typeLabel
  )
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)

try:
    results = sparql.query().convert()

    # Extract and print property IDs with their labels and data types
    for result in results["results"]["bindings"]:
        prop = result["property"]["value"].split("/")[-1]
        label = result.get("propertyLabel", {}).get("value", "No label")
        data_type = result.get("typeLabel", {}).get("value", "Unknown")

        PROP_2_LABEL[prop] = label
        PROP_2_DATA_TYPE[prop] = data_type        

except Exception as e:
    print(f"Error executing SPARQL query: {e}")

In [3]:
len(PROP_2_LABEL), len(PROP_2_DATA_TYPE)

(2410, 2410)

In [4]:
with open("prop2data_type.json", 'w') as f:
    json.dump(PROP_2_DATA_TYPE, f)

In [5]:
with open("prop2label.json", 'w') as f:
    json.dump(PROP_2_LABEL, f)

In [6]:
with open('prop2constraints.json', 'r') as f:
    constraint_dict = json.load(f)

In [7]:
constraint_dict = {key: value for key, value in constraint_dict.items() if key in PROP_2_LABEL}
len(constraint_dict)

2317

In [8]:
data_constraints = ["subject type constraint", "value-type constraint" ]
                            # "allowed-entity-types constraint", 
                            #   "one-of constraint"]

In [9]:
properties_with_data_constraints = {}
for prop in constraint_dict:

    for constraint in data_constraints:
        if constraint in constraint_dict[prop]:
            # class and item 
            if 'P2308' in constraint_dict[prop][constraint] and 'P2309' in constraint_dict[prop][constraint]:
                properties_with_data_constraints[prop] = constraint_dict[prop]
                

In [10]:
len(properties_with_data_constraints)

1754

In [11]:
properties_with_data_constraints

{'P1661': {'allowed units constraint': {'P2305': ['no value'],
   'P2316': ['Q21502408']},
  'range constraint': {'P2312': ['20,000,000'],
   'P2313': ['1'],
   'P2316': ['Q21502408']},
  'allowed qualifiers constraint': {'P2306': ['Property:P585',
    'Property:P2241',
    'Property:P7452',
    'Property:P1065',
    'Property:P2960',
    'Property:P813',
    'Property:P1001']},
  'required qualifier constraint': {'P2306': ['Property:P585'],
   'P2316': ['Q62026391']},
  'item-requires-statement constraint': {'P2306': ['Property:P856']},
  'subject type constraint': {'P2309': ['Q21503252'], 'P2308': ['Q35127']},
  'allowed-entity-types constraint': {'P2305': ['Q59712033', 'Q29934200']},
  'property scope constraint': {'P5314': ['Q54828448', 'Q54828449']}},
 'P9897': {'allowed-entity-types constraint': {'P2305': ['Q29934200']},
  'property scope constraint': {'P5314': ['Q54828448']},
  'value-type constraint': {'P2308': ['Q110910264'], 'P2309': ['Q21503252']},
  'one-of constraint': {'P

In [12]:
prop2constraint = {}
for prop in properties_with_data_constraints:
    const_dict = {}
    if 'subject type constraint' in properties_with_data_constraints[prop]:
        const_dict['subject type constraint'] = properties_with_data_constraints[prop]['subject type constraint']
    if 'value-type constraint' in properties_with_data_constraints[prop]: 
        const_dict['value-type constraint'] = properties_with_data_constraints[prop]['value-type constraint']
    
    if len(const_dict) != 0:
        prop2constraint[prop] = const_dict

In [13]:
len(prop2constraint)

1754

In [14]:
prop2constraint

{'P1661': {'subject type constraint': {'P2309': ['Q21503252'],
   'P2308': ['Q35127']}},
 'P9897': {'subject type constraint': {'P2308': ['Q7889', 'Q620615'],
   'P2309': ['Q30208840']},
  'value-type constraint': {'P2308': ['Q110910264'], 'P2309': ['Q21503252']}},
 'P3156': {'subject type constraint': {'P2308': ['Q11424',
    'Q15416',
    'Q7889',
    'Q16070115',
    'Q21191270',
    'Q622550'],
   'P2309': ['Q21503252'],
   'P2303': ['Q47457020']},
  'value-type constraint': {'P2308': ['Q26708074'],
   'P2309': ['Q21503252'],
   'P2316': ['Q21502408']}},
 'P9086': {'subject type constraint': {'P2308': ['Q11424'],
   'P2309': ['Q21503252']}},
 'P2629': {'subject type constraint': {'P2308': ['Q11424',
    'Q15416',
    'Q7889',
    'Q21191270'],
   'P2309': ['Q21503252']},
  'value-type constraint': {'P2308': ['Q23790218'],
   'P2309': ['Q21503252'],
   'P2316': ['Q21502408']}},
 'P8476': {'subject type constraint': {'P2309': ['Q21503252'],
   'P2308': ['Q6256']}},
 'P8477': {'subjec

In [15]:
value_rel_dict = {}

for prop in prop2constraint:
    temp_dict = {}
    for key in prop2constraint[prop]:
        temp_dict[key] = {"P2309": prop2constraint[prop][key]["P2309"], "P2308": prop2constraint[prop][key]["P2308"]}

    value_rel_dict[prop] = temp_dict
             
len(value_rel_dict)

1754

In [16]:
properties = [prop for prop in prop2constraint.keys()]
len(set(properties))

1754

In [17]:
properties

['P1661',
 'P9897',
 'P3156',
 'P9086',
 'P2629',
 'P8476',
 'P8477',
 'P6452',
 'P9028',
 'P853',
 'P6657',
 'P7603',
 'P5475',
 'P2758',
 'P3402',
 'P880',
 'P9887',
 'P2643',
 'P3501',
 'P3216',
 'P9798',
 'P9126',
 'P8328',
 'P660',
 'P2756',
 'P8901',
 'P852',
 'P470',
 'P1087',
 'P2021',
 'P3823',
 'P6438',
 'P2371',
 'P4437',
 'P1981',
 'P1731',
 'P2747',
 'P6697',
 'P5070',
 'P5040',
 'P5041',
 'P5042',
 'P2560',
 'P9866',
 'P916',
 'P1414',
 'P9325',
 'P2597',
 'P7295',
 'P5425',
 'P5537',
 'P11547',
 'P8874',
 'P1081',
 'P3306',
 'P7573',
 'P5150',
 'P579',
 'P5201',
 'P3428',
 'P6069',
 'P6095',
 'P6789',
 'P141',
 'P814',
 'P11593',
 'P2127',
 'P5900',
 'P5386',
 'P3650',
 'P5152',
 'P10994',
 'P3096',
 'P3818',
 'P2684',
 'P2564',
 'P1145',
 'P8026',
 'P2051',
 'P1657',
 'P8889',
 'P2377',
 'P5970',
 'P2784',
 'P9895',
 'P1088',
 'P1611',
 'P2363',
 'P8652',
 'P11611',
 'P7327',
 'P5805',
 'P8615',
 'P8480',
 'P908',
 'P2299',
 'P9683',
 'P2637',
 'P6658',
 'P3834',
 'P642

In [18]:
with open('subject_object_constraints.json', 'w') as f:
    json.dump(value_rel_dict, f)

In [19]:
with open('subject_object_constraints.json', 'r') as f:
    value_rel_dict = json.load(f)

In [20]:
prop2subj_constraint = {}
prop2obj_constraint = {}

for prop, constraints in value_rel_dict.items():

    if 'subject type constraint' in constraints:
        subj_constraints = constraints['subject type constraint']

        for const_type in subj_constraints.values():
            for subj_c in const_type:
                if subj_c in prop2subj_constraint:
                    prop2subj_constraint[subj_c].append(prop)
                else:
                    prop2subj_constraint[subj_c] = [prop]
    
    if 'value-type constraint' in constraints: 
        obj_constraints  = constraints['value-type constraint']
    
        for const_type in obj_constraints.values():
            for obj_c in const_type:
                if obj_c in prop2obj_constraint:
                    prop2obj_constraint[obj_c].append(prop)
                else:
                    prop2obj_constraint[obj_c] = [prop]


prop2obj_constraint

{'Q21503252': ['P9897',
  'P3156',
  'P2629',
  'P853',
  'P5475',
  'P3216',
  'P9126',
  'P660',
  'P8901',
  'P852',
  'P3823',
  'P4437',
  'P1981',
  'P1731',
  'P5070',
  'P5040',
  'P5041',
  'P5042',
  'P2560',
  'P9866',
  'P916',
  'P1414',
  'P9325',
  'P5425',
  'P5537',
  'P8874',
  'P3306',
  'P7573',
  'P5150',
  'P2127',
  'P5152',
  'P10994',
  'P3096',
  'P2684',
  'P2564',
  'P1145',
  'P1657',
  'P8889',
  'P2377',
  'P5970',
  'P9895',
  'P1611',
  'P2363',
  'P8652',
  'P7327',
  'P5805',
  'P8480',
  'P908',
  'P6426',
  'P2366',
  'P2365',
  'P2359',
  'P2358',
  'P3592',
  'P12168',
  'P914',
  'P5522',
  'P8500',
  'P1878',
  'P4202',
  'P4002',
  'P10714',
  'P5125',
  'P4426',
  'P10588',
  'P811',
  'P1026',
  'P5023',
  'P5202',
  'P197',
  'P9239',
  'P1416',
  'P1877',
  'P114',
  'P113',
  'P945',
  'P11106',
  'P11799',
  'P2279',
  'P66',
  'P4743',
  'P6942',
  'P11105',
  'P85',
  'P2817',
  'P10602',
  'P10663',
  'P748',
  'P9047',
  'P84',
  'P14

In [21]:
with open('prop2obj_constraint.json', 'w') as f:
    json.dump(prop2obj_constraint, f)

with open('prop2subj_constraint.json', 'w') as f:
    json.dump(prop2subj_constraint, f)

In [18]:
c = Counter()

for prop in value_rel_dict:

    for key in value_rel_dict[prop]:
        for item in value_rel_dict[prop][key]["P2309"]:
            c[item] += 1

# Q21503252 - instance of 
# Q30208840 - instance or subclass of 
# Q21514624 - subclass of
c

Counter({'Q21503252': 1748, 'Q30208840': 697, 'Q21514624': 151})

In [19]:
entities = set()
for prop in value_rel_dict:

    for key in value_rel_dict[prop]:
        for item in value_rel_dict[prop][key]["P2308"]:
            entities.add(item)
entities = list(entities)

In [20]:
len(entities)

3138

In [21]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

BATCH_SIZE = 50

def fetch_labels(batch):
    entity_values = " ".join(f"wd:{entity}" for entity in batch)
    
    query = f"""
    SELECT ?entity ?entityLabel WHERE {{
      VALUES ?entity {{ {entity_values} }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    try:
        results = sparql.query().convert()
        return {
            result["entity"]["value"].split("/")[-1]: result.get("entityLabel", {}).get("value", "No label")
            for result in results["results"]["bindings"]
        }
    except Exception as e:
        print(f"Error with batch {batch[:5]}...: {e}")
        return {}

ENTITY_2_LABEL = {}

# Process in batches
for i in range(0, len(entities), BATCH_SIZE):
    batch = entities[i:i + BATCH_SIZE]
    print(f"Processing batch {i // BATCH_SIZE + 1}/{(len(entities) // BATCH_SIZE) + 1}")
    
    labels = fetch_labels(batch)
    ENTITY_2_LABEL.update(labels)
    
    time.sleep(2)

# for entity, label in all_labels.items():
#     print(f"{entity}: {label}")

Processing batch 1/63
Processing batch 2/63
Processing batch 3/63
Processing batch 4/63
Processing batch 5/63
Processing batch 6/63
Processing batch 7/63
Processing batch 8/63
Processing batch 9/63
Processing batch 10/63
Processing batch 11/63
Processing batch 12/63
Processing batch 13/63
Processing batch 14/63
Processing batch 15/63
Processing batch 16/63
Processing batch 17/63
Processing batch 18/63
Processing batch 19/63
Processing batch 20/63
Processing batch 21/63
Processing batch 22/63
Processing batch 23/63
Processing batch 24/63
Processing batch 25/63
Processing batch 26/63
Processing batch 27/63
Processing batch 28/63
Processing batch 29/63
Processing batch 30/63
Processing batch 31/63
Processing batch 32/63
Processing batch 33/63
Processing batch 34/63
Processing batch 35/63
Processing batch 36/63
Processing batch 37/63
Processing batch 38/63
Processing batch 39/63
Processing batch 40/63
Processing batch 41/63
Processing batch 42/63
Processing batch 43/63
Processing batch 44/

In [22]:
len(ENTITY_2_LABEL)

3138

In [23]:
value_rel_dict

{'P1661': {'subject type constraint': {'P2309': ['Q21503252'],
   'P2308': ['Q35127']}},
 'P9897': {'subject type constraint': {'P2309': ['Q30208840'],
   'P2308': ['Q7889', 'Q620615']},
  'value-type constraint': {'P2309': ['Q21503252'], 'P2308': ['Q110910264']}},
 'P3156': {'subject type constraint': {'P2309': ['Q21503252'],
   'P2308': ['Q11424',
    'Q15416',
    'Q7889',
    'Q16070115',
    'Q21191270',
    'Q622550']},
  'value-type constraint': {'P2309': ['Q21503252'], 'P2308': ['Q26708074']}},
 'P9086': {'subject type constraint': {'P2309': ['Q21503252'],
   'P2308': ['Q11424']}},
 'P2629': {'subject type constraint': {'P2309': ['Q21503252'],
   'P2308': ['Q11424', 'Q15416', 'Q7889', 'Q21191270']},
  'value-type constraint': {'P2309': ['Q21503252'], 'P2308': ['Q23790218']}},
 'P8476': {'subject type constraint': {'P2309': ['Q21503252'],
   'P2308': ['Q6256']}},
 'P8477': {'subject type constraint': {'P2309': ['Q21503252'],
   'P2308': ['Q6256']}},
 'P6452': {'subject type cons

In [24]:
base_ontology_triplets = []
for prop, constraint in value_rel_dict.items():
    triple = {}
    verb_prop = PROP_2_LABEL[prop]
    triple["property"] = verb_prop
    for constraint_type in constraint:
        if constraint_type == "subject type constraint":
            triple["subject"] = []
            subjects = [ENTITY_2_LABEL[entity] for entity in constraint[constraint_type]["P2308"]]
            triple["subject"].extend(subjects)
        
        if constraint_type == "value-type constraint":
            triple["value"] = []
            values = [ENTITY_2_LABEL[entity] for entity in constraint[constraint_type]["P2308"]]
            triple["value"].extend(values)
    base_ontology_triplets.append(triple)

In [25]:
base_ontology_triplets

[{'property': 'Alexa rank', 'subject': ['website']},
 {'property': 'App Store age rating',
  'subject': ['video game', 'mobile app'],
  'value': ['Apple App Store rating category']},
 {'property': 'Australian Classification',
  'subject': ['film',
   'television program',
   'video game',
   'video game compilation',
   'television series episode',
   'trailer'],
  'value': ['Australian Classification category']},
 {'property': 'BAMID film rating', 'subject': ['film']},
 {'property': 'BBFC rating',
  'subject': ['film',
   'television program',
   'video game',
   'television series episode'],
  'value': ['BBFC classification category']},
 {'property': 'BTI Governance Index', 'subject': ['country']},
 {'property': 'BTI Status Index', 'subject': ['country']},
 {'property': 'CBFC rating', 'subject': ['film']},
 {'property': 'CCC classification', 'subject': ['video game']},
 {'property': 'CERO rating',
  'subject': ['video game',
   'expansion pack',
   'video game compilation',
   'ficti

In [26]:
with open('ontology_entity2label.json', 'w') as f:
    json.dump(ENTITY_2_LABEL, f)

## Index utils

In [2]:
tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
model = AutoModel.from_pretrained('facebook/contriever').to('cuda:4')

In [3]:
def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings

def embed_entity_batch(entity_list):
    inputs = tokenizer(entity_list, padding=True, truncation=True, return_tensors='pt')

    outputs = model(**inputs.to('cuda:4'))
    embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
    return embeddings

## Relation index

In [2]:
with open("prop2label.json", 'r') as f:
    PROP_2_LABEL = json.load(f)

In [6]:
PROP_2_LABEL

{'P6': 'head of government',
 'P16': 'transport network',
 'P17': 'country',
 'P19': 'place of birth',
 'P20': 'place of death',
 'P21': 'sex or gender',
 'P22': 'father',
 'P25': 'mother',
 'P26': 'spouse',
 'P27': 'country of citizenship',
 'P30': 'continent',
 'P31': 'instance of',
 'P35': 'head of state',
 'P36': 'capital',
 'P37': 'official language',
 'P38': 'currency',
 'P39': 'position held',
 'P40': 'child',
 'P47': 'shares border with',
 'P50': 'author',
 'P53': 'family',
 'P54': 'member of sports team',
 'P57': 'director',
 'P58': 'screenwriter',
 'P59': 'constellation',
 'P61': 'discoverer or inventor',
 'P65': 'site of astronomical discovery',
 'P66': 'ancestral home',
 'P69': 'educated at',
 'P78': 'top-level Internet domain',
 'P81': 'connecting line',
 'P84': 'architect',
 'P85': 'anthem',
 'P86': 'composer',
 'P87': 'librettist',
 'P88': 'commissioned by',
 'P91': 'sexual orientation',
 'P92': 'main regulatory text',
 'P97': 'noble title',
 'P98': 'editor',
 'P101': 'f

In [8]:
prop_items = list(PROP_2_LABEL.items())
prop_ids = [int(item[0][1:]) for item in prop_items]
prop_names = [item[1] for item in prop_items]
len(set(prop_ids)), len(set(prop_names))

(2408, 2408)

In [10]:
prop_embeddings = []
batch_size = 100

for i in tqdm(range(0, len(prop_names), batch_size)):

    if i + batch_size > len(prop_names):
        prop_list = prop_names[i: len(prop_names)]
    else:
        prop_list = prop_names[i: i + batch_size]

    prop_embeddings.append(embed_entity_batch(prop_list).detach().to('cpu'))

100%|██████████| 25/25 [00:01<00:00, 17.35it/s]


In [15]:
prop_output = np.array(torch.concat(prop_embeddings))
prop_output.shape

(2408, 768)

In [13]:
assert all(isinstance(id_, int) for id_ in prop_ids)
prop_ids[:5]

[6, 16, 17, 19, 20]

In [16]:
dim = prop_output.shape[1]
metric = faiss.METRIC_INNER_PRODUCT
prop_index = faiss.index_factory(dim, "IDMap,Flat", metric)

prop_index.add_with_ids(prop_output, prop_ids)

In [18]:
print(prop_index.is_trained)

True


In [20]:
from time import time

before = time()
distances, indices = prop_index.search(prop_output[100:103, :], 3)
after = time()
after - before

0.005139827728271484

In [21]:
indices, distances

(array([[  196,   690,   744],
        [  197,  3032,    81],
        [  199,   749, 12526]]),
 array([[1.4933411 , 1.1615491 , 1.0979967 ],
        [1.8069766 , 1.3875215 , 1.0804527 ],
        [1.7670994 , 0.9554285 , 0.93438303]], dtype=float32))

In [24]:
print([PROP_2_LABEL["P"+str(i)] for i in indices[0]])
print([PROP_2_LABEL["P"+str(i)] for i in indices[1]])
print([PROP_2_LABEL["P"+str(i)] for i in indices[2]])

['minor planet group', 'space group', 'asteroid family']
['adjacent station', 'adjacent building', 'connecting line']
['organizational divisions', 'parent organization', 'performing organization']


In [27]:
faiss.write_index(prop_index, "wikidata_relations.index")

## Indexing ontology labels

In [4]:
with open('ontology_entity2label.json', 'r') as f:
    ENTITY_2_LABEL = json.load(f)
ENTITY_2_LABEL

{'Q60534428': 'multivolume work',
 'Q1087471': 'relic associated with Jesus',
 'Q1156854': 'policy',
 'Q6498903': 'supervillain',
 'Q15720608': 'Wikidata qualifier',
 'Q15690029': 'fictional educational institution',
 'Q4271324': 'mythical character',
 'Q42262353': 'Y-DNA haplogroup',
 'Q65209857': 'virtual character',
 'Q192611': 'electoral unit',
 'Q6586093': 'lifetime achievement',
 'Q55876909': 'Catholic parish church',
 'Q639030': 'people mover',
 'Q23649978': 'Kijkwijzer rating category',
 'Q32859534': 'fictional activity',
 'Q17099416': 'Wikimedia list of songs',
 'Q15711994': 'group of isomeric entities',
 'Q278784': 'district heating',
 'Q848197': 'parliamentary group',
 'Q2634521': 'title of Jesus',
 'Q4835091': 'territory',
 'Q10884': 'tree',
 'Q3390477': 'online marketplace',
 'Q621751': 'approximation algorithm',
 'Q15706549': 'magnetic ordering',
 'Q48282': 'student',
 'Q2675537': 'music teacher',
 'Q3523102': 'source of information',
 'Q18510489': 'comedy troupe',
 'Q897

In [6]:
entity_items = list(ENTITY_2_LABEL.items())
entity_ids = [int(item[0][1:]) for item in entity_items]
entity_names = [item[1] for item in entity_items]
len(set(entity_ids)), len(set(entity_names))

(3138, 3087)

In [7]:
entity_embeddings = []
batch_size = 100

for i in tqdm(range(0, len(entity_names), batch_size)):

    if i + batch_size > len(entity_names):
        entity_list = entity_names[i: len(entity_names)]
    else:
        entity_list = entity_names[i: i + batch_size]

    entity_embeddings.append(embed_entity_batch(entity_list).detach().to('cpu'))

100%|██████████| 32/32 [00:01<00:00, 20.06it/s]


In [8]:
entity_output = np.array(torch.concat(entity_embeddings))
entity_output.shape

(3138, 768)

In [10]:
dim = entity_output.shape[1]
metric = faiss.METRIC_INNER_PRODUCT
entity_index = faiss.index_factory(dim, "IDMap,Flat", metric)

entity_index.add_with_ids(entity_output, entity_ids)

In [11]:
print(entity_index.is_trained)

True


In [12]:
from time import time

before = time()
distances, indices = entity_index.search(entity_output[100:103, :], 3)
after = time()
after - before

0.004317283630371094

In [13]:
indices, distances

(array([[ 1924249, 42014143,   849203],
        [  188968, 12558574,  2075301],
        [ 1522115,  1753139,  1146001]]),
 array([[2.6669273, 1.9941075, 0.8888495],
        [2.1201134, 1.3751981, 1.3191648],
        [1.8139822, 1.4395627, 1.2694929]], dtype=float32))

In [14]:
indices, distances

(array([[ 1924249, 42014143,   849203],
        [  188968, 12558574,  2075301],
        [ 1522115,  1753139,  1146001]]),
 array([[2.6669273, 1.9941075, 0.8888495],
        [2.1201134, 1.3751981, 1.3191648],
        [1.8139822, 1.4395627, 1.2694929]], dtype=float32))

In [15]:
print([ENTITY_2_LABEL["Q"+str(i)] for i in indices[0]])
print([ENTITY_2_LABEL["Q"+str(i)] for i in indices[1]])
print([ENTITY_2_LABEL["Q"+str(i)] for i in indices[2]])

['measurand', 'biomedical measurand type', 'electronic countermeasure']
['perspective', 'point of view', 'view']
['energy source', 'electric power source', 'light source']


In [38]:
distances, idx = entity_index.search(embed_entity_batch(['organization']).detach().cpu().numpy(), 10)
distances, idx

(array([[1.4062808, 1.199734 , 1.170858 , 1.1634574, 1.1296779, 1.1264762,
         1.1156702, 1.1106048, 1.0937347, 1.0656824]], dtype=float32),
 array([[   43229,  2659904,  5341295,  7210356,  4438121, 17149090,
          1530022,  4120211, 16519632,   163740]]))

In [48]:
print([ENTITY_2_LABEL["Q"+str(i)] for i in idx[0]])

['organization', 'government organization', 'educational organization', 'political organization', 'sports organization', 'armed organization', 'religious organization', 'regional organization', 'scientific organization', 'nonprofit organization']


In [16]:
faiss.write_index(entity_index, "wikidata_ontology_entities.index")

In [67]:
with open('prop2label.json', 'r') as f:
    PROP_2_LABEL = json.load(f)
# PROP_2_LABEL

In [54]:
[PROP_2_LABEL[prop] for prop in prop2obj_constraint["Q"+'43229']]

['Commons media contributed by',
 'academic appointment',
 'addressee',
 'affiliation',
 'after a work by',
 'afterward owned by',
 'allegiance',
 'applies to jurisdiction',
 'appointed by',
 'archives at',
 'artist files at',
 'author',
 'beforehand owned by',
 'board member',
 'broadcast by',
 'central bank/issuer',
 'collection',
 'collection creator',
 'commissioned by',
 'contributor to the creative work or subject',
 'copyright holder',
 'cover art by',
 'creator',
 'dedicated heritage entity',
 'defendant',
 'depositary',
 'described by source',
 'designed by',
 'developer',
 'digitised by',
 'discoverer or inventor',
 'documentation files at',
 'donated by',
 'editor',
 'editor-in-chief',
 'employer',
 'endorsed by',
 'exhibited creator',
 'external auditor',
 'founded by',
 'has subsidiary',
 'illustrator',
 'imprimatur granted by',
 'inker',
 'investigated by',
 'issued by',
 'killed by',
 'landscape architect',
 'language regulatory body',
 'location',
 'location of first pe

In [56]:
subj_prop_lens = []
obj_prop_lens = []

for props in prop2obj_constraint.values():
    obj_prop_lens.append(len(props))

for props in prop2subj_constraint.values():
    subj_prop_lens.append(len(props))

In [65]:
max(obj_prop_lens), np.mean(obj_prop_lens), np.median(obj_prop_lens)

(709, 2.4506237006237006, 1.0)

In [66]:
max(subj_prop_lens), np.mean(subj_prop_lens), np.median(subj_prop_lens)

(1039, 3.6275586620069897, 1.0)

In [4]:
def get_entity_hierarchy(entity_id):
    """
    Retrieves the full hierarchical types of a given Wikidata entity, including both
    'instance of' (P31) and 'subclass of' (P279) relationships, along with depth information.

    Args:
        entity_id (str): The Wikidata entity ID (e.g., 'Q5' for Human).

    Returns:
        list of dict: A list containing dictionaries with type, typeLabel, supertype, supertypeLabel, and depth.
    """
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    query = f"""
    SELECT DISTINCT ?type ?typeLabel ?supertype ?supertypeLabel (COUNT(?mid) AS ?depth) WHERE {{
      {{
        wd:{entity_id} wdt:P31/wdt:P279* ?type.
        OPTIONAL {{ ?type wdt:P279 ?supertype. }}
      }}
      UNION
      {{
        wd:{entity_id} wdt:P279* ?type.
        OPTIONAL {{ ?type wdt:P279 ?supertype. }}
      }}
      OPTIONAL {{
        wd:{entity_id} (wdt:P31/wdt:P279*) ?mid.
        ?mid wdt:P279* ?type.
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    GROUP BY ?type ?typeLabel ?supertype ?supertypeLabel
    ORDER BY ?depth
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    hierarchy = []
    for result in results["results"]["bindings"]:
        hierarchy.append({
            "type": result.get("type", {}).get("value"),
            "typeLabel": result.get("typeLabel", {}).get("value"),
            "supertype": result.get("supertype", {}).get("value"),
            "supertypeLabel": result.get("supertypeLabel", {}).get("value"),
            "depth": int(result.get("depth", {}).get("value", 0))
        })

    return hierarchy

# Example usage:
entity_id = "Q5"  # Human
hierarchy = get_entity_hierarchy(entity_id)
for item in hierarchy:
    print(item)


HTTPError: HTTP Error 403: Forbidden