In [1]:
from pymongo.mongo_client import MongoClient
import json

## Utils

In [2]:
# Directory holding the JSON lookup tables that are loaded below.
mappings_dir = "utils/ontology_mappings/"


def _load_mapping(filename):
    """Parse one JSON file located in ``mappings_dir`` and return its content."""
    # JSON is UTF-8 by specification; pass the encoding explicitly instead of
    # relying on the platform's locale default (which is not UTF-8 everywhere).
    with open(mappings_dir + filename, 'r', encoding='utf-8') as f:
        return json.load(f)


# presumably property id -> label and entity type id -> label; verify against the files
PROP_2_LABEL = _load_mapping('prop2label.json')
ENTITY_2_LABEL = _load_mapping('entity_type2label.json')


In [None]:
def get_mongo_client(mongo_uri):
    """Build and return a ``MongoClient`` for the given connection URI."""
    return MongoClient(mongo_uri)

# Connection settings for the MongoDB instance backing the ontology index.
MONGO_URI = "mongodb://localhost:27018/?directConnection=true"
DB_NAME = "wikidata_ontology"

mongo_client = get_mongo_client(MONGO_URI)
db = mongo_client.get_database(DB_NAME)
# Last expression of the cell: displays the collections present in the database
db.list_collection_names()

['properties',
 'property_aliases',
 'entity_aliases',
 'triplets',
 'filtered_triplets',
 'entity_type_aliases',
 'entity_types']

In [4]:
# Print the field names of one sample document from each ontology collection.
for collection_name in ('entity_type_aliases', 'entity_types', 'properties', 'property_aliases'):
    print(db.get_collection(collection_name).find_one().keys())

dict_keys(['_id', 'entity_type_id', 'alias_label', 'alias_text_embedding'])
dict_keys(['_id', 'entity_type_id', 'label', 'parent_type_ids', 'valid_subject_property_ids', 'valid_object_property_ids'])
dict_keys(['_id', 'property_id', 'label', 'valid_subject_type_ids', 'valid_object_type_ids'])
dict_keys(['_id', 'relation_id', 'alias_label', 'alias_text_embedding'])


In [5]:
# Print the search index definitions of both alias collections.
for collection_name in ('entity_type_aliases', 'property_aliases'):
    print(list(db.get_collection(collection_name).list_search_indexes()))

[{'id': '684c685a5ed97f053516d78c', 'name': 'entity_type_aliases', 'type': 'search', 'status': 'READY', 'queryable': True, 'latestVersion': 0, 'latestDefinition': {'mappings': {'dynamic': True, 'fields': {'alias_text_embedding': {'type': 'knnVector', 'dimensions': 768, 'similarity': 'cosine'}}}}}]
[{'id': '684c686e5ed97f053516d78d', 'name': 'property_aliases_ids', 'type': 'search', 'status': 'READY', 'queryable': True, 'latestVersion': 0, 'latestDefinition': {'mappings': {'dynamic': True, 'fields': {'alias_text_embedding': {'type': 'knnVector', 'dimensions': 768, 'similarity': 'cosine'}, 'relation_id': {'type': 'token'}}}}}]


## Functions for structural search

In [42]:
from utils.structured_dynamic_index_utils import Aligner as Aligner

## Triple extraction tests

In [43]:
from utils.openai_utils import LLMTripletExtractor
from utils.structured_dynamic_index_utils_with_db import Aligner as DBAligner

# Module-level pipeline components; extract_triplets() reads `aligner` and `extractor` as globals.
# DBAligner is constructed with the MongoDB database handle `db`.
aligner = DBAligner(db)
# Model identifier handed to the LLM triplet extractor
model_name = 'gpt-4.1-mini'
extractor = LLMTripletExtractor(model=model_name)

In [44]:
# Spot-check: fetch the label and the valid subject/object type ids for two property ids.
aligner.retrieve_properties_labels_and_constraints(['P291', 'P189'])

{'P189': {'label': 'location of discovery',
  'valid_subject_type_ids': [],
  'valid_object_type_ids': ['Q3895768', 'Q17334923', 'Q27096213']},
 'P291': {'label': 'place of publication',
  'valid_subject_type_ids': ['Q11424',
   'Q386724',
   'Q2031291',
   'Q3331189',
   'Q3523102',
   'Q17489659'],
  'valid_object_type_ids': ['Q220505', 'Q15796005', 'Q27096213', 'Q27787439']}}

In [45]:
from unidecode import unidecode
import re

def _constrain_types(candidate_type_ids, allowed_type_ids):
    """Keep the candidate type ids permitted by a property constraint.

    The candidates' original order is preserved (and duplicates dropped) so the
    labels later shown to the LLM are deterministic. When nothing matches, the
    property is treated as connectable to <ANY> entity type and all candidates
    are returned unchanged.
    """
    allowed = set(allowed_type_ids)
    matched = [t for t in dict.fromkeys(candidate_type_ids) if t in allowed]
    return matched if matched else list(candidate_type_ids)


def extract_triplets(text, sample_id):
    """Extract and refine knowledge graph triplets from text using LLM.

    Relies on the module-level ``extractor`` and ``aligner`` objects and on
    ``refine_entities``. Besides returning the triplets, the function stores
    them through ``aligner.add_triplets`` / ``aligner.add_filtered_triplets``.

    Args:
        text (str): Input text to extract triplets from
        sample_id: Identifier forwarded to the aligner with every stored
            triplet and entity

    Returns:
        tuple: (final_triplets, filtered_triplets) where:
            - final_triplets: List of validated and refined triplets
            - filtered_triplets: List of triplets that couldn't be validated
              (any exception raised while processing a triplet puts it here)
    """
    # Extract initial triplets using LLM
    extracted_triplets = extractor.extract_triplets_from_text(text)

    final_triplets = []
    filtered_triplets = []

    for triplet in extracted_triplets['triplets']:
        try:
            # Get candidate entity types
            subj_type_ids, obj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)

            # Get candidate properties/relations
            properties = aligner.retrieve_properties_for_entity_type(
                target_relation=triplet['relation'],
                object_types=obj_type_ids,
                subject_types=subj_type_ids,
                k=5
            )
            prop_2_label_and_constraint = aligner.retrieve_properties_labels_and_constraints(
                property_id_list=[p[0] for p in properties]
            )
            entity_type_id_2_label = aligner.retrieve_entity_type_labels(subj_type_ids + obj_type_ids)

            # Build candidate triplet backbones
            candidates = []

            for prop_id, prop_direction in properties:
                constraint = prop_2_label_and_constraint[prop_id]
                valid_subject_type_ids = constraint['valid_subject_type_ids']
                valid_object_type_ids = constraint['valid_object_type_ids']
                property_label = constraint['label']

                if prop_direction == 'direct':
                    cand_subject, cand_object = triplet['subject'], triplet['object']
                    cand_subject_type_ids, cand_object_type_ids = subj_type_ids, obj_type_ids
                else:
                    # Inverse property: the extracted subject and object swap roles,
                    # and so do their candidate type lists.
                    cand_subject, cand_object = triplet['object'], triplet['subject']
                    cand_subject_type_ids, cand_object_type_ids = obj_type_ids, subj_type_ids

                # TODO: include the type hierarchy when matching constraints?
                # The fallback to the unconstrained list is applied per direction only;
                # a swapped entity must never inherit the other entity's types.
                subject_types = _constrain_types(cand_subject_type_ids, valid_subject_type_ids)
                object_types = _constrain_types(cand_object_type_ids, valid_object_type_ids)

                candidates.append({
                    "subject": cand_subject,
                    "relation": property_label,
                    'object': cand_object,
                    "subject_types": [entity_type_id_2_label[t]['label'] for t in subject_types],
                    "object_types": [entity_type_id_2_label[t]['label'] for t in object_types]
                })

            # Refine relation and entity types using LLM - choose among valid backbones for triplet
            backbone_triplet = extractor.refine_relation_and_entity_types(
                text=text,
                triplet=triplet,
                candidate_triplets=candidates,
            )
            print(backbone_triplet)
            # Qualifiers are carried over untouched; tolerate triplets extracted without any
            backbone_triplet['qualifiers'] = triplet.get('qualifiers', [])

            # Refine entity names
            final_triplet = refine_entities(text, backbone_triplet, aligner, sample_id)

            final_triplets.append(final_triplet)
            # TODO: validate that the chosen types really come from the database types
        except Exception as e:
            # Best-effort pipeline: a failing triplet is recorded as filtered instead of
            # aborting the run. The exception type is printed because the bare message
            # (e.g. of a KeyError) is often not interpretable on its own.
            print(f"{type(e).__name__}: {e}")
            filtered_triplets.append(triplet)

    if len(final_triplets) > 0:
        aligner.add_triplets(final_triplets, sample_id=sample_id)
    if len(filtered_triplets) > 0:
        aligner.add_filtered_triplets(filtered_triplets, sample_id=sample_id)
    return final_triplets, filtered_triplets


def _resolve_entity_role(text, triplet, aligner, sample_id, role, lookup_allowed=True):
    """Resolve ``triplet[role]`` ('subject' or 'object') against stored entities, in place.

    When a canonical name is found, the triplet is updated and, if the name
    differs from the extracted one, the extracted name is registered as its
    alias. Otherwise the extracted name is registered as an entity of its own.
    """
    type_key = role + '_type'
    # The string 'None' doubles as the "no match" marker (the LLM may answer with it too)
    resolved_name = 'None'

    if lookup_allowed:
        known_aliases = aligner.retrieve_entity_by_type(
            entity_name=triplet[role],
            entity_type=triplet[type_key],
            sample_id=sample_id
        )
        if len(known_aliases) > 0:
            if triplet[role] in known_aliases:
                # Exact alias hit: take the stored value directly
                resolved_name = known_aliases[triplet[role]]
            else:
                # Otherwise let the LLM choose among the stored candidates
                resolved_name = unidecode(extractor.refine_entity(
                    text=text,
                    triplet=triplet,
                    candidates=list(known_aliases.values()),
                    is_object=(role == 'object')
                ))

    # Punctuation is stripped before the comparison so answers like "None." also count as no match
    if re.sub(r'[^\w\s]', '', resolved_name) == 'None':
        aligner.add_entity(entity_name=triplet[role], alias=triplet[role], entity_type=triplet[type_key], sample_id=sample_id)
        return

    if triplet[role] != resolved_name:
        aligner.add_entity(entity_name=resolved_name, alias=triplet[role], entity_type=triplet[type_key], sample_id=sample_id)
    triplet[role] = resolved_name


def refine_entities(text, triplet, aligner, sample_id):
    """Normalise and canonicalise the subject and object names of a typed triplet.

    The triplet is modified in place and also returned. Uses the module-level
    ``extractor`` for LLM-based disambiguation.
    """
    print(triplet)

    # Transliterate both names to ASCII before any lookup
    for role in ('subject', 'object'):
        triplet[role] = unidecode(triplet[role])

    print(triplet['object_type'], triplet['subject_type'])

    # Objects whose type hierarchy contains one of these ids (time or quantity entities)
    # are never replaced by a stored entity.
    # NOTE(review): assumes the hierarchy is returned as Wikidata QIDs — confirm
    type_ancestors = aligner.retrieve_entity_type_hirerarchy(triplet['object_type'])
    is_time_or_quantity = any(t in ['Q186408', 'Q309314'] for t in type_ancestors)

    # Object first, then subject: the subject step sees the already-updated object
    _resolve_entity_role(text, triplet, aligner, sample_id, 'object', lookup_allowed=not is_time_or_quantity)
    _resolve_entity_role(text, triplet, aligner, sample_id, 'subject')

    return triplet

In [51]:
# Sample sentence passed to extract_triplets() in the following cell.
text = "Yura Borisov is up for Best Supporting Actor for his role in Sean Baker's “Anora” at the Academy Awards on March 2, making him the first Russian to be nominated in an acting category since the fall of the Soviet Union."

In [52]:
import time

# Time one extraction run. perf_counter() is monotonic and high-resolution, so the
# measured duration cannot be distorted by wall-clock adjustments as with time.time().
start_time = time.perf_counter()
result = extract_triplets(text, "0")
end_time = time.perf_counter()
print(f"\nExecution time: {end_time - start_time:.2f} seconds")


{'subject': 'Yura Borisov', 'relation': 'nominated for', 'object': 'Best Supporting Actor', 'subject_type': 'human', 'object_type': 'class of award'}
{'subject': 'Yura Borisov', 'relation': 'nominated for', 'object': 'Best Supporting Actor', 'subject_type': 'human', 'object_type': 'class of award', 'qualifiers': [{'relation': 'for work', 'object': 'Anora'}, {'relation': 'award event', 'object': 'Academy Awards'}, {'relation': 'point in time', 'object': 'March 2'}]}
class of award human
class of award
['artificial object', 'class (collection of items defined by common characteristics)', 'cultural artifact', 'metaclass', 'entity', 'class of award', 'object', 'award', 'abstract entity', 'collective entity']
0
{'Best Supporting Actor': 'Best Supporting Actor'}
['continuant', 'person or organization', 'individual organism', 'natural person', 'class (collection of items defined by common characteristics)', 'person', 'group or class of organisms', 'physical object', 'agent (distinct and ident

In [55]:
# Show every stored entity alias, projecting only label, alias, entity_type and sample_id (no _id).
list(db.get_collection("entity_aliases").find({}, {'_id': 0, "label": 1, "alias": 1, "entity_type": 1, "sample_id": 1}))

[{'label': 'Best Supporting Actor',
  'entity_type': 'award',
  'alias': 'Best Supporting Actor',
  'sample_id': '0'},
 {'label': 'Borisov',
  'entity_type': 'human',
  'alias': 'Borisov',
  'sample_id': '0'},
 {'label': 'fall of the Soviet Union',
  'entity_type': 'event',
  'alias': 'fall of the Soviet Union',
  'sample_id': '0'},
 {'label': 'Anora', 'entity_type': 'film', 'alias': 'Anora', 'sample_id': '0'},
 {'label': 'science fiction',
  'entity_type': 'genre',
  'alias': 'science fiction',
  'sample_id': '0'},
 {'label': 'Inception',
  'entity_type': 'film',
  'alias': 'Inception',
  'sample_id': '0'},
 {'label': 'Borisov',
  'entity_type': 'human',
  'alias': 'Yura Borisov',
  'sample_id': '0'},
 {'label': 'Sean Baker',
  'entity_type': 'human',
  'alias': 'Sean Baker',
  'sample_id': '0'}]

In [24]:
# WARNING: destructive. The empty filter matches every document, so this empties
# the `triplets` collection. Run deliberately, not as part of "Run All".
db.get_collection("triplets").delete_many({})

DeleteResult({'n': 18, 'electionId': ObjectId('7fffffff0000000000000008'), 'opTime': {'ts': Timestamp(1751714329, 2), 't': 8}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1751714329, 2), 'signature': {'hash': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'keyId': 0}}, 'operationTime': Timestamp(1751714329, 2)}, acknowledged=True)

In [25]:
# WARNING: destructive. The empty filter matches every document, so this empties
# the `entity_aliases` collection. Run deliberately, not as part of "Run All".
db.get_collection("entity_aliases").delete_many({})

DeleteResult({'n': 19, 'electionId': ObjectId('7fffffff0000000000000008'), 'opTime': {'ts': Timestamp(1751714329, 4), 't': 8}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1751714329, 4), 'signature': {'hash': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'keyId': 0}}, 'operationTime': Timestamp(1751714329, 4)}, acknowledged=True)

In [23]:
# Display the cost tracked by the extractor
# (presumably the accumulated LLM API spend — confirm against LLMTripletExtractor).
extractor.calculate_cost()

0.0091016

In [62]:
# Inspect the search index definitions configured on the `entity_aliases` collection.
list(db.get_collection('entity_aliases').list_search_indexes())

[{'id': '68460260d541617ce2e16b42',
  'name': 'entities',
  'type': 'search',
  'status': 'READY',
  'queryable': True,
  'latestVersion': 0,
  'latestDefinition': {'mappings': {'dynamic': True,
    'fields': {'entity_type': {'type': 'token'},
     'sample_id': {'type': 'token'},
     'alias_text_embedding': {'type': 'knnVector',
      'dimensions': 768,
      'similarity': 'cosine'}}}}}]

In [11]:
# WARNING: destructive. The empty filter matches every document, so this empties
# the `entity_aliases` collection. Run deliberately, not as part of "Run All".
db.get_collection('entity_aliases').delete_many({})

DeleteResult({'n': 17726, 'electionId': ObjectId('7fffffff0000000000000008'), 'opTime': {'ts': Timestamp(1751709397, 1773), 't': 8}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1751709397, 1773), 'signature': {'hash': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'keyId': 0}}, 'operationTime': Timestamp(1751709397, 1773)}, acknowledged=True)

In [9]:
# Dump every document of the `triplets` collection (unbounded — the output can get large).
list(db.get_collection('triplets').find({}))

[{'_id': ObjectId('686446949362efa843c65a26'),
  'subject': 'Michael Joseph Jackson',
  'object': 'August 29, 1958',
  'relation': 'date of birth',
  'object_type': 'point in time',
  'subject_type': 'human',
  'completion_token_num': 0,
  'prompt_token_nums': 0,
  'qualifiers': [],
  'sample_id': 'a4ddb2ba-eb7e-4268-92ca-fec0036c35cc',
  'source_text_id': 0},
 {'_id': ObjectId('686446969362efa843c65a27'),
  'subject': 'Michael Joseph Jackson',
  'subject_type': 'human',
  'object_type': 'point in time',
  'relation': 'date of death',
  'object': 'June 25, 2009',
  'completion_token_num': 0,
  'prompt_token_nums': 0,
  'qualifiers': [],
  'sample_id': 'a4ddb2ba-eb7e-4268-92ca-fec0036c35cc',
  'source_text_id': 0},
 {'_id': ObjectId('686446979362efa843c65a28'),
  'subject': 'Michael Joseph Jackson',
  'object': 'singer',
  'relation': 'occupation',
  'object_type': 'profession',
  'subject_type': 'human',
  'completion_token_num': 0,
  'prompt_token_nums': 0,
  'qualifiers': [],
  'samp

In [27]:
# List the collections currently present in the database.
db.list_collection_names()

['entity_types',
 'properties',
 'entity_aliases',
 'triplets',
 'entity_type_aliases',
 'property_aliases']

In [32]:
import time
# Input sentence for this run; alternative sample inputs are kept commented out below.
text = "In 2010, Christopher Nolan directed the science fiction movie Inception"
# text = 'Sam Altman’s net worth is a major topic for people following the tech industry, as the CEO of OpenAI is one of the most prominent figures out there, especially with the rise of artificial intelligence (AI) and chatbots like ChatGPT.'
# text = "Musk’s xAI releases artificial intelligence model Grok 3, claims better performance than rivals in early testing."
# perf_counter() is monotonic and high-resolution, so the measured duration cannot be
# distorted by wall-clock adjustments as with time.time().
start_time = time.perf_counter()
result = extract_triplets(text, "0")
end_time = time.perf_counter()
print(f"\nExecution time: {end_time - start_time:.2f} seconds")


{'subject': 'Christopher Nolan', 'relation': 'director', 'object': 'Inception', 'subject_type': 'human', 'object_type': 'movie'}
{'subject': 'Christopher Nolan', 'relation': 'director', 'object': 'Inception', 'subject_type': 'human', 'object_type': 'movie', 'qualifiers': [{'relation': 'point in time', 'object': '2010'}]}
movie human
movie
'NoneType' object is not subscriptable
{'subject': 'Inception', 'relation': 'genre', 'object': 'science fiction', 'subject_type': 'film', 'object_type': 'genre'}
{'subject': 'Inception', 'relation': 'genre', 'object': 'science fiction', 'subject_type': 'film', 'object_type': 'genre', 'qualifiers': []}
genre film
genre
['class (collection of items defined by common characteristics)', 'metaclass', 'entity', 'genre', 'abstract entity', 'collective entity']
0
{}
['video work', 'video and/or audio work', 'function', 'film', 'binary relation', 'sequence', 'intellectual work', 'artificial object', 'creative work', 'class (mathematical collection of sets that

In [33]:
# Display the cost tracked by the extractor
# (presumably the accumulated LLM API spend — confirm against LLMTripletExtractor).
extractor.calculate_cost()

0.013104400000000002

In [34]:
# Display the (final_triplets, filtered_triplets) tuple returned by the last extract_triplets() call.
result

([{'subject': 'Inception',
   'relation': 'genre',
   'object': 'science fiction',
   'subject_type': 'film',
   'object_type': 'genre',
   'qualifiers': [],
   'sample_id': '0',
   '_id': ObjectId('684604708cb451afb7c0ed2d')}],
 [{'subject': 'Christopher Nolan',
   'relation': 'director',
   'object': 'Inception',
   'qualifiers': [{'relation': 'point in time', 'object': '2010'}],
   'subject_type': 'human',
   'object_type': 'movie',
   'sample_id': '0',
   '_id': ObjectId('684604708cb451afb7c0ed2e')}])

In [35]:
from utils.structured_dynamic_index_utils import Aligner as Aligner
# Rebind the module-level `aligner` read by extract_triplets() to the Aligner from
# utils.structured_dynamic_index_utils (constructed without a database handle),
# and create a new extractor instance for the same model.
aligner = Aligner()
extractor = LLMTripletExtractor(model=model_name)

In [None]:
import time
# aligner = Aligner()
# perf_counter() is monotonic and high-resolution, so the measured duration cannot be
# distorted by wall-clock adjustments as with time.time().
start_time = time.perf_counter()
# extract_triplets() requires a sample_id; calling it with the text alone raises TypeError.
# "0" matches the id used by the other runs in this notebook.
# NOTE(review): confirm the currently bound aligner accepts the sample_id-based calls.
result = extract_triplets(text, "0")
end_time = time.perf_counter()
print(f"\nExecution time: {end_time - start_time:.2f} seconds")


In [None]:
# Display the cost tracked by the extractor
# (presumably the accumulated LLM API spend — confirm against LLMTripletExtractor).
extractor.calculate_cost()