In [2]:
import numpy as np
from tqdm import tqdm

import json
import pandas as pd

from transformers import AutoTokenizer, AutoModel

from utils.openai_utils import LLMTripletExtractor
from utils.structured_dynamic_index_utils import Aligner


import warnings
warnings.filterwarnings('ignore')
import os
import tiktoken

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
! ls

analysis			   musique.json
attempt2res.json		   musique_res
attempt2res_musique.json	   musique_res_gpt-4.1
create_indexes.py		   musique_structured_inference.py
estimate_llm_cost.ipynb		   musique_updated_res
hotpot_gpt_4.1			   musique_updated_res_9_04
hotpot_gpt_4.1-mini		   musique_updated_res_gpt_4.1-mini
hotpot_gpt-4o-mini		   musique_updated_res_gpt4o_mini
hotpotqa200.json		   pipeline_test.ipynb
hotpot_qa_structured_inference.py  populate_db.py
kg-from-wiki-dynamic.ipynb	   preprocessing
kg-from-wiki.ipynb		   requirements.txt
llama-musique_openrouter_test	   setup_db.sh
logs				   synthie-tests.ipynb
mongo_test.ipynb		   utils
musique_200_test.json		   venv
musique_gpt4o_mini		   wikidata_vs_text2kg.ipynb


In [5]:
c = 0
for file in os.listdir("hotpot_gpt-4o-mini"):
    if "final" in file:
        c += 1

c

51

In [2]:
class CostCalc:
    model_pricing = {
        "gpt-4o-mini": (0.15, 0.6),
        "gpt-4o": (5.0, 15.0),
    }

    def __init__(self, model: str) -> None:
        self.model = model
        self.encoding = tiktoken.encoding_for_model(model)

    def count_tokens(self, content: str):
        return len(self.encoding.encode(content))

    def calculate_input_cost(self, content: str):
        return (
            self.model_pricing[self.model][0] / 10**6 * self.count_tokens(content)
        )

    def calculate_output_cost(self, content: str):
        return (
            self.model_pricing[self.model][1] / 10**6 * self.count_tokens(content)
        )


calc = CostCalc("gpt-4o")

In [3]:
aligner = Aligner()
model_name = 'gpt-4o'
extractor = LLMTripletExtractor(model=model_name)

In [4]:
# text = "Borisov is up for Best Supporting Actor for his role in Sean Baker's “Anora” at the Academy Awards on March 2, making him the first Russian to be nominated in an acting category since the fall of the Soviet Union."
# input_1st_step = extractor.extract_triplets_from_text(text)
# system_prompt = input_1st_step['system_prompt']
# user_prompt = input_1st_step['user_prompt']

# calc.calculate_input_cost(system_prompt) + calc.calculate_input_cost(user_prompt)


In [2]:
import json
with open("hotpotqa200.json", "r") as f:
    ds = json.load(f)

id2sample = {}
for elem in ds:
    id2sample[elem['_id']] = elem
len(id2sample)

200

In [4]:
ds[0]

{'_id': '5a7613c15542994ccc9186bf',
 'answer': 'Gesellschaft mit beschränkter Haftung',
 'question': "VIVA Media AG changed it's name in 2004. What does their new acronym stand for?",
 'supporting_facts': [['VIVA Media', 0],
  ['Gesellschaft mit beschränkter Haftung', 0]],
 'context': [['Constantin Medien',
   ['Constantin Medien AG (formerly EM.Entertainment and EM.TV & Merchandising AG, then EM.TV AG, and finally em.sport media ag) is a German media group, based in Ismaning near Munich, active in the area of sports, film and event marketing to medium-sized media companies.']],
  ['VIVA Poland',
   ['VIVA Polska (earlier "VIVApolska!")',
    ' is a Polish 24h music and entertainment channel from Viacom International Media Networks Polska.',
    ' The channel was officially launched on June 10, 2000 by the German VIVA Media AG.']],
  ['Viva (UK and Ireland)',
   ['Viva (stylised as VIVA) is a music television channel in the United Kingdom and Ireland, owned by VIVA Media and thereby Vi

In [7]:
COST = 0
for sample_id in tqdm(id2sample):

    sample = id2sample[sample_id]

    aligner = Aligner()
    model_name = 'gpt-4o'
    extractor = LLMTripletExtractor(model=model_name)
    
    texts = [" ".join(item[1]) for item in sample['context']]
    for text in texts:
        input_1st_step = extractor.extract_triplets_from_text(text)
        system_prompt = input_1st_step['system_prompt']
        user_prompt = input_1st_step['user_prompt']

        COST += calc.calculate_input_cost(system_prompt) + calc.calculate_input_cost(user_prompt)

100%|██████████| 200/200 [01:55<00:00,  1.73it/s]


In [8]:
COST

9.599879999999992

In [22]:
triplets_num = []
output_costs = []
id2answer = {}
for file in tqdm(os.listdir('musique_updated_res_9_04'), total=100):    
    if 'final' in file:
        df = pd.read_csv('musique_updated_res_9_04/{}'.format(file), index_col=0)
        rows_as_dict = df[["subject", "relation", "object", "subject_type", "object_type", "qualifiers"]].to_dict(orient='records')
        triplets_num.extend(df['source_text_ids'].value_counts().tolist())
        
        output_costs.extend([calc.calculate_output_cost(json.dumps(row)) for row in rows_as_dict])
        
        

100%|██████████| 100/100 [00:00<00:00, 152.03it/s]


In [23]:
np.median(triplets_num) * np.median(output_costs) * 200

1.2240000000000002

In [24]:
np.max(triplets_num) * np.max(output_costs) * 200

27.156000000000002

In [10]:
def extract_triplets(text):
    """Extract and refine knowledge graph triplets from text using LLM.
    
    Args:
        text (str): Input text to extract triplets from
        
    Returns:
        tuple: (final_triplets, filtered_triplets) where:
            - final_triplets: List of validated and refined triplets
            - filtered_triplets: List of triplets that couldn't be validated
    """
    # Extract initial triplets using LLM
    COST = 0
    extracted_triplets = extractor.extract_triplets_from_text(text)
    

    for triplet in extracted_triplets['triplets']:
        # print("1st step triplet: ", triplet)
        
        # Get candidate entity types
        subj_type_ids, obj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
        
        # Get candidate properties/relations
        properties = aligner.retrieve_properties_for_entity_type(
            target_relation=triplet['relation'],
            object_types=obj_type_ids, 
            subject_types=subj_type_ids,
            k=5
        )

        # Build candidate triplet backbones
        # print(properties)
        candidates = []
        for prop_id, prop_label, prop_direction in properties:
            if prop_direction == 'direct':
                subject_types = set(subj_type_ids) & set(aligner.prop2constraints[prop_id]['Subject type constraint'])
                object_types = set(obj_type_ids) & set(aligner.prop2constraints[prop_id]['Value-type constraint'])
            else:
                object_types = set(subj_type_ids) & set(aligner.prop2constraints[prop_id]['Value-type constraint']) 
                subject_types = set(obj_type_ids) & set(aligner.prop2constraints[prop_id]['Subject type constraint'])

            # Use original type sets if no constraints matched
            subject_types = subj_type_ids if len(subject_types) == 0 else subject_types
            object_types = obj_type_ids if len(object_types) == 0 else object_types

            candidates.append({
                "subject": triplet['subject'] if prop_direction == 'direct' else triplet['object'],
                "relation": prop_label,
                'object': triplet['object'] if prop_direction == 'direct' else triplet['subject'],
                "subject_types": [aligner.entity_type2label[t] for t in subject_types],
                "object_types": [aligner.entity_type2label[t] for t in object_types]
            })

            # print({
            #     "subject": triplet['subject'] if prop_direction == 'direct' else triplet['object'],
            #     "relation": prop_label,
            #     'object': triplet['object'] if prop_direction == 'direct' else triplet['subject'],
            #     "subject_types": [aligner.entity_type2label[t] for t in subject_types],
            #     "object_types": [aligner.entity_type2label[t] for t in object_types]
            # })


        # Refine relation and entity types using LLM - choose among valid backbones for triplet
        input_2nd_step =  extractor.refine_relation_and_entity_types(
            text=text, 
            triplet=triplet,
            candidate_triplets=candidates
        )
        system_prompt = input_2nd_step['system_prompt']
        user_prompt = input_2nd_step['user_prompt']

        COST += calc.calculate_input_cost(system_prompt) + calc.calculate_input_cost(user_prompt)

    return COST

In [12]:
COST = 0
for sample_id in id2sample:

    sample = id2sample[sample_id]

    aligner = Aligner()
    model_name = 'gpt-4o'
    extractor = LLMTripletExtractor(model=model_name)
    
    texts = [item['paragraph_text'] for item in sample['paragraphs']]
    for text in tqdm(texts):        
        COST += extract_triplets(text)

    break

COST

100%|██████████| 20/20 [01:55<00:00,  5.79s/it]


0.3911350000000001

In [27]:
COST * 4

1.71562

In [7]:
def extract_triplets(text):
    COST = 0
    """Extract and refine knowledge graph triplets from text using LLM.
    
    Args:
        text (str): Input text to extract triplets from
        
    Returns:
        tuple: (final_triplets, filtered_triplets) where:
            - final_triplets: List of validated and refined triplets
            - filtered_triplets: List of triplets that couldn't be validated
    """
    # Extract initial triplets using LLM
    extracted_triplets = extractor.extract_triplets_from_text(text)
    
    final_triplets = []
    filtered_triplets = []

    for triplet in extracted_triplets['triplets']:
        # print("1st step triplet: ", triplet)
        
        # Get candidate entity types
        subj_type_ids, obj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
        
        # Get candidate properties/relations
        properties = aligner.retrieve_properties_for_entity_type(
            target_relation=triplet['relation'],
            object_types=obj_type_ids, 
            subject_types=subj_type_ids,
            k=5
        )

        # Build candidate triplet backbones
        # print(properties)
        candidates = []
        for prop_id, prop_label, prop_direction in properties:
            if prop_direction == 'direct':
                subject_types = set(subj_type_ids) & set(aligner.prop2constraints[prop_id]['Subject type constraint'])
                object_types = set(obj_type_ids) & set(aligner.prop2constraints[prop_id]['Value-type constraint'])
            else:
                object_types = set(subj_type_ids) & set(aligner.prop2constraints[prop_id]['Value-type constraint']) 
                subject_types = set(obj_type_ids) & set(aligner.prop2constraints[prop_id]['Subject type constraint'])

            # Use original type sets if no constraints matched
            subject_types = subj_type_ids if len(subject_types) == 0 else subject_types
            object_types = obj_type_ids if len(object_types) == 0 else object_types

            candidates.append({
                "subject": triplet['subject'] if prop_direction == 'direct' else triplet['object'],
                "relation": prop_label,
                'object': triplet['object'] if prop_direction == 'direct' else triplet['subject'],
                "subject_types": [aligner.entity_type2label[t] for t in subject_types],
                "object_types": [aligner.entity_type2label[t] for t in object_types]
            })

            # print({
            #     "subject": triplet['subject'] if prop_direction == 'direct' else triplet['object'],
            #     "relation": prop_label,
            #     'object': triplet['object'] if prop_direction == 'direct' else triplet['subject'],
            #     "subject_types": [aligner.entity_type2label[t] for t in subject_types],
            #     "object_types": [aligner.entity_type2label[t] for t in object_types]
            # })


        # Refine relation and entity types using LLM - choose among valid backbones for triplet
        backbone_triplet = extractor.refine_relation_and_entity_types(
            text=text, 
            triplet=triplet,
            candidate_triplets=candidates
        )
        backbone_triplet['qualifiers'] = triplet['qualifiers']

        # Refine entity names
        COST += refine_entities(text, backbone_triplet, aligner)
        


    # print("-"*100)
    return COST


def refine_entities(text, triplet, aligner):
    """Refine entity names using type constraints."""
    # Handle object refinement
    obj_type = triplet['object_type']
    obj_type_id = aligner.label2entity_type.get(obj_type, '')
    obj_hierarchy = [obj_type_id] + aligner.entity2hierarchy.get(obj_type_id, [])
    updated_obj = 'None'
    
    # do not change time or quantity entities
    similar_objects = aligner.retrieve_entity_by_type(
        entity=triplet['object'],
        entity_type=obj_type
    )
    input_3d_step = extractor.refine_entity(
        text=text,
        triplet=triplet,
        candidates=similar_objects,
        is_object=True
    )

    system_prompt = input_3d_step['system_prompt']
    user_prompt = input_3d_step['user_prompt']

    COST = 2 * calc.calculate_input_cost(system_prompt) + calc.calculate_input_cost(user_prompt)

    return COST

In [8]:
COST = 0
for sample_id in id2sample:

    sample = id2sample[sample_id]

    aligner = Aligner()
    model_name = 'gpt-4o'
    extractor = LLMTripletExtractor(model=model_name)
    
    texts = [item['paragraph_text'] for item in sample['paragraphs']]
    for text in tqdm(texts):        
        COST += extract_triplets(text)

    break

COST

100%|██████████| 20/20 [04:02<00:00, 12.13s/it]


0.428905

In [12]:
0.42/50*200

1.68

In [19]:
1.62*200

324.0

In [20]:
18+30+30+2+2+10

92

In [21]:
92/200

0.46

In [32]:
toks = []
for sample_id in id2sample:

    sample = id2sample[sample_id]
    
    texts = [item['paragraph_text'] for item in sample['paragraphs']]
    for text in tqdm(texts):        
        toks.extend(text.split())
    break

100%|██████████| 20/20 [00:00<00:00, 136178.70it/s]


In [33]:
len(toks)

1079