In [1]:
import time
import numpy as np
from tqdm import tqdm

import json
import jsonlines
import openai
import os
import sys
import logging
import pprint

import torch
from transformers import AutoTokenizer, AutoModel

from utils.eval_utils import micro_precision, micro_recall, f1_score
from utils.openai_utils import LLMTripletExtractor
from utils.verifier_utils import TripletFilter
from utils.structured_dynamic_index_utils import Aligner

import faiss
import argparse
import re
import warnings
# Install a blanket 'ignore' filter: every Python warning raised after this
# line is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings from the
# libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Examples

In [2]:
# Shared Aligner instance (from utils.structured_dynamic_index_utils), built
# with default arguments; the example cells below call its
# retrieve_similar_entity_types / retrieve_properties_for_entity_type methods.
aligner = Aligner()

In [3]:
# Example 1 input: a single news sentence, passed to the extractor in the next cells.
text = "Borisov is up for Best Supporting Actor for his role in Sean Baker's “Anora” at the Academy Awards on March 2, making him the first Russian to be nominated in an acting category since the fall of the Soviet Union."

In [4]:
# Model identifier handed to the triplet extractor.
model_name = 'gpt-4o'

# Triplet extractor configured with the model above and the first-stage prompt file.
# NOTE(review): 'propmt' in the file name looks like a misspelling of 'prompt';
# confirm against the file on disk before changing the path.
extractor = LLMTripletExtractor(
    model=model_name,
    prompt1_path='utils/prompts/propmt_1_types_qualifiers.txt',
)

In [5]:
# Run the extractor's first query on the current `text`; the result is
# indexed by the 'triplets' key below.
extracted_triplets = extractor.get_completion_first_query(text)

# Bare last expression so the notebook displays the extracted triplets.
extracted_triplets['triplets']

[{'subject': 'Borisov',
  'relation': 'nominated for',
  'object_': 'Best Supporting Actor',
  'qualifiers': [{'relation': 'for work', 'object_': 'Anora'},
   {'relation': 'award received', 'object_': 'Academy Awards'},
   {'relation': 'point in time', 'object_': 'March 2'}],
  'subject_type': 'person',
  'object_type': 'award category'},
 {'subject': 'Borisov',
  'relation': 'first',
  'object_': 'Russian nominated in an acting category',
  'qualifiers': [{'relation': 'since', 'object_': 'fall of the Soviet Union'}],
  'subject_type': 'person',
  'object_type': 'achievement'},
 {'subject': 'Anora',
  'relation': 'director',
  'object_': 'Sean Baker',
  'qualifiers': [],
  'subject_type': 'film',
  'object_type': 'person'}]

In [6]:
# For each extracted triplet: fetch the similar entity-type ids, then the
# properties retrieved for its relation given those ids, and print a short report.
for triplet in extracted_triplets['triplets']:
    obj_type_ids, subj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
    properties = aligner.retrieve_properties_for_entity_type(
        target_relation=triplet['relation'],
        object_types=obj_type_ids,
        subject_types=subj_type_ids,
    )
    # Report layout: "<subject> <relation> <object>", the two type labels,
    # the retrieved property list, then a 100-dash rule between triplets.
    print(f"{triplet['subject']} {triplet['relation']} {triplet['object_']}")
    print(f"{triplet['subject_type']} {triplet['object_type']}")
    print(properties)
    print('-' * 100)

Borisov nominated for Best Supporting Actor
person award category
['nominated for', 'winner', 'genre', 'founded by', 'notable work', 'honorific suffix', 'issued by', 'donated by', 'intended public', 'applies to jurisdiction']
----------------------------------------------------------------------------------------------------
Borisov first Russian nominated in an acting category
person achievement
['winner', 'director', 'team captain', 'nominated for', 'head coach', 'rector', 'owned by', 'choral conductor', 'founded by', 'general manager']
----------------------------------------------------------------------------------------------------
Anora director Sean Baker
film person
['director', 'director of photography', 'art director', 'film editor', 'choreographer', 'musical conductor', 'translator', 'animator', 'nominated for', 'lyricist']
----------------------------------------------------------------------------------------------------


In [7]:
# Example 2 input: rebinds `text`, replacing the previous example's sentence.
text = 'Sam Altman’s net worth is a major topic for people following the tech industry, as the CEO of OpenAI is one of the most prominent figures out there, especially with the rise of artificial intelligence (AI) and chatbots like ChatGPT. '

In [8]:
# Re-run the extractor's first query on the current `text`; this rebinds
# `extracted_triplets`, so the previous example's result is no longer available.
extracted_triplets = extractor.get_completion_first_query(text)

# Bare last expression so the notebook displays the extracted triplets.
extracted_triplets['triplets']

[{'subject': 'Sam Altman',
  'relation': 'position held',
  'object_': 'CEO',
  'qualifiers': [{'relation': 'of', 'object_': 'OpenAI'}],
  'subject_type': 'person',
  'object_type': 'position'},
 {'subject': 'Sam Altman',
  'relation': 'notable for',
  'object_': 'rise of artificial intelligence',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'event'},
 {'subject': 'Sam Altman',
  'relation': 'notable for',
  'object_': 'chatbots like ChatGPT',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'technology'}]

In [9]:
# For each extracted triplet: fetch the similar entity-type ids, then the
# properties retrieved for its relation given those ids, and print a short report.
for triplet in extracted_triplets['triplets']:
    obj_type_ids, subj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
    properties = aligner.retrieve_properties_for_entity_type(
        target_relation=triplet['relation'],
        object_types=obj_type_ids,
        subject_types=subj_type_ids,
    )
    # Report layout: "<subject> <relation> <object>", the two type labels,
    # the retrieved property list, then a 100-dash rule between triplets.
    print(f"{triplet['subject']} {triplet['relation']} {triplet['object_']}")
    print(f"{triplet['subject_type']} {triplet['object_type']}")
    print(properties)
    print('-' * 100)

Sam Altman position held CEO
person position
['position held', 'position holder', 'appointed by', 'appointed by', 'professorship', 'nominated by', 'nominated by', 'occupation', 'position played on team / speciality', 'military or police rank']
----------------------------------------------------------------------------------------------------
Sam Altman notable for rise of artificial intelligence
person event
['notable work', 'founded by', 'owned by', 'competition won', 'ordered by', 'indigenous to', 'performer', 'killed by', 'exhibited creator', 'ethnic group']
----------------------------------------------------------------------------------------------------
Sam Altman notable for chatbots like ChatGPT
person technology
['manufacturer', 'developer', 'genre', 'indigenous to', 'industry', 'publisher', 'applies to jurisdiction', 'item operated', 'occupant', 'product, material, or service produced or provided']
----------------------------------------------------------------------------------------------------

In [10]:
# Example 3 input: rebinds `text`, replacing the previous example's sentence.
text = "Musk’s xAI releases artificial intelligence model Grok 3, claims better performance than rivals in early testing."

In [11]:
# Re-run the extractor's first query on the current `text`; this rebinds
# `extracted_triplets`, so the previous example's result is no longer available.
extracted_triplets = extractor.get_completion_first_query(text)

# Bare last expression so the notebook displays the extracted triplets.
extracted_triplets['triplets']

[{'subject': 'xAI',
  'relation': 'developed',
  'object_': 'Grok 3',
  'qualifiers': [],
  'subject_type': 'organization',
  'object_type': 'artificial intelligence model'},
 {'subject': 'Grok 3',
  'relation': 'claimed performance',
  'object_': 'better than rivals',
  'qualifiers': [{'relation': 'point in time', 'object_': 'early testing'}],
  'subject_type': 'artificial intelligence model',
  'object_type': 'performance'}]

In [12]:
# For each extracted triplet: fetch the similar entity-type ids, then the
# properties retrieved for its relation given those ids, and print a short report.
for triplet in extracted_triplets['triplets']:
    obj_type_ids, subj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
    properties = aligner.retrieve_properties_for_entity_type(
        target_relation=triplet['relation'],
        object_types=obj_type_ids,
        subject_types=subj_type_ids,
    )
    # Report layout: "<subject> <relation> <object>", the two type labels,
    # the retrieved property list (may be empty), then a 100-dash rule.
    print(f"{triplet['subject']} {triplet['relation']} {triplet['object_']}")
    print(f"{triplet['subject_type']} {triplet['object_type']}")
    print(properties)
    print('-' * 100)

xAI developed Grok 3
organization artificial intelligence model
['developer', 'commissioned by', 'designed by', 'founded by', 'manufacturer', 'owned by', 'creator', 'separated from', 'separated from', 'owner of']
----------------------------------------------------------------------------------------------------
Grok 3 claimed performance better than rivals
artificial intelligence model performance
[]
----------------------------------------------------------------------------------------------------


In [13]:
# Example 4 input: a multi-sentence passage; rebinds `text`, replacing the previous example.
text = 'In addition to directing “The Phoenician Scheme,” Anderson penned the script with the story co-written by Roman Coppola. Focus will distribute the film domestically. “The Phoenician Scheme” will open in limited release on May 30, 2025 and expand wide on June 6. Universal Pictures, Focus’s parent studio, is handling international distribution.'

In [14]:
# Re-run the extractor's first query on the current `text`; this rebinds
# `extracted_triplets`, so the previous example's result is no longer available.
extracted_triplets = extractor.get_completion_first_query(text)

# Bare last expression so the notebook displays the extracted triplets.
extracted_triplets['triplets']

[{'subject': 'Anderson',
  'relation': 'director of',
  'object_': 'The Phoenician Scheme',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'film'},
 {'subject': 'Anderson',
  'relation': 'screenwriter of',
  'object_': 'The Phoenician Scheme',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'film'},
 {'subject': 'Roman Coppola',
  'relation': 'co-writer of',
  'object_': 'The Phoenician Scheme',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'film'},
 {'subject': 'Focus',
  'relation': 'distributor of',
  'object_': 'The Phoenician Scheme',
  'qualifiers': [{'relation': 'distribution scope', 'object_': 'domestic'}],
  'subject_type': 'company',
  'object_type': 'film'},
 {'subject': 'The Phoenician Scheme',
  'relation': 'limited release date',
  'object_': 'May 30, 2025',
  'qualifiers': [],
  'subject_type': 'film',
  'object_type': 'date'},
 {'subject': 'The Phoenician Scheme',
  'relation': 'wide release date',
  'object_': 'June

In [15]:
# For each extracted triplet: fetch the similar entity-type ids, then the
# properties retrieved for its relation given those ids, and print a short report.
for triplet in extracted_triplets['triplets']:
    obj_type_ids, subj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
    properties = aligner.retrieve_properties_for_entity_type(
        target_relation=triplet['relation'],
        object_types=obj_type_ids,
        subject_types=subj_type_ids,
    )
    # Report layout: "<subject> <relation> <object>", the two type labels,
    # the retrieved property list, then a 100-dash rule between triplets.
    print(f"{triplet['subject']} {triplet['relation']} {triplet['object_']}")
    print(f"{triplet['subject_type']} {triplet['object_type']}")
    print(properties)
    print('-' * 100)

Anderson director of The Phoenician Scheme
person film
['director', 'director of photography', 'art director', 'musical conductor', 'film editor', 'choreographer', 'translator', 'nominated for', 'voice actor', 'animator']
----------------------------------------------------------------------------------------------------
Anderson screenwriter of The Phoenician Scheme
person film
['film editor', 'director', 'filmography', 'director of photography', 'lyricist', 'nominated for', 'art director', 'animator', 'voice actor', 'film crew member']
----------------------------------------------------------------------------------------------------
Roman Coppola co-writer of The Phoenician Scheme
person film
['nominated for', 'lyricist', 'director', 'voice actor', 'film editor', 'director of photography', 'filmography', 'cast member', 'choreographer', 'art director']
----------------------------------------------------------------------------------------------------
Focus distributor of The Phoeni