In [1]:
import time
import numpy as np
from tqdm import tqdm

import json
import jsonlines
import openai
import os
import sys
import logging
import pprint

import torch
from transformers import AutoTokenizer, AutoModel

from utils.eval_utils import micro_precision, micro_recall, f1_score
from utils.openai_utils import LLMTripletExtractor
from utils.verifier_utils import TripletFilter
from utils.structured_dynamic_index_utils import Aligner

import faiss
import argparse
import re
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Examples

In [2]:
# Build the project's Aligner with default arguments; the cells below query it via `aligner`.
aligner = Aligner()

In [16]:
# Sample sentence fed to the triplet extractor in the next cells.
text = "Borisov is up for Best Supporting Actor for his role in Sean Baker's “Anora” at the Academy Awards on March 2, making him the first Russian to be nominated in an acting category since the fall of the Soviet Union."

In [17]:
# Name of the LLM handed to the extractor.
model_name = 'gpt-4o'

# NOTE(review): 'propmt' in the file name looks like a misspelling of 'prompt' —
# confirm against the file on disk before renaming anything; the path is kept as-is here.
extractor = LLMTripletExtractor(
    model=model_name,
    prompt1_path='utils/prompts/propmt_1_types_qualifiers.txt',
)

In [18]:
# Run the extractor's first-query completion on the current sample `text`.
extracted_triplets = extractor.get_completion_first_query(text)

# Bare last expression: the notebook renders the 'triplets' entry as the cell output.
extracted_triplets['triplets']

[{'subject': 'Borisov',
  'relation': 'nominated for',
  'object_': 'Best Supporting Actor',
  'qualifiers': [{'relation': 'for work', 'object_': 'Anora'},
   {'relation': 'award received', 'object_': 'Academy Awards'},
   {'relation': 'point in time', 'object_': 'March 2'}],
  'subject_type': 'person',
  'object_type': 'award category'},
 {'subject': 'Borisov',
  'relation': 'first',
  'object_': 'Russian nominated in an acting category',
  'qualifiers': [{'relation': 'since', 'object_': 'fall of the Soviet Union'}],
  'subject_type': 'person',
  'object_type': 'achievement'},
 {'subject': 'Sean Baker',
  'relation': 'director of',
  'object_': 'Anora',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'film'}]

In [19]:
# For every extracted triplet: look up similar entity types, retrieve the matching
# properties, and print them with type ids resolved to labels.
for triplet in extracted_triplets['triplets']:
    # NOTE(review): unpacking assumes the helper returns object-type ids first,
    # then subject-type ids — confirm against Aligner.retrieve_similar_entity_types.
    obj_type_ids, subj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
    properties = aligner.retrieve_properties_for_entity_type(
        target_relation=triplet['relation'],
        object_types=obj_type_ids,
        subject_types=subj_type_ids,
    )

    # Header lines: the raw triplet, then the entity types attached to it.
    print(triplet['subject'], triplet['relation'], triplet['object_'])
    print(triplet['subject_type'], ",", triplet['object_type'])

    # Compact (label, type) overview of everything that was retrieved.
    print([(candidate['label'], candidate['type']) for candidate in properties])

    # One line per property: subject type labels, property label, object type labels.
    for p in properties:
        subject_labels = [aligner.entity2label[type_id] for type_id in p['subject_types']]
        object_labels = [aligner.entity2label[type_id] for type_id in p['object_types']]
        print(subject_labels, p['label'], object_labels)

    # Visual separator between triplets.
    print('-' * 100)

Borisov nominated for Best Supporting Actor
person , award category
[('nominated for', 'inverse'), ('trophy awarded', 'direct'), ('winner', 'direct'), ('performer', 'direct'), ('published in', 'inverse'), ('genre', 'direct'), ('genre', 'inverse'), ('developer', 'direct'), ('presenter', 'direct'), ('appointed by', 'inverse')]
['juridical person', 'legal person', 'people mover', 'natural person', 'human', 'grammatical person'] nominated for ['award', 'award ceremony', 'music award', 'group of awards']
['award ceremony'] trophy awarded ['person', 'age of a person', 'juridical person', 'human population', 'hypothetical person', 'legal person', 'people mover', 'fictional human', 'human head', 'natural person', 'individual', 'fictional person', 'human', 'indigenous people', 'people', 'grammatical person', 'personification']
['award', 'music award', 'group of awards'] winner ['juridical person', 'legal person', 'people mover', 'fictional human', 'natural person', 'fictional person', 'human', 

Sean Baker director of Anora
person , film
[('director', 'direct'), ('director of publication', 'direct'), ('director of photography', 'direct'), ('editor', 'direct'), ('appointed by', 'direct'), ('appointed by', 'inverse'), ('producer', 'direct'), ('producer', 'inverse'), ('author', 'inverse'), ('author', 'direct')]
['dubbing of film', 'film series', 'film', 'première', 'fictional film', 'documentary film', 'television film', 'animated film'] director ['human']
['television film'] director of publication ['human']
['film series', 'film', 'fictional film', 'documentary film', 'television film', 'animated film'] director of photography ['human', 'fictional human']
['filmography', 'drama', 'audio release', 'dubbing of film', 'film series', 'film', 'documentary film', 'film poster', 'television film', 'animated film', 'film set'] editor ['human']
['film character'] appointed by ['human', 'fictional human', 'fictional person']
['human', 'fictional human', 'fictional person'] appointed by [

In [20]:
# Next sample sentence; rebinding `text` replaces the previous example's input.
text = 'Sam Altman’s net worth is a major topic for people following the tech industry, as the CEO of OpenAI is one of the most prominent figures out there, especially with the rise of artificial intelligence (AI) and chatbots like ChatGPT. '

In [21]:
# Run the extractor's first-query completion on the current sample `text`.
extracted_triplets = extractor.get_completion_first_query(text)

# Bare last expression: the notebook renders the 'triplets' entry as the cell output.
extracted_triplets['triplets']

[{'subject': 'Sam Altman',
  'relation': 'net worth',
  'object_': 'major topic',
  'qualifiers': [{'relation': 'for',
    'object_': 'people following the tech industry'}],
  'subject_type': 'person',
  'object_type': 'concept'},
 {'subject': 'Sam Altman',
  'relation': 'position held',
  'object_': 'CEO of OpenAI',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'position'},
 {'subject': 'Sam Altman',
  'relation': 'notable for',
  'object_': 'rise of artificial intelligence',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'event'},
 {'subject': 'Sam Altman',
  'relation': 'notable for',
  'object_': 'chatbots like ChatGPT',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'technology'}]

In [22]:
# For every extracted triplet: look up similar entity types, retrieve the matching
# properties, and print them with type ids resolved to labels.
for triplet in extracted_triplets['triplets']:
    # NOTE(review): unpacking assumes the helper returns object-type ids first,
    # then subject-type ids — confirm against Aligner.retrieve_similar_entity_types.
    obj_type_ids, subj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
    properties = aligner.retrieve_properties_for_entity_type(
        target_relation=triplet['relation'],
        object_types=obj_type_ids,
        subject_types=subj_type_ids,
    )

    # Header lines: the raw triplet, then the entity types attached to it.
    print(triplet['subject'], triplet['relation'], triplet['object_'])
    print(triplet['subject_type'], ",", triplet['object_type'])

    # Compact (label, type) overview of everything that was retrieved.
    print([(candidate['label'], candidate['type']) for candidate in properties])

    # One line per property: subject type labels, property label, object type labels.
    for p in properties:
        subject_labels = [aligner.entity2label[type_id] for type_id in p['subject_types']]
        object_labels = [aligner.entity2label[type_id] for type_id in p['object_types']]
        print(subject_labels, p['label'], object_labels)

    # Visual separator between triplets.
    print('-' * 100)

Sam Altman net worth major topic
person , concept
[('owner of', 'inverse'), ('owner of', 'direct'), ('chief executive officer', 'direct'), ('corporate officer', 'direct'), ('chief operating officer', 'direct'), ('board member', 'direct'), ('industry', 'direct'), ('author', 'inverse'), ('author', 'direct'), ('employer', 'direct')]
['human population', 'fictional human', 'fictional person', 'human', 'indigenous people', 'people'] owner of ['sketch', 'feature', 'philosophical concept', 'political initiative', 'legal concept', 'game mechanic', 'definition', 'prototype', 'premise', 'concept car']
['political initiative'] owner of ['person', 'juridical person', 'human population', 'hypothetical person', 'legal person', 'people mover', 'fictional human', 'human head', 'natural person', 'individual', 'fictional person', 'human', 'indigenous people', 'people', 'grammatical person']
['political initiative'] chief executive officer ['human', 'fictional human']
['political initiative'] corporate o

Sam Altman notable for chatbots like ChatGPT
person , technology
[('notable work', 'inverse'), ('has written for', 'inverse'), ('designed by', 'direct'), ('founded by', 'direct'), ('screenwriter', 'direct'), ('commissioned by', 'direct'), ('producer', 'direct'), ('owned by', 'direct'), ('investigated by', 'direct'), ('composer', 'direct')]
['human population', 'fictional human', 'fictional person', 'human', 'indigenous people', 'people'] notable work ['technology', 'software', 'telecommunications device', 'industry', 'technical standard', 'technical specification', 'communication protocol', 'medical device', 'technical system', 'equipment', 'military technology', 'physical tool', 'artificial intelligence', 'computer', 'computer hardware']
['human', 'fictional human', 'fictional person'] has written for ['communication protocol', 'technical standard']
['technology', 'software', 'telecommunications device', 'technical standard', 'fictional technology', 'technical specification', 'communi

In [23]:
# Next sample sentence; rebinding `text` replaces the previous example's input.
text = "Musk’s xAI releases artificial intelligence model Grok 3, claims better performance than rivals in early testing."

In [24]:
# Run the extractor's first-query completion on the current sample `text`.
extracted_triplets = extractor.get_completion_first_query(text)

# Bare last expression: the notebook renders the 'triplets' entry as the cell output.
extracted_triplets['triplets']

[{'subject': 'xAI',
  'relation': 'developed',
  'object_': 'Grok 3',
  'qualifiers': [],
  'subject_type': 'organization',
  'object_type': 'artificial intelligence model'},
 {'subject': 'Grok 3',
  'relation': 'claimed performance',
  'object_': 'better than rivals',
  'qualifiers': [{'relation': 'point in time', 'object_': 'early testing'}],
  'subject_type': 'artificial intelligence model',
  'object_type': 'performance'}]

In [25]:
# For every extracted triplet: look up similar entity types, retrieve the matching
# properties, and print them with type ids resolved to labels.
for triplet in extracted_triplets['triplets']:
    # NOTE(review): unpacking assumes the helper returns object-type ids first,
    # then subject-type ids — confirm against Aligner.retrieve_similar_entity_types.
    obj_type_ids, subj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
    properties = aligner.retrieve_properties_for_entity_type(
        target_relation=triplet['relation'],
        object_types=obj_type_ids,
        subject_types=subj_type_ids,
    )

    # Header lines: the raw triplet, then the entity types attached to it.
    print(triplet['subject'], triplet['relation'], triplet['object_'])
    print(triplet['subject_type'], ",", triplet['object_type'])

    # Compact (label, type) overview of everything that was retrieved.
    print([(candidate['label'], candidate['type']) for candidate in properties])

    # One line per property: subject type labels, property label, object type labels.
    for p in properties:
        subject_labels = [aligner.entity2label[type_id] for type_id in p['subject_types']]
        object_labels = [aligner.entity2label[type_id] for type_id in p['object_types']]
        print(subject_labels, p['label'], object_labels)

    # Visual separator between triplets.
    print('-' * 100)

xAI developed Grok 3
organization , artificial intelligence model
[('developer', 'direct'), ('commissioned by', 'direct'), ('designed by', 'inverse'), ('designed by', 'direct'), ('founded by', 'inverse'), ('founded by', 'direct'), ('manufacturer', 'direct'), ('published in', 'inverse'), ('published in', 'direct'), ('owned by', 'inverse')]
['artificial geographic entity', 'mathematical model', 'artificial intelligence', 'subscription business model', 'product model', 'electronic device model', 'lens model', 'fictional artificial intelligence', 'fictional artificial entity', 'virtual reality headset model', 'artificial satellite', 'artificial physical object', 'business model', 'scientific model', 'integrated circuit model', 'artificial object', 'model', 'artificially intelligent entity', 'computer model'] developer ['educational organization', 'political organization', 'Catholic organization', 'organizational subdivision', 'religious organization', 'organization', 'government organizati

In [26]:
# Next sample passage (several sentences); rebinding `text` replaces the previous example's input.
text = 'In addition to directing “The Phoenician Scheme,” Anderson penned the script with the story co-written by Roman Coppola. Focus will distribute the film domestically. “The Phoenician Scheme” will open in limited release on May 30, 2025 and expand wide on June 6. Universal Pictures, Focus’s parent studio, is handling international distribution.'

In [27]:
# Run the extractor's first-query completion on the current sample `text`.
extracted_triplets = extractor.get_completion_first_query(text)

# Bare last expression: the notebook renders the 'triplets' entry as the cell output.
extracted_triplets['triplets']

[{'subject': 'Anderson',
  'relation': 'director of',
  'object_': 'The Phoenician Scheme',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'film'},
 {'subject': 'Anderson',
  'relation': 'screenwriter of',
  'object_': 'The Phoenician Scheme',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'film'},
 {'subject': 'Roman Coppola',
  'relation': 'co-writer of',
  'object_': 'The Phoenician Scheme',
  'qualifiers': [],
  'subject_type': 'person',
  'object_type': 'film'},
 {'subject': 'Focus',
  'relation': 'distributor of',
  'object_': 'The Phoenician Scheme',
  'qualifiers': [{'relation': 'distribution scope', 'object_': 'domestic'}],
  'subject_type': 'company',
  'object_type': 'film'},
 {'subject': 'The Phoenician Scheme',
  'relation': 'limited release date',
  'object_': 'May 30, 2025',
  'qualifiers': [],
  'subject_type': 'film',
  'object_type': 'date'},
 {'subject': 'The Phoenician Scheme',
  'relation': 'wide release date',
  'object_': 'June

In [28]:
# For every extracted triplet: look up similar entity types, retrieve the matching
# properties, and print them with type ids resolved to labels.
for triplet in extracted_triplets['triplets']:
    # NOTE(review): unpacking assumes the helper returns object-type ids first,
    # then subject-type ids — confirm against Aligner.retrieve_similar_entity_types.
    obj_type_ids, subj_type_ids = aligner.retrieve_similar_entity_types(triplet=triplet)
    properties = aligner.retrieve_properties_for_entity_type(
        target_relation=triplet['relation'],
        object_types=obj_type_ids,
        subject_types=subj_type_ids,
    )

    # Header lines: the raw triplet, then the entity types attached to it.
    print(triplet['subject'], triplet['relation'], triplet['object_'])
    print(triplet['subject_type'], ",", triplet['object_type'])

    # Compact (label, type) overview of everything that was retrieved.
    print([(candidate['label'], candidate['type']) for candidate in properties])

    # One line per property: subject type labels, property label, object type labels.
    for p in properties:
        subject_labels = [aligner.entity2label[type_id] for type_id in p['subject_types']]
        object_labels = [aligner.entity2label[type_id] for type_id in p['object_types']]
        print(subject_labels, p['label'], object_labels)

    # Visual separator between triplets.
    print('-' * 100)

Anderson director of The Phoenician Scheme
person , film
[('director', 'direct'), ('director of publication', 'direct'), ('director of photography', 'direct'), ('editor', 'direct'), ('appointed by', 'direct'), ('appointed by', 'inverse'), ('producer', 'direct'), ('producer', 'inverse'), ('author', 'inverse'), ('author', 'direct')]
['dubbing of film', 'film series', 'film', 'première', 'fictional film', 'documentary film', 'television film', 'animated film'] director ['human']
['television film'] director of publication ['human']
['film series', 'film', 'fictional film', 'documentary film', 'television film', 'animated film'] director of photography ['human', 'fictional human']
['filmography', 'drama', 'audio release', 'dubbing of film', 'film series', 'film', 'documentary film', 'film poster', 'television film', 'animated film', 'film set'] editor ['human']
['film character'] appointed by ['human', 'fictional human', 'fictional person']
['human', 'fictional human', 'fictional person'] 