In [1]:
from datasets import load_dataset
from tqdm import tqdm_notebook
import datasets
from tqdm import tqdm

import json
import openai
import os

from utils.eval_utils import micro_precision, micro_recall, f1_score
from utils.openai_utils import LLMTripletExtractor
from utils.dynamic_index_utils import Aligner
from utils.verifier_utils import TripletFilter

import pandas as pd
import networkx as nx
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load the HotpotQA "fullwiki" configuration from the Hugging Face hub.
# The first call downloads and caches the dataset; subsequent calls reuse the cache.
ds = load_dataset("hotpotqa/hotpot_qa", "fullwiki")

Downloading builder script: 100%|██████████| 6.42k/6.42k [00:00<00:00, 21.1MB/s]
Downloading readme: 100%|██████████| 9.19k/9.19k [00:00<00:00, 22.4MB/s]


Downloading and preparing dataset hotpot_qa/fullwiki to /home/jovyan/.cache/huggingface/datasets/hotpotqa___hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5...


Downloading data: 100%|██████████| 566M/566M [00:24<00:00, 22.8MB/s]
Downloading data: 100%|██████████| 47.5M/47.5M [00:03<00:00, 15.7MB/s]
Downloading data: 100%|██████████| 46.2M/46.2M [00:02<00:00, 16.7MB/s]
Downloading data files: 100%|██████████| 3/3 [00:32<00:00, 10.94s/it]
                                                                                         

Dataset hotpot_qa downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/hotpotqa___hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 27.87it/s]


In [9]:
# Peek at one training record to see the schema: id, question, answer, type, level,
# supporting_facts and context (parallel lists of titles and per-title sentences).
ds['train'][12]

{'id': '5a7722d655429966f1a36c99',
 'question': 'Where did the form of music played by Die Rhöner Säuwäntzt originate?',
 'answer': 'United States',
 'type': 'bridge',
 'level': 'medium',
 'supporting_facts': {'title': ['Die Rhöner Säuwäntzt', 'Skiffle'],
  'sent_id': [0, 1]},
 'context': {'title': ['Pantun Sunda',
   'Bubu music',
   'Peşrev',
   'Skiffle',
   'Cachi Cachi music',
   'Background music',
   'Kantrum',
   'Chillador',
   'Die Rhöner Säuwäntzt',
   'Minoru Kojima'],
  'sentences': [['Pantun Sunda is a type of Sundanese oral narrative performance interspersed with songs and music played on a "kacapi", a kind of zither.',
    ' A pantun is intended to be recited during an evening-length performance during which a single performer relates the story of a hero’s initiation: The protagonist leaves his kingdom in order to seek experiences, beautiful princesses to become his wife, power, other kingdoms to subject, the realization of a dream (Rosidi 1984a:143); after having succe

In [11]:
# Pipeline configuration and the shared helper objects used by the cells below.
# NOTE(review): the device string is hard-coded; adjust it for machines without a 'cuda:1' device.
device = 'cuda:1'
model_name = 'gpt-4o'
# `aligner` holds the entity/relation index that extract_kg_from_texts reads and extends.
aligner = Aligner(device=device)

# TripletFilter is imported but currently disabled; no visible cell uses it.
# triplet_filter = TripletFilter()
# `extractor` wraps the LLM calls (triplet extraction, descriptions, alignment prompts).
extractor = LLMTripletExtractor(model=model_name,  prompt2_individual_triplets_path='utils/prompts/prompt2_individual_triplets_dynamic.txt')

In [12]:
def extract_kg_from_texts(texts):
    """Extract a de-duplicated table of knowledge-graph triplets from passages.

    For every passage the module-level ``extractor`` proposes triplets (first
    step). Each triplet is then aligned against the names already stored in the
    module-level ``aligner`` (second step): the extractor is asked to pick an
    existing entity/relation name from the aligner's candidates, and any slot it
    answers with ``None``/``'None'`` keeps the original name, which is also
    registered in the aligner. While the aligner index is still empty the
    triplet is stored as-is and its names seed the index.

    Args:
        texts: Sized iterable of passage strings (``len(texts)`` is used for
            the progress bar).

    Returns:
        pandas.DataFrame with one row per distinct aligned triplet, in
        extraction order. Row labels are the positions in the pre-deduplication
        list (the index is not reset). When nothing could be extracted, an
        empty frame with ``subject``/``relation``/``object`` columns is
        returned instead of raising.

    Notes:
        * Mutates the global ``aligner`` (adds entities and relations).
        * Failures on a single triplet are printed and skipped (best effort).
    """
    triplet_columns = ['subject', 'relation', 'object']
    # Values with which the second-step prompt signals "no suitable existing name".
    missing_markers = ('None', None)
    second_step_triplets = []

    for text in tqdm(texts, total=len(texts)):

        print("Text: ", text)

        ############## first step prompting ##############
        extracted_triplets = extractor.get_completion_first_query(text)

        ############## second step aligning all entity and relation names ##############
        print("Extracted triplets after 1st step prompting: ", extracted_triplets)

        for triplet in extracted_triplets:

            try:
                subject_description = triplet['subject'] + "; " + extractor.generate_description_for_entity(
                    text=text, triplet=triplet, entity=triplet['subject'])[triplet['subject']]

                object_description = triplet['object'] + "; " + extractor.generate_description_for_entity(
                    text=text, triplet=triplet, entity=triplet['object'])[triplet['object']]

                relation_description = triplet['relation'] + "; " + extractor.generate_description_for_relation(
                    text=text, triplet=triplet, relation=triplet['relation'])[triplet['relation']]

                if len(aligner.id2entity) > 0 and len(aligner.id2relation) > 0:

                    # Candidates retrieved via the generated descriptions ...
                    similar_relations_with_descriptions = aligner.top_relations_by_llm_output(
                        relations=[relation_description], with_descriptions=True)
                    similar_entities_with_descriptions = aligner.top_entities_by_llm_output(
                        entities=[subject_description, object_description], with_descriptions=True)

                    # ... and via the bare names; both candidate sets are merged per key.
                    similar_relations = aligner.top_relations_by_llm_output(
                        relations=[triplet['relation']], with_descriptions=False)
                    similar_entities = aligner.top_entities_by_llm_output(
                        entities=[triplet['subject'], triplet['object']], with_descriptions=False)

                    for key in similar_relations:
                        similar_relations[key] = list(set(similar_relations[key] + similar_relations_with_descriptions[key]))

                    for key in similar_entities:
                        similar_entities[key] = list(set(similar_entities[key] + similar_entities_with_descriptions[key]))

                    output = extractor.get_completion_second_query_by_single_triplet(
                        similar_entities=similar_entities,
                        similar_relations=similar_relations, text=text, triplet=triplet)

                    print("OUTPUT: ", output)
                    # A slot left unaligned keeps its original name, which becomes a new index entry.
                    if output['subject'] in missing_markers:
                        aligner.add_entities([triplet['subject']], [subject_description])
                        output['subject'] = triplet['subject']

                    if output['object'] in missing_markers:
                        aligner.add_entities([triplet['object']], [object_description])
                        output['object'] = triplet['object']

                    if output['relation'] in missing_markers:
                        aligner.add_relations([triplet['relation']], [relation_description])
                        output['relation'] = triplet['relation']

                    second_step_triplets.append(output.copy())

                else:
                    # Empty index: nothing to align against yet, so seed it with this triplet.
                    aligner.add_entities([triplet['subject'], triplet['object']], [subject_description, object_description])
                    aligner.add_relations([triplet['relation']], [relation_description])

                    second_step_triplets.append(triplet)

                print("Extracted triplets after 2nd step prompting: ", second_step_triplets)

            except Exception as e:
                # Best effort: one malformed LLM answer must not abort the whole run.
                print(str(e))

    # Build the frame once from the accumulated list. The previous version re-appended
    # the same list for every triplet (quadratic work) and left `df` unbound when
    # nothing was extracted.
    if not second_step_triplets:
        return pd.DataFrame(columns=triplet_columns)

    return pd.DataFrame(second_step_triplets).drop_duplicates()


In [13]:
# Inspect the validation example used for the rest of the notebook
# (question, answer, supporting facts and the context paragraphs).
ds['validation'][4]

{'id': '5a8e3ea95542995a26add48d',
 'question': 'The director of the romantic comedy "Big Stone Gap" is based in what New York city?',
 'answer': 'Greenwich Village, New York City',
 'type': 'bridge',
 'level': 'hard',
 'supporting_facts': {'title': ['Big Stone Gap (film)', 'Adriana Trigiani'],
  'sent_id': [0, 0]},
 'context': {'title': ['Great Eastern Conventions',
   'Big Stone Gap (film)',
   'I Love NY (2015 film)',
   'Just Another Romantic Wrestling Comedy',
   "Hamish and Andy's Gap Year",
   'Sex and the City (film)',
   'Nola (film)',
   'Kingston Morning',
   'Clinton, Minnesota',
   'New York Society of Model Engineers'],
  'sentences': [['Great Eastern Conventions, Inc. was an entertainment company which produced comic book conventions, most actively during the years 1987-1996.',
    " In New York City, the Great Eastern shows filled the gap between the mid-1980s demise of the annual Comic Art Convention and Creation Conventions, and the establishment of promoter Michael C

In [14]:
# Each context entry is a list of sentences; join them so there is one passage string per title.
texts = [" ".join(text) for text in ds['validation'][4]['context']['sentences']]
texts

["Great Eastern Conventions, Inc. was an entertainment company which produced comic book conventions, most actively during the years 1987-1996.  In New York City, the Great Eastern shows filled the gap between the mid-1980s demise of the annual Comic Art Convention and Creation Conventions, and the establishment of promoter Michael Carbonaro's annual Big Apple Comic Con in 1996.  From 1993–1995, Great Eastern hosted two New York City shows annually at the Jacob K. Javits Convention Center.  Great Eastern also ran shows in New Jersey, Pennsylvania, Massachusetts, Oregon, Minnesota, and Texas.",
 "Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society.  Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s.  The film had its world premiere at the Virginia Film Festival on N

In [15]:
# Build the triplet table for the selected example. This issues several LLM calls per
# triplet and is slow: the recorded run took about 18 minutes for 10 passages.
df = extract_kg_from_texts(texts)

  0%|          | 0/10 [00:00<?, ?it/s]

Text:  Great Eastern Conventions, Inc. was an entertainment company which produced comic book conventions, most actively during the years 1987-1996.  In New York City, the Great Eastern shows filled the gap between the mid-1980s demise of the annual Comic Art Convention and Creation Conventions, and the establishment of promoter Michael Carbonaro's annual Big Apple Comic Con in 1996.  From 1993–1995, Great Eastern hosted two New York City shows annually at the Jacob K. Javits Convention Center.  Great Eastern also ran shows in New Jersey, Pennsylvania, Massachusetts, Oregon, Minnesota, and Texas.
Extracted triplets after 1st step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'produced', 'object': 'Comic book conventions'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conve

 10%|█         | 1/10 [01:53<16:57, 113.11s/it]

OUTPUT:  {'subject': 'Big Apple Comic Con', 'relation': 'established', 'object': 'None'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Oregon'}, {'subject': 'Gre

 20%|██        | 2/10 [03:41<14:42, 110.30s/it]

OUTPUT:  {'subject': 'Big Stone Gap', 'relation': 'publication date', 'object': '2014'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Oregon'}, {'subject': 'Grea

 30%|███       | 3/10 [05:37<13:11, 113.05s/it]

OUTPUT:  {'subject': 'The Irony of Fate', 'relation': 'publication date', 'object': 'None'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Oregon'}, {'subject': '

 40%|████      | 4/10 [09:03<14:58, 149.76s/it]

OUTPUT:  {'subject': 'New York City', 'relation': 'located in the administrative territorial entity', 'object': 'United States of America'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'locati

 50%|█████     | 5/10 [10:44<11:00, 132.20s/it]

OUTPUT:  {'subject': "Hamish & Andy's Gap Year", 'relation': 'location of event', 'object': 'Bangkok'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Oregon'}, {'

 60%|██████    | 6/10 [13:19<09:18, 139.69s/it]

OUTPUT:  {'subject': 'Sex and the City', 'relation': 'None', 'object': 'None'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Oregon'}, {'subject': 'Great Eastern

 70%|███████   | 7/10 [14:17<05:39, 113.25s/it]

OUTPUT:  {'subject': 'Nola', 'relation': 'instance of', 'object': 'Romantic comedy'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Oregon'}, {'subject': 'Great E

 80%|████████  | 8/10 [15:13<03:09, 94.90s/it] 

OUTPUT:  {'subject': 'Itsbynne Reel', 'relation': 'award received', 'object': '53rd Grammy Awards'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Oregon'}, {'sub

 90%|█████████ | 9/10 [16:04<01:21, 81.26s/it]

OUTPUT:  {'subject': 'DeWitt Clinton', 'relation': 'None', 'object': 'None'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Oregon'}, {'subject': 'Great Eastern C

100%|██████████| 10/10 [18:40<00:00, 112.02s/it]

OUTPUT:  {'subject': 'New York Society of Model Engineers', 'relation': 'location of event', 'object': 'New York City'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'instance of', 'object': 'Entertainment company'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'active years', 'object': '1987-1996'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New York City'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'New Jersey'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Pennsylvania'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'object': 'Massachusetts'}, {'subject': 'Great Eastern Conventions, Inc.', 'relation': 'location of event', 'objec




In [16]:
# Display the extracted triplets. extract_kg_from_texts already drops duplicate rows
# before returning, so this call does not change the contents; it only shows the frame.
df.drop_duplicates()

Unnamed: 0,subject,relation,object
0,"Great Eastern Conventions, Inc.",instance of,Entertainment company
2,"Great Eastern Conventions, Inc.",active years,1987-1996
3,"Great Eastern Conventions, Inc.",location of event,New York City
4,"Great Eastern Conventions, Inc.",location of event,New Jersey
5,"Great Eastern Conventions, Inc.",location of event,Pennsylvania
...,...,...,...
123,The Union Connecting,based on,Lackawanna Railroad
124,Lackawanna Railroad,connects with,Hoboken Terminal
125,Lackawanna Railroad,connects with,Pennsylvania
126,New York Society of Model Engineers,has part,Model motor boat races


In [17]:
# Build an undirected graph with one node per entity and one edge per triplet,
# storing the relation name as the edge's `label` attribute.
# NOTE: nx.Graph keeps a single edge per node pair, so when two triplets connect the
# same entities only the label of the last one added is retained.
G = nx.Graph()
for subject, relation, obj in zip(df['subject'], df['relation'], df['object']):
    G.add_edge(subject, obj, label=relation)

# A fixed seed makes the force-directed layout (and therefore the plot) reproducible.
pos = nx.fruchterman_reingold_layout(G, k=0.5, seed=42)

In [21]:
# Render the knowledge graph `G` with the node positions in `pos` as an interactive
# Plotly figure: gray edges, labelled nodes, and relation names at edge midpoints.
edges = list(G.edges(data='label'))
nodes = list(G.nodes())

# Create edge trace: all edges share ONE line trace; the None entries break the line
# between consecutive edges, which avoids creating a separate trace per edge.
edge_x, edge_y = [], []
for source, target, _ in edges:
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_traces = [
    go.Scatter(
        x=edge_x,
        y=edge_y,
        mode="lines",
        line=dict(width=0.5, color="gray"),
        hoverinfo="none"
    )
]

# Create node trace
node_trace = go.Scatter(
    x=[pos[node][0] for node in nodes],
    y=[pos[node][1] for node in nodes],
    mode='markers+text',
    marker=dict(size=10, color='lightblue'),
    text=nodes,
    textposition='top center',
    hoverinfo='text',
    textfont=dict(size=7)
)

# Create edge label trace (relation name placed at the midpoint of each edge)
edge_label_trace = go.Scatter(
    x=[(pos[source][0] + pos[target][0]) / 2 for source, target, _ in edges],
    y=[(pos[source][1] + pos[target][1]) / 2 for source, target, _ in edges],
    mode='text',
    text=[label for _, _, label in edges],
    textposition='middle center',
    hoverinfo='none',
    textfont=dict(size=7)
)

# Create layout. The title font is set through `title.font`; the older
# `titlefont` attribute is deprecated in Plotly.
layout = go.Layout(
    title=dict(text="Knowledge Graph", x=0.5, font=dict(size=16)),
    showlegend=False,
    hovermode="closest",
    margin=dict(b=20, l=5, r=5, t=40),
    xaxis_visible=False,
    yaxis_visible=False
)

# Create Plotly figure
fig = go.Figure(data=edge_traces + [node_trace, edge_label_trace], layout=layout)

# Show the interactive plot (notebook rendering requires nbformat>=4.2.0 to be installed)
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [22]:
! pip install nbformat

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
# The question that the extracted knowledge graph should help answer.
ds['validation'][4]['question']

'The director of the romantic comedy "Big Stone Gap" is based in what New York city?'

In [14]:
# Decompose the question into triplets with the LLM extractor. In the recorded output,
# slots the question does not name are returned as the '<entity>' placeholder.
question_triplets = extractor.extract_triplets_from_question(question=ds['validation'][4]['question'])
question_triplets

[{'subject': 'Big Stone Gap',
  'relation': 'genre',
  'object': 'romantic comedy'},
 {'subject': '<entity>', 'relation': 'directed', 'object': 'Big Stone Gap'},
 {'subject': '<entity>', 'relation': 'based in', 'object': '<entity>'},
 {'subject': '<entity>', 'relation': 'located in', 'object': 'New York'}]

In [None]:
from itertools import product
def generate_triplet_hypothesis(triplet, placeholder_entities=['<entity>'], placeholder_relations=['<relation>']):
    """Expand a question triplet into candidate graph triplets.

    Every concrete slot of ``triplet`` is replaced by the names the module-level
    ``aligner`` returns for it; a slot holding a placeholder is kept as the
    placeholder list. The result is the Cartesian product of the candidates.

    Args:
        triplet: Mapping with ``'subject'``, ``'relation'`` and ``'object'`` keys.
        placeholder_entities: Values marking an unknown subject/object. The
            default list is only read, never mutated.
        placeholder_relations: Values marking an unknown relation. The default
            list is only read, never mutated.

    Returns:
        list of ``(subject, relation, object)`` tuples, ordered subject-major,
        then relation, then object.
    """
    # Membership in the placeholder lists (instead of a hard-coded '<entity>' /
    # '<relation>' comparison) makes the `placeholder_*` parameters take effect.
    if triplet['subject'] in placeholder_entities:
        refined_subjects = placeholder_entities
    else:
        refined_subjects = aligner.top_entities_by_llm_output(
            entities=[triplet['subject']], with_descriptions=False)[triplet['subject']]

    if triplet['object'] in placeholder_entities:
        refined_objects = placeholder_entities
    else:
        refined_objects = aligner.top_entities_by_llm_output(
            entities=[triplet['object']], with_descriptions=False)[triplet['object']]

    if triplet['relation'] in placeholder_relations:
        refined_relations = placeholder_relations
    else:
        refined_relations = aligner.top_relations_by_llm_output(
            relations=[triplet['relation']], with_descriptions=False)[triplet['relation']]

    return list(product(refined_subjects, refined_relations, refined_objects))

In [68]:
# Expand the second question triplet ('<entity>', 'directed', 'Big Stone Gap') into
# candidate graph triplets using the names already stored in the aligner.
generate_triplet_hypothesis(question_triplets[1])

[('<entity>', 'starring', 'Big Stone Gap'),
 ('<entity>', 'starring', 'Big Stone County'),
 ('<entity>', 'starring', 'Delaware Water Gap model'),
 ('<entity>', 'starring', 'Big Apple Comic Con'),
 ('<entity>', 'starring', "Hamish & Andy's Gap Year"),
 ('<entity>', 'screenwriter', 'Big Stone Gap'),
 ('<entity>', 'screenwriter', 'Big Stone County'),
 ('<entity>', 'screenwriter', 'Delaware Water Gap model'),
 ('<entity>', 'screenwriter', 'Big Apple Comic Con'),
 ('<entity>', 'screenwriter', "Hamish & Andy's Gap Year"),
 ('<entity>', 'director', 'Big Stone Gap'),
 ('<entity>', 'director', 'Big Stone County'),
 ('<entity>', 'director', 'Delaware Water Gap model'),
 ('<entity>', 'director', 'Big Apple Comic Con'),
 ('<entity>', 'director', "Hamish & Andy's Gap Year"),
 ('<entity>', 'producer', 'Big Stone Gap'),
 ('<entity>', 'producer', 'Big Stone County'),
 ('<entity>', 'producer', 'Delaware Water Gap model'),
 ('<entity>', 'producer', 'Big Apple Comic Con'),
 ('<entity>', 'producer', "Hami

In [None]:
# Partial-match lookup: for hypotheses whose subject is unknown but whose object is
# concrete, collect graph rows that mention that entity (as subject or object) with
# the hypothesised relation.
# NOTE(review): `hypothesis` and `graph_df` are not defined in any visible cell.
# Presumably hypothesis = generate_triplet_hypothesis(...) and graph_df = df;
# confirm before running on a fresh kernel.
partial_matches = []
for triplet in hypothesis:
    if triplet[0] == "<entity>" and triplet[2] != "<entity>":

        obj_cond = (graph_df['object'] == triplet[2])
        subj_cond = (graph_df['subject'] == triplet[2])

        graph_triplet = graph_df[(obj_cond | subj_cond) & (graph_df['relation'] == triplet[1])].to_dict(orient='records')
        # Accumulate the rows; previously each iteration overwrote the last result.
        partial_matches.extend(graph_triplet)

In [63]:
def filter_hypothesis(hypothesis_list, graph_df):
    """Keep the graph rows that confirm at least one hypothesis.

    A hypothesis ``(a, relation, b)`` is confirmed by a row of ``graph_df`` whose
    ``relation`` matches and whose ``subject``/``object`` pair equals ``(a, b)``
    in either direction.

    Args:
        hypothesis_list: Iterable of indexable triplets ``(a, relation, b)``.
        graph_df: DataFrame with ``subject``, ``relation`` and ``object`` columns.

    Returns:
        list of row dicts (``to_dict(orient='records')``), grouped by hypothesis
        in input order; a row is repeated if several hypotheses match it.
    """
    confirmed_rows = []

    for candidate in hypothesis_list:
        head, relation, tail = candidate[0], candidate[1], candidate[2]

        forward = (graph_df['subject'] == head) & (graph_df['object'] == tail)
        backward = (graph_df['subject'] == tail) & (graph_df['object'] == head)
        same_relation = graph_df['relation'] == relation

        hits = graph_df[(forward | backward) & same_relation]
        # Extending with an empty record list is a no-op, so no length guard is needed.
        confirmed_rows += hits.to_dict(orient='records')

    return confirmed_rows

In [15]:
# Ask the extractor to describe the subject of the first question triplet
# ('Big Stone Gap'), passing only the question text as context.
extractor.generate_description_for_entity_from_question(text=ds['validation'][4]['question'], triplet=question_triplets[0], entity=question_triplets[0]['subject'])

{'Big Stone Gap': 'Big Stone Gap is a romantic comedy film directed by Adriana Trigiani, based on her novel of the same name. The film is set in a small town in Virginia and follows the life of a local woman whose life changes after discovering a family secret.'}