In [1]:
from tqdm import tqdm_notebook
from tqdm import tqdm

import json

from utils.openai_utils import LLMTripletExtractor
from utils.dynamic_index_utils import Aligner

import pandas as pd
import networkx as nx
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the question records from hotpotqa200.json.
# The encoding is stated explicitly: without it open() falls back to the
# platform default, which can mis-decode the non-ASCII characters present
# in the contexts (e.g. 'Barât').
with open("hotpotqa200.json", "r", encoding="utf-8") as f:
    ds = json.load(f)

In [3]:
# Inspect the record at index 1 of the loaded dataset.
ds[1]

{'_id': '5adf2fa35542993344016c11',
 'answer': 'Jonny" Craig',
 'question': 'Which of Jonny Craig and Pete Doherty has been a member of more bands ?',
 'supporting_facts': [['Jonny Craig', 0],
  ['Jonny Craig', 2],
  ['Pete Doherty', 1],
  ['Pete Doherty', 2]],
 'context': [['Pete Doherty',
   ['Peter Doherty (born 12 March 1979) is an English musician, songwriter, actor, poet, writer, and artist.',
    ' He is best known for being co-frontman of the Libertines, which he formed with Carl Barât in 1997.',
    ' His other musical projects are indie band Babyshambles and Peter Doherty and the Puta Madres.']],
  ['Relativity (Emarosa album)',
   ['Relativity is the debut album by American post-hardcore band Emarosa released on July 8, 2008 through Rise Records.',
    ' "Relativity" was produced by Kris Crummett, producer of other bands such as Drop Dead, Gorgeous and Fear Before, whom Jonny Craig worked with on Dance Gavin Dance\'s debut album the year before.']],
  ['Jonny Craig',
   ['Jo

In [4]:
# Device string handed to the Aligner.
# NOTE(review): 'cuda:1' presumably selects the second GPU — confirm it exists on the host.
device = 'cuda:1'
# Model identifier handed to the triplet extractor.
model_name = 'gpt-4o'
# Store of known entity/relation names that extracted triplets are matched against.
aligner = Aligner(device=device)

# Disabled: TripletFilter is not among the imports visible in the first cell,
# so this line would fail if re-enabled as-is.
# triplet_filter = TripletFilter()
# LLM wrapper used for every prompting step; the keyword below points it at a
# prompt template file (presumably the second-step alignment prompt — confirm
# in utils.openai_utils).
extractor = LLMTripletExtractor(model=model_name,  prompt2_individual_triplets_path='utils/prompts/prompt2_individual_triplets_dynamic.txt')

In [5]:
def _is_unmatched(value):
    """Return True when a second-step slot was left unresolved.

    Both ``None`` and the literal string ``'None'`` are treated as
    "no existing name fits".
    """
    return value is None or value == 'None'


def _align_triplet(text, triplet):
    """Align one first-step triplet with the names already held by ``aligner``.

    Requests a description for the subject, object and relation from
    ``extractor``. If ``aligner`` holds no entities or no relations yet, the
    triplet's names are registered and the triplet is returned unchanged.
    Otherwise similar names are looked up (by bare name and by
    "name; description"), the extractor is asked to pick among them, and every
    slot it leaves unresolved is registered as a new name and filled with the
    original value.

    Args:
        text: The passage the triplet was extracted from.
        triplet: Dict with 'subject', 'relation' and 'object' keys.

    Returns:
        The triplet dict to record for the graph.

    Raises:
        Whatever the extractor/aligner calls raise, and ``KeyError`` when a
        response lacks an expected key; the caller decides how to handle it.
    """
    subject, relation, obj = triplet['subject'], triplet['relation'], triplet['object']

    subject_description = subject + "; " + \
        extractor.generate_description_for_entity(text=text, triplet=triplet, entity=subject)[subject]
    object_description = obj + "; " + \
        extractor.generate_description_for_entity(text=text, triplet=triplet, entity=obj)[obj]
    relation_description = relation + "; " + \
        extractor.generate_description_for_relation(text=text, triplet=triplet, relation=relation)[relation]

    # Nothing to align against yet: seed the aligner with this triplet as-is.
    if len(aligner.id2entity) == 0 or len(aligner.id2relation) == 0:
        aligner.add_entities([subject, obj], [subject_description, object_description])
        aligner.add_relations([relation], [relation_description])
        return triplet

    similar_relations_with_descriptions = aligner.top_relations_by_llm_output(
        relations=[relation_description], with_descriptions=True)
    similar_entities_with_descriptions = aligner.top_entities_by_llm_output(
        entities=[subject_description, object_description], with_descriptions=True)

    similar_relations = aligner.top_relations_by_llm_output(relations=[relation], with_descriptions=False)
    similar_entities = aligner.top_entities_by_llm_output(entities=[subject, obj], with_descriptions=False)

    # Merge the candidates found by bare name with those found by description.
    for key in similar_relations:
        similar_relations[key] = list(set(similar_relations[key] + similar_relations_with_descriptions[key]))
    for key in similar_entities:
        similar_entities[key] = list(set(similar_entities[key] + similar_entities_with_descriptions[key]))

    output = extractor.get_completion_second_query_by_single_triplet(
        similar_entities=similar_entities, similar_relations=similar_relations, text=text, triplet=triplet)

    print("OUTPUT: ", output)

    # Any slot without a suitable existing name becomes a new name in the aligner.
    if _is_unmatched(output['subject']):
        aligner.add_entities([subject], [subject_description])
        output['subject'] = subject

    if _is_unmatched(output['object']):
        aligner.add_entities([obj], [object_description])
        output['object'] = obj

    if _is_unmatched(output['relation']):
        aligner.add_relations([relation], [relation_description])
        output['relation'] = relation

    return output.copy()


def extract_kg_from_texts(texts):
    """Extract knowledge-graph triplets from ``texts`` with two-step prompting.

    For each text the extractor proposes triplets (first step); each triplet
    is then aligned against the names already stored in the module-level
    ``aligner`` (second step). A triplet whose alignment raises is reported
    and skipped so one bad LLM response does not abort the run.

    Side effects: prints progress, calls the LLM through ``extractor`` and
    adds names to ``aligner``.

    Args:
        texts: Sized sequence of passages (``len(texts)`` is used for the
            progress bar).

    Returns:
        A de-duplicated ``pandas.DataFrame`` with one row per triplet. When no
        triplet was produced, an empty frame that still has the 'subject',
        'relation' and 'object' columns, so column lookups keep working.
    """
    second_step_triplets = []

    for text in tqdm(texts, total=len(texts)):

        print("Text: ", text)

        ############## first step prompting ##############
        extracted_triplets = extractor.get_completion_first_query(text)

        ############## second step aligning all entity and relation names ##############
        print("Extracted triplets after 1st step prompting: ", extracted_triplets)

        for triplet in extracted_triplets:
            try:
                second_step_triplets.append(_align_triplet(text, triplet))
            except Exception as e:
                # Best effort: say which triplet failed and why, then carry on.
                print(f"Skipping triplet {triplet!r}: {type(e).__name__}: {e}")

        print("Extracted triplets after 2nd step prompting: ", second_step_triplets)

    # Build the frame from the flat list once. Appending the growing list once
    # per triplet (as before) yields N copies of every row before de-duplication.
    if not second_step_triplets:
        return pd.DataFrame(columns=['subject', 'relation', 'object'])

    return pd.DataFrame(second_step_triplets).drop_duplicates()

In [16]:
# Show the 'context' field of record 1 (a list of [title, sentences] entries).
ds[1]['context']

[['Pete Doherty',
  ['Peter Doherty (born 12 March 1979) is an English musician, songwriter, actor, poet, writer, and artist.',
   ' He is best known for being co-frontman of the Libertines, which he formed with Carl Barât in 1997.',
   ' His other musical projects are indie band Babyshambles and Peter Doherty and the Puta Madres.']],
 ['Relativity (Emarosa album)',
  ['Relativity is the debut album by American post-hardcore band Emarosa released on July 8, 2008 through Rise Records.',
   ' "Relativity" was produced by Kris Crummett, producer of other bands such as Drop Dead, Gorgeous and Fear Before, whom Jonny Craig worked with on Dance Gavin Dance\'s debut album the year before.']],
 ['Jonny Craig',
  ['Jonathan Monroe "Jonny" Craig (born March 26, 1986) is a Canadian-American singer and songwriter.',
   ' He is currently working as a solo musician.',
   ' He has been the lead vocalist for the bands Dance Gavin Dance, Emarosa, Ghost Runner on Third, Slaves, and westerHALTS.',
   ' A

In [14]:
# One string per context entry of record 1: the entry's sentence list is
# concatenated without a separator (the title is not used).
texts = ["".join(sentences) for _title, sentences in ds[1]['context']]
texts

['Peter Doherty (born 12 March 1979) is an English musician, songwriter, actor, poet, writer, and artist. He is best known for being co-frontman of the Libertines, which he formed with Carl Barât in 1997. His other musical projects are indie band Babyshambles and Peter Doherty and the Puta Madres.',
 'Relativity is the debut album by American post-hardcore band Emarosa released on July 8, 2008 through Rise Records. "Relativity" was produced by Kris Crummett, producer of other bands such as Drop Dead, Gorgeous and Fear Before, whom Jonny Craig worked with on Dance Gavin Dance\'s debut album the year before.',
 'Jonathan Monroe "Jonny" Craig (born March 26, 1986) is a Canadian-American singer and songwriter. He is currently working as a solo musician. He has been the lead vocalist for the bands Dance Gavin Dance, Emarosa, Ghost Runner on Third, Slaves, and westerHALTS. As a solo artist, he has released one studio album, two EPs and a live album to date. He was also a part of the supergro

In [8]:
" ".join(ds[1]['context'][0][1])

'Peter Doherty (born 12 March 1979) is an English musician, songwriter, actor, poet, writer, and artist.  He is best known for being co-frontman of the Libertines, which he formed with Carl Barât in 1997.  His other musical projects are indie band Babyshambles and Peter Doherty and the Puta Madres.'

In [10]:
# Run the two-step extraction on a single passage: the sentences of the first
# context entry of record 1, joined with spaces. The saved output shows this
# one passage taking about two minutes.
df = extract_kg_from_texts([" ".join(ds[1]['context'][0][1])])

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Peter Doherty (born 12 March 1979) is an English musician, songwriter, actor, poet, writer, and artist.  He is best known for being co-frontman of the Libertines, which he formed with Carl Barât in 1997.  His other musical projects are indie band Babyshambles and Peter Doherty and the Puta Madres.

Text: Peter Doherty (born 12 March 1979) is an English musician, songwriter, actor, poet, writer, and artist.  He is best known for being co-frontman of the Libertines, which he formed with Carl Barât in 1997.  His other musical projects are indie band Babyshambles and Peter Doherty and the Puta Madres.
[{'subject': 'Peter Doherty', 'relation': 'date of birth', 'object': '12 March 1979'}, {'subject': 'Peter Doherty', 'relation': 'country of citizenship', 'object': 'United Kingdom'}, {'subject': 'Peter Doherty', 'relation': 'occupation', 'object': 'Musician'}, {'subject': 'Peter Doherty', 'relation': 'occupation', 'object': 'Songwriter'}, {'subject': 'Peter Doherty', 'relation': 'occup

100%|██████████| 1/1 [02:03<00:00, 123.21s/it]

{'subject': 'The Libertines', 'relation': 'None', 'object': 'None'}

OUTPUT:  {'subject': 'The Libertines', 'relation': 'None', 'object': 'None'}
Extracted triplets after 2nd step prompting:  [{'subject': 'Peter Doherty', 'relation': 'date of birth', 'object': '12 March 1979'}, {'subject': 'Peter Doherty', 'relation': 'country of citizenship', 'object': 'United Kingdom'}, {'subject': 'Peter Doherty', 'relation': 'occupation', 'object': 'Musician'}, {'subject': 'Peter Doherty', 'relation': 'occupation', 'object': 'Songwriter'}, {'subject': 'Peter Doherty', 'relation': 'occupation', 'object': 'Actor'}, {'subject': 'Peter Doherty', 'relation': 'occupation', 'object': 'Poet'}, {'subject': 'Peter Doherty', 'relation': 'occupation', 'object': 'Writer'}, {'subject': 'Peter Doherty', 'relation': 'occupation', 'object': 'Artist'}, {'subject': 'Peter Doherty', 'relation': 'member of', 'object': 'The Libertines'}, {'subject': 'Peter Doherty', 'relation': 'member of', 'object': 'Babyshambles'}, {'




In [12]:
# Display the extracted triplets. extract_kg_from_texts already drops
# duplicates before returning, and the result here is only displayed, not
# assigned back to df.
df.drop_duplicates()

Unnamed: 0,subject,relation,object
0,Peter Doherty,date of birth,12 March 1979
1,Peter Doherty,country of citizenship,United Kingdom
2,Peter Doherty,occupation,Musician
3,Peter Doherty,occupation,Songwriter
4,Peter Doherty,occupation,Actor
5,Peter Doherty,occupation,Poet
6,Peter Doherty,occupation,Writer
7,Peter Doherty,occupation,Artist
8,Peter Doherty,member of,The Libertines
9,Peter Doherty,member of,Babyshambles


In [41]:
# Build an undirected graph with one edge per (subject, object) pair.
# nx.Graph keeps a single attribute dict per node pair, so a plain add_edge
# lets a later relation silently overwrite an earlier one between the same
# two entities. Collect every distinct relation on the edge instead and expose
# them, joined, under the 'label' attribute.
G = nx.Graph()
for subject, relation, obj in zip(df['subject'], df['relation'], df['object']):
    if not G.has_edge(subject, obj):
        G.add_edge(subject, obj, relations=[])
    edge_data = G[subject][obj]
    if relation not in edge_data['relations']:
        edge_data['relations'].append(relation)
    edge_data['label'] = "; ".join(map(str, edge_data['relations']))

# A fixed seed makes the force-directed layout (and so the figure) reproducible.
pos = nx.fruchterman_reingold_layout(G, k=0.5, seed=42)

In [42]:
# Create the edge trace. All segments share one Scatter: the None placed after
# each pair of endpoints breaks the line, so the edges are not connected to
# each other. This avoids creating one trace object per edge.
edge_x, edge_y = [], []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Kept as a list so the name `edge_traces` still refers to a list of traces.
edge_traces = [
    go.Scatter(
        x=edge_x,
        y=edge_y,
        mode="lines",
        line=dict(width=0.5, color="gray"),
        hoverinfo="none"
    )
]

# Create node trace: one labelled marker per graph node.
node_trace = go.Scatter(
    x=[pos[node][0] for node in G.nodes()],
    y=[pos[node][1] for node in G.nodes()],
    mode='markers+text',
    marker=dict(size=10, color='lightblue'),
    text=[node for node in G.nodes()],
    textposition='top center',
    hoverinfo='text',
    textfont=dict(size=7)
)

# Create edge label trace: the edge's 'label' attribute drawn at its midpoint.
edge_label_trace = go.Scatter(
    x=[(pos[source][0] + pos[target][0]) / 2 for source, target in G.edges()],
    y=[(pos[source][1] + pos[target][1]) / 2 for source, target in G.edges()],
    mode='text',
    text=[G[source][target]['label'] for source, target in G.edges()],
    textposition='middle center',
    hoverinfo='none',
    textfont=dict(size=7)
)

# Create layout. The title font is set through title.font; the older
# `titlefont` layout attribute is deprecated in its favour.
layout = go.Layout(
    title=dict(text="Knowledge Graph", font=dict(size=16), x=0.5),
    showlegend=False,
    hovermode="closest",
    margin=dict(b=20, l=5, r=5, t=40),
    xaxis_visible=False,
    yaxis_visible=False
)

# Create Plotly figure
fig = go.Figure(data=edge_traces + [node_trace, edge_label_trace], layout=layout)

# Show the interactive plot
fig.show()

In [53]:
# Show the stored answer of record 1.
ds[1]['answer']

'Jonny" Craig'

In [46]:
# Turn the question of record 1 into triplets via the extractor. In the saved
# output, slots the question leaves open carry the placeholder '<entity>'.
question_triplets = extractor.extract_triplets_from_question(question=ds[1]['question'])
question_triplets

[{'subject': 'Jonny Craig', 'relation': 'member of', 'object': '<entity>'},
 {'subject': 'Pete Doherty', 'relation': 'member of', 'object': '<entity>'},
 {'subject': 'Jonny Craig',
  'relation': 'member of more bands than',
  'object': 'Pete Doherty'}]

In [47]:
# Concrete (non-placeholder) entity names from the question, subjects first.
# dict.fromkeys de-duplicates while keeping first-seen order; list(set(...))
# does not guarantee any order, so the lists could differ from run to run.
entities = list(dict.fromkeys(
    [triplet['subject'] for triplet in question_triplets if triplet['subject'] != "<entity>"] +
    [triplet['object'] for triplet in question_triplets if triplet['object'] != "<entity>"]
))

# Concrete relation names from the question, de-duplicated the same way.
relations = list(dict.fromkeys(
    triplet['relation'] for triplet in question_triplets if triplet['relation'] != '<relation>'
))

# Look up names already known to the aligner for each question entity/relation
# (bare names only, no descriptions).
refined_entities = aligner.top_entities_by_llm_output(entities=entities, with_descriptions=False)
refined_relations = aligner.top_relations_by_llm_output(relations=relations, with_descriptions=False)

In [50]:
# question_triplets[0], question_triplets[1] = question_triplets[1], question_triplets[0]
# question_triplets

In [51]:
def _match_kg_triplets(kg, entity_names, relation_names):
    """Return the rows of ``kg`` (as dicts) that touch a wanted entity via a wanted relation.

    A row matches when its subject or its object is in ``entity_names`` and
    its relation is in ``relation_names``. Both arguments must be list-like
    (the aligner lookups are combined as lists elsewhere in this notebook).
    """
    touches_entity = kg['subject'].isin(entity_names) | kg['object'].isin(entity_names)
    has_relation = kg['relation'].isin(relation_names)
    return kg[touches_entity & has_relation].to_dict(orient='records')


# Entities discovered while grounding earlier question triplets; they anchor
# question triplets whose subject AND object are both placeholders.
missed_candidate_entities = []
# Knowledge-graph triplets matched to the question so far.
ground_triplets = []

for triplet in question_triplets:
    subject_is_placeholder = triplet['subject'] == '<entity>'
    object_is_placeholder = triplet['object'] == '<entity>'

    # Pick the entity names a matching graph row must touch.
    if subject_is_placeholder and object_is_placeholder:
        anchor_entities = missed_candidate_entities
    elif subject_is_placeholder:
        anchor_entities = refined_entities[triplet['object']]
    elif object_is_placeholder:
        anchor_entities = refined_entities[triplet['subject']]
    else:
        # Both ends are concrete: such question triplets are not grounded here.
        continue

    candidate_triplets = _match_kg_triplets(df, anchor_entities, refined_relations[triplet['relation']])
    ground_triplets.extend(candidate_triplets)

    # Remember both ends of every match. The subjects are taken from the
    # matched rows in every branch; the both-placeholder branch used to read
    # them from question_triplets, which added '<entity>' and question
    # subjects to the candidate list.
    new_entities = list(dict.fromkeys(
        [candidate['subject'] for candidate in candidate_triplets] +
        [candidate['object'] for candidate in candidate_triplets]
    ))
    missed_candidate_entities.extend(new_entities)
    

In [52]:
# Display the knowledge-graph triplets matched to the question.
ground_triplets

[{'subject': 'Peter Doherty', 'relation': 'instance of', 'object': 'Musician'},
 {'subject': 'Peter Doherty', 'relation': 'instance of', 'object': 'Actor'},
 {'subject': 'Peter Doherty', 'relation': 'instance of', 'object': 'Poet'},
 {'subject': 'Peter Doherty',
  'relation': 'member of',
  'object': 'The Libertines'},
 {'subject': 'The Libertines',
  'relation': 'member of',
  'object': 'Peter Doherty'},
 {'subject': 'Peter Doherty',
  'relation': 'member of',
  'object': 'Babyshambles'},
 {'subject': 'Peter Doherty',
  'relation': 'member of',
  'object': 'Peter Doherty and the Puta Madres'},
 {'subject': 'Jonny Craig', 'relation': 'producer', 'object': 'Kris Crummett'},
 {'subject': 'Jonny Craig',
  'relation': 'member of',
  'object': 'Post-hardcore band'},
 {'subject': 'Jonny Craig',
  'relation': 'member of',
  'object': 'Dance Gavin Dance'},
 {'subject': 'Jonny Craig', 'relation': 'member of', 'object': 'Emarosa'},
 {'subject': 'Jonny Craig', 'relation': 'member of', 'object': '

In [37]:
# Report the cost figure tracked by the extractor.
# NOTE(review): presumably cumulative API spend in USD — confirm in utils.openai_utils.
extractor.calculate_cost()

2.78697

In [24]:
# Display the current triplet table.
# NOTE(review): the saved output holds Guns N' Roses / Arnold Schwarzenegger
# triplets rather than the Peter Doherty ones extracted above, and this cell's
# execution count is lower than earlier cells' — presumably left over from an
# out-of-order run on another record; re-run top to bottom to confirm.
df

Unnamed: 0,subject,relation,object
0,Guns N' Roses,instance of,Hard rock band
1,Guns N' Roses,country of origin,United States
3,Guns N' Roses,inception,1985
4,Guns N' Roses,has part,Hollywood Rose
5,Guns N' Roses,has part,L.A. Guns
...,...,...,...
139,Arnold Schwarzenegger,instance of,Action film
140,Arnold Schwarzenegger,instance of,Comedy film
141,Arnold Schwarzenegger,performer of,Music video
142,Arnold Schwarzenegger,performer of,Actor


In [25]:
# Triplets whose subject is 'Arnold Schwarzenegger'.
df[df['subject'] == 'Arnold Schwarzenegger']

Unnamed: 0,subject,relation,object
35,Arnold Schwarzenegger,performer of,Jericho Cane
108,Arnold Schwarzenegger,performer of,Jack Slater
109,Arnold Schwarzenegger,producer,Last Action Hero
133,Arnold Schwarzenegger,occupation,Actor
136,Arnold Schwarzenegger,instance of,Actor
137,Arnold Schwarzenegger,billed as,Arnold Strong
139,Arnold Schwarzenegger,instance of,Action film
140,Arnold Schwarzenegger,instance of,Comedy film
141,Arnold Schwarzenegger,performer of,Music video
142,Arnold Schwarzenegger,performer of,Actor


In [26]:
# Triplets whose object is 'Arnold Schwarzenegger'.
df[df['object'] == 'Arnold Schwarzenegger']

Unnamed: 0,subject,relation,object
28,End of Days,has part,Arnold Schwarzenegger
105,Last Action Hero,performer of,Arnold Schwarzenegger
118,True Lies,performer of,Arnold Schwarzenegger
131,Harry Tasker,performer of,Arnold Schwarzenegger


In [28]:
# Triplets whose subject is 'Last Action Hero'.
df[df['subject'] == 'Last Action Hero']

Unnamed: 0,subject,relation,object
96,Last Action Hero,inception,1993
97,Last Action Hero,country of origin,United States
98,Last Action Hero,instance of,Fantasy film
99,Last Action Hero,instance of,Action film
101,Last Action Hero,director,John McTiernan
102,Last Action Hero,producer,John McTiernan
104,Last Action Hero,based on,Action film
105,Last Action Hero,performer of,Arnold Schwarzenegger
106,Last Action Hero,performer of,Austin O'Brien
107,Last Action Hero,performer of,Charles Dance


In [29]:
# Triplets whose object is 'Last Action Hero'.
df[df['object'] == 'Last Action Hero']

Unnamed: 0,subject,relation,object
109,Arnold Schwarzenegger,producer,Last Action Hero
110,Austin O'Brien,performer of,Last Action Hero


In [30]:
# Triplets whose object is "Guns N' Roses".
df[df['object'] == "Guns N' Roses"]

Unnamed: 0,subject,relation,object
43,Steven Adler,has part,Guns N' Roses
74,Hammerjacks Concert Hall and Nightclub,notable for,Guns N' Roses
88,The Recycler,helped launch career of,Guns N' Roses


In [31]:
# Triplets whose subject is "Guns N' Roses".
df[df['subject'] == "Guns N' Roses"]

Unnamed: 0,subject,relation,object
0,Guns N' Roses,instance of,Hard rock band
1,Guns N' Roses,country of origin,United States
3,Guns N' Roses,inception,1985
4,Guns N' Roses,has part,Hollywood Rose
5,Guns N' Roses,has part,L.A. Guns
6,Guns N' Roses,record label,Geffen Records
7,Guns N' Roses,has part,Appetite for Destruction
62,Guns N' Roses,replaced,Faction With Jason Ellis


In [32]:
# Display the passages currently held in `texts`.
# NOTE(review): the saved output lists different passages than the cell that
# builds `texts` from ds[1] — presumably from a run on another record.
texts

['Guns N\' Roses is an American hard rock band formed in Los Angeles, California in 1985 by members of Hollywood Rose and L.A. Guns.  The band has released six studio albums, two live albums, two compilation albums, two extended plays, seven video albums, eighteen singles, twenty four music videos and one video single.  Guns N\' Roses signed a deal with Geffen Records in 1986, after the independently released EP "Live ?! *@ Like a Suicide" a year before.  Its debut studio album "Appetite for Destruction" was released in 1987, reached the top of the "Billboard" 200 and sold 18 million units in the United States and approximately 33 million units worldwide.',
 'Get Christie Love!  is a 1974 made-for-television film and subsequent crime drama TV series starring Teresa Graves as an undercover female police detective who is determined to overthrow a drug ring.  This film is based on Dorothy Uhnak\'s crime-thriller novel "The Ledger".  However, the main character "Christie Opara"—a white, Ne