In [1]:
from tqdm import tqdm_notebook
from tqdm import tqdm

import json
import pandas as pd

from utils.openai_utils import LLMTripletExtractor
from utils.dynamic_index_utils import Aligner

import os
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the HotpotQA subset used throughout the notebook into `ds`.
# The encoding is stated explicitly: JSON is UTF-8, while open()'s default
# depends on the platform locale and can mis-decode non-ASCII text.
with open("hotpotqa200.json", "r", encoding="utf-8") as f:
    ds = json.load(f)

In [9]:
# Inspect element 1 of the first context entry of the first sample.
# NOTE(review): the saved output below looks like a [title, [sentences]] pair,
# and this cell's execution count is out of order — the output may be stale;
# re-run on a fresh kernel to confirm what this index actually returns.
ds[0]['context'][0][1]

['Constantin Medien',
 ['Constantin Medien AG (formerly EM.Entertainment and EM.TV & Merchandising AG, then EM.TV AG, and finally em.sport media ag) is a German media group, based in Ismaning near Munich, active in the area of sports, film and event marketing to medium-sized media companies.']]

In [31]:
# Share of samples whose answer string occurs verbatim in the sample's context.
# For each sample, element 1 of every context entry is space-joined, the
# per-entry strings are space-joined again, and the answer is searched in it.
answer_in_context = [
    elem['answer'] in " ".join(" ".join(sent[1]) for sent in elem['context'])
    for elem in ds
]
sum(answer_in_context) / len(ds)

0.99

In [3]:
# Show the question text of the sample at index 1.
ds[1]['question']

'Which of Jonny Craig and Pete Doherty has been a member of more bands ?'

In [4]:
# Run configuration: model name passed to the triplet extractor and the
# device string passed to the aligner.
model_name = 'gpt-4o'
device = 'cuda:1'  # NOTE(review): hard-coded device — confirm it exists on the machine running this

# NOTE(review): the retrieval loop later in the notebook rebinds `aligner` to a
# new Aligner on every iteration, so this instance looks unused — confirm.
aligner = Aligner(device=device)
# The same extractor instance is reused by the later cells (entity extraction
# and calculate_cost()).
extractor = LLMTripletExtractor(model=model_name)

In [6]:
# Cost reported by the extractor before the retrieval loop runs (baseline).
# NOTE(review): presumably accumulated LLM API cost; units are not visible here.
extractor.calculate_cost()

0.0

In [7]:
# Build `retrieved_triplets`: sample id -> record array of the triplets whose
# subject or object was matched to an entity extracted from the question.
retrieved_triplets = {}

RESULTS_DIR = 'hotpot200_res'

# os.listdir() returns entries in arbitrary order and may contain entries that
# are not results (e.g. hidden checkpoint folders). Keep only names whose stem
# is an integer id and process them in numeric order so runs are reproducible.
result_files = sorted(
    (name for name in os.listdir(RESULTS_DIR) if name.split(".")[0].isdigit()),
    key=lambda name: int(name.split(".")[0]),
)

for sample in tqdm(result_files):

    df = pd.read_csv(os.path.join(RESULTS_DIR, sample), index_col=0)

    # NOTE(review): a new Aligner is constructed for every file — presumably so
    # each sample is matched only against its own entities; confirm, since
    # construction may be expensive.
    aligner = Aligner(device=device)
    # De-duplicate while keeping first-seen order (set() order is not stable
    # across runs for strings).
    entities = list(dict.fromkeys(list(df.subject) + list(df.object)))
    aligner.add_entities(entities, descriptions=[''] * len(entities))

    # File names are "<sample index>.<ext>"; the index addresses `ds`.
    id_ = int(sample.split(".")[0])
    extracted_entities = extractor.extract_entities_from_question(ds[id_]['question'])

    top_entities = aligner.top_entities_by_llm_output(list(extracted_entities.keys()))

    # Flatten the per-question-entity matches into one de-duplicated list.
    unique_top_entities = list(dict.fromkeys(
        entity for mapped_entities in top_entities.values() for entity in mapped_entities
    ))

    # Vectorised membership test instead of a per-row Python lambda.
    matches = df['subject'].isin(unique_top_entities) | df['object'].isin(unique_top_entities)
    triplets = df[matches]

    triplets = triplets.to_records(index=False)

    retrieved_triplets[id_] = triplets



  0%|          | 0/50 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 50/50 [02:33<00:00,  3.07s/it]


In [8]:
# Cost reported by the extractor after the retrieval loop; compare with the
# value shown before the loop to see what the entity-extraction calls added.
# NOTE(review): presumably accumulated LLM API cost; units are not visible here.
extractor.calculate_cost()

0.110065

In [12]:
# Exact-match hit rate: a sample counts as a hit when its answer equals the
# subject or the object of at least one retrieved triplet. Answers of missed
# samples are printed for inspection.
contain_ans = 0
sample_cnt = len(retrieved_triplets)

for i, sample in retrieved_triplets.items():
    df = pd.DataFrame(sample)
    ans = ds[i]['answer']

    exact_hit = (df['subject'] == ans) | (df['object'] == ans)
    if exact_hit.any():
        contain_ans += 1
        continue
    print(ans)

contain_ans / sample_cnt

Jonny" Craig
Bath, Maine
fortnightly women interest magazine
a failed coup attempt
2 March 1972
no
Pulitzer Prize
Hawaii County
German
1941
super-regional shopping mall
M. Night Shyamalan
1978
London
no
chronological collection of critical quotations
22 November
Minnesota
1927
"Read It and Weep" (2006)
Velvetpark
Papa Gino's
Shukratara
Captain Hans Geering
Frederick Alexander
The Bad Hemingway Contest
Vivendi S.A.
Jimmy Ellis
Mark Masons' Hall
no
Aloe Vera of America


0.38

In [18]:
# Substring hit rate: a sample counts as a hit when its answer contains, or is
# contained in, the subject or object of at least one retrieved triplet.
# Answers of missed samples are printed for inspection.
contain_ans = 0
sample_cnt = 0

for i, sample in retrieved_triplets.items():
    df = pd.DataFrame(sample)

    ans = ds[i]['answer']
    # Keep only non-empty strings: '' is a substring of every answer (a false
    # hit), and a non-string cell such as NaN would raise TypeError on `in`.
    subj_obj = [
        elem
        for elem in list(df['subject']) + list(df['object'])
        if isinstance(elem, str) and elem
    ]

    # Generator form lets any() stop at the first match.
    if any(ans in elem or elem in ans for elem in subj_obj):
        contain_ans += 1
    else:
        print(ans)
    sample_cnt += 1

contain_ans / sample_cnt

Jonny" Craig
fortnightly women interest magazine
a failed coup attempt
2 March 1972
German
1941
super-regional shopping mall
M. Night Shyamalan
1978
London
no
chronological collection of critical quotations
22 November
Minnesota
1927
Captain Hans Geering
The Bad Hemingway Contest
Vivendi S.A.
Jimmy Ellis
Mark Masons' Hall
no
Aloe Vera of America


0.56

In [22]:
# NOTE(review): `df` is not defined in this cell — it is whatever the last
# iteration of an earlier loop left behind, so the displayed sample depends on
# iteration order and on which cell ran last.
df

Unnamed: 0,subject,relation,object
0,Max Charles,occupation,Actor
1,Jonathan Walsh,occupation,Professional StarCraft 2 player
2,Jonathan Walsh,member of sports team,GOMTV Global Starcraft II League
3,Jonathan Walsh,award received,Major League Gaming Starcraft 2 tournament in ...
4,GOMTV Global Starcraft II League,platform,Professional StarCraft 2 player
5,Jonathan Walsh,award received,GOMTV Global Starcraft II League
6,Pedro Moreno Durán,occupation,Professional StarCraft 2 player
7,LucifroN,participant in,DreamHack
8,Pedro Moreno Durán,participant in,Professional StarCraft 2 player
9,Atsuko Tanaka,instance of,Actor


In [21]:
# Question of the sample at index `i`.
# NOTE(review): `i` is a loop variable leaked from an earlier cell; this only
# refers to a well-defined sample if that loop was the last one executed.
ds[i]['question']

'The role of "Celene" in the film "Walk All over Me" was played by an actress that voices what role in the "Starcraft 2" triolgy?'

In [23]:
# Answer of the sample at index `i`.
# NOTE(review): `i` is a loop variable leaked from an earlier cell; this only
# refers to a well-defined sample if that loop was the last one executed.
ds[i]['answer']

'Sarah Kerrigan'

In [38]:
# Distribution of the number of retrieved triplets per sample.
len_dist = [len(elem) for elem in retrieved_triplets.values()]

hist = go.Histogram(x=len_dist)
fig = go.Figure(data=hist)
# Title and axis labels so the figure can be read without the surrounding code.
fig.update_layout(
    title="Retrieved triplets per question",
    xaxis_title="Number of retrieved triplets",
    yaxis_title="Number of questions",
)

# Show the interactive plot
fig.show()