In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local curverag package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import json

import llama_cpp
import torch
import pandas as pd

from dotenv import load_dotenv
from openai import OpenAI

from curverag import utils
from curverag.curverag import CurveRAG, DEFAULT_ENTITY_TYPES, DEFAULT_GLINER_MODEL, DEFAULT_SENTENCE_TRANSFORMER_MODEL
from curverag.graph import KnowledgeGraph
from curverag.atth.kg_dataset import KGDataset
from curverag.atth.models.hyperbolic import AttH
from curverag.eval import evaluation, queries, context, expected_output

# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read from the environment further down.
load_dotenv() 

# Explore dataset

In [None]:
def _read_json(path):
    """Open ``path`` in binary mode and return the parsed JSON content."""
    with open(path, 'rb') as handle:
        return json.load(handle)


# The three 2WikiMultihopQA splits, each parsed from its own JSON file.
train = _read_json('../datasets/2WikiMultihopQA/new/train.json')
dev = _read_json('../datasets/2WikiMultihopQA/new/dev.json')
test = _read_json('../datasets/2WikiMultihopQA/new/test.json')

In [None]:
# Number of examples in the dev split.
len(dev)

In [21]:
# Field names available on a single dev example.
dev[0].keys()

dict_keys(['_id', 'type', 'question', 'context', 'entity_ids', 'supporting_facts', 'evidences', 'answer', 'evidences_id', 'answer_id'])

In [None]:
# Field names available on a single train example.
train[0].keys()

In [23]:
# Question text of the first training example.
train[0]['question']

'Are director of film Move (1970 Film) and director of film Méditerranée (1963 Film) from the same country?'

In [24]:
# First three context entries; each entry is a [title, [sentence, ...]] pair.
train[0]['context'][:3]

[['Stuart Rosenberg',
  ['Stuart Rosenberg (August 11, 1927 – March 15, 2007) was an American film and television director whose motion pictures include "Cool Hand Luke" (1967), "Voyage of the Damned" (1976), "The Amityville Horror" (1979), and "The Pope of Greenwich Village" (1984).',
   'He was noted for his work with actor Paul Newman.']],
 ['Méditerranée (1963 film)',
  ['Méditerranée is a 1963 French experimental film directed by Jean-Daniel Pollet with assistance from Volker Schlöndorff.',
   'It was written by Philippe Sollers and produced by Barbet Schroeder, with music by Antione Duhamel.',
   'The 45 minute film is cited as one of Pollet\'s most influential films, which according to Jonathan Rosenbaum directly influenced Jean-Luc Goddard\'s "Contempt", released later the same year.',
   'Footage for the film was shot around the Mediterranean, including at a Greek temple, a Sicilian garden, the sea, and also features a fisherman, a bullfighter, and a girl on an operating table

In [26]:
# Supporting facts are [title, index] pairs; the index is presumably the
# position of the supporting sentence within that title's context — TODO confirm.
train[0]['supporting_facts']

[['Move (1970 film)', 0],
 ['Méditerranée (1963 film)', 0],
 ['Stuart Rosenberg', 0],
 ['Jean-Daniel Pollet', 0]]

In [27]:
# Evidence triples of the form [subject, relation, object].
train[0]['evidences']

[['Move (1970 film)', 'director', 'Stuart Rosenberg'],
 ['Méditerranée (1963 film)', 'director', 'Jean-Daniel Pollet'],
 ['Stuart Rosenberg', 'country of citizenship', 'American'],
 ['Jean-Daniel Pollet', 'country of citizenship', 'French']]

In [25]:
# Gold answer for the first training example.
train[0]['answer']

'no'

# Run eval

## Train Model

In [None]:
# Read the OpenAI credential from the environment (None when the variable is
# unset) and build the API client that CurveRAG will use.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
# Entity categories handed to CurveRAG for this movie-centric dataset.
entity_types = [
    'people',
    'locations',
    'entities',
    'movies',
    'directors',
]
rag = CurveRAG(openai_client=client, entity_types=entity_types)

In [None]:
def _context_to_text(context):
    """Flatten one example's context into a single plain-text document.

    Args:
        context: list of ``[title, [sentence, ...]]`` pairs, as stored under
            the ``'context'`` key of each dataset record.

    Returns:
        One string in which every entry is rendered as
        ``"<title>. <sentence> <sentence> ..."`` and entries are separated by a
        single space. An empty context yields an empty string.
    """
    # Join the real strings instead of post-processing ``str(context)``: the
    # list repr followed by stripping "'" deleted genuine apostrophes, left
    # stray backslashes from escaped quotes, and kept repr-added double quotes.
    return ' '.join(
        f"{title}. {' '.join(sentences)}" for title, sentences in context
    )


# One document string per dev example, built from its context passages.
new_docs = [_context_to_text(d['context']) for d in dev]

In [None]:
# Number of document strings built from the dev split (one per example).
len(new_docs)

In [None]:
# Fit CurveRAG on the first `dataset_size` documents only.
dataset_size = 1000
docs_for_fit = new_docs[:dataset_size]
rag.fit(docs_for_fit, dataset_name='movies')

In [None]:
# Persist the fitted model to disk; the next cell reloads it from this same path.
rag.save(path="./models/250824_model_dev_1/model.pkl")

In [None]:
# Restore the saved model and attach the OpenAI client to it.
# Alternative per the original note: pass llm=llm, outlines_llm=outlines_llm
# instead of openai_client.
rag = CurveRAG.load("./models/250824_model_dev_1/model.pkl", openai_client=client)

In [None]:
# Number of nodes in the loaded model's graph.
len(rag.graph.nodes)

In [None]:
# Number of edges in the loaded model's graph. This cell previously repeated
# the node count from the cell above verbatim; the edge count is the
# complementary size check before the edges themselves are displayed.
len(rag.graph.edges)

In [None]:
# Display the graph's edges.
rag.graph.edges

## Eval Model

In [18]:
# Evaluate on `dataset_size` examples; `evaluation` hands back six values,
# several of which are zipped per question in the inspection loop below.
dataset_size = 10
(ems, f1, qs, preds, answers, graphs) = evaluation(
    rag,
    dataset_size=dataset_size,
)

query_prompt generate_response_query
QUERY PROMPT generate_response_query
Who PRON WP
is AUX VBZ
the DET DT
mother NOUN NN
of ADP IN
the DET DT
director NOUN NN
of ADP IN
film NOUN NN
Polish ADJ JJ
- PUNCT HYPH
Russian ADJ JJ
War PROPN NNP
( PUNCT -LRB-
Film PROPN NNP
) PUNCT -RRB-
? PUNCT .

You are a helpful assistant analyzing the given input data to provide an answer to the user query.
Only include the answer and use as few words as possible in your response. Provide no extra context in your response.
For example if the question can be answered with a Yes or No, then only respond with that. If the response requires a name, then only respond with the name.

# USER QUERY
Who is the mother of the director of film Polish-Russian War (Film)?

# Context:
KnowledgeGraph Overview
  There are 71 entities and 27 relationships in this graph.

Entities in the graph:
  • 'Polish-Russian War (film)':
      The entity has the following description: 2009 Polish film directed by Xawery Żuławski. 20

In [None]:
# Number of predictions returned by the evaluation run.
len(preds)

In [None]:
# Show the two metric results from the evaluation run
# (`ems` is presumably exact-match — confirm against curverag.eval).
ems, f1

In [28]:
# Print one report per evaluated question: question, prediction, gold answer,
# F1, the dev record's supporting facts/evidences, and the graph context.
# NOTE(review): pairing with dev[:dataset_size] assumes `evaluation` used the
# first `dataset_size` dev examples in order — confirm.
paired_rows = zip(f1, qs, preds, answers, graphs, dev[:dataset_size])
for score, question, prediction, answer, graph_text, record in paired_rows:
    print('Q: ', question)
    print('P: ', prediction)
    print('A: ', answer)
    print('F:', score)
    print('S:', record['supporting_facts'], record['evidences'])
    print('G: ', graph_text)
    print('-------------------------------------------')

Q:  Who is the mother of the director of film Polish-Russian War (Film)?
P:  Dorota Masłowska
A:  ['Małgorzata Braunek', 'Małgorzata Braunek']
F: 0.0
S: [['Polish-Russian War (film)', 1], ['Xawery Żuławski', 2]] [['Polish-Russian War', 'director', 'Xawery Żuławski'], ['Xawery Żuławski', 'mother', 'Małgorzata Braunek']]
G:  KnowledgeGraph Overview
  There are 71 entities and 27 relationships in this graph.

Entities in the graph:
  • 'Polish-Russian War (film)':
      The entity has the following description: 2009 Polish film directed by Xawery Żuławski. 2009 Polish film directed by Xawery Żuławski.
      It can also be referred to as: Polish-Russian War (film)
      It has the following additional information: Based on the novel Polish-Russian War under the white-red flag by Dorota Masłowska.
  • 'Xawery Żuławski':
      The entity has the following description: Polish film director born December 22, 1971. Polish film director born December 22, 1971.
      It can also be referred to as

In [None]:
# Arithmetic mean of the per-question F1 values (raises ZeroDivisionError if f1 is empty).
sum(f1) / len(f1)

In [None]:
# Repeat the evaluation with model_traversal='hyperbolic' on 100 examples.
# The earlier call to `evaluation` in this notebook unpacks six return values
# (metrics, questions, predictions, answers, graphs); unpacking only three
# here would raise "too many values to unpack", so all six are captured.
h_ems, h_f1, h_qs, h_preds, h_answers, h_graphs = evaluation(
    rag,
    dataset_size=100,
    model_traversal='hyperbolic',
)

In [None]:
# Show the two metric results from the hyperbolic-traversal run.
h_ems, h_f1