In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# re-imported before each cell runs (edits to project code apply without a
# kernel restart).
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
from collections import defaultdict

import llama_cpp
import torch
import pandas as pd

from dotenv import load_dotenv
from openai import OpenAI

from curverag import utils
from curverag.curverag import CurveRAG, DEFAULT_ENTITY_TYPES, DEFAULT_GLINER_MODEL, DEFAULT_SENTENCE_TRANSFORMER_MODEL
from curverag.graph import KnowledgeGraph
from curverag.atth.kg_dataset import KGDataset
from curverag.atth.models.hyperbolic import AttH
from curverag.eval import evaluation, queries, context, expected_output

# Load environment variables from a .env file; the OPENAI_API_KEY read further
# down is presumably supplied this way -- confirm against the local setup.
load_dotenv() 

True

# Explore dataset

In [3]:
def _read_json(path):
    """Return the parsed JSON content of the file at ``path``."""
    with open(path, 'rb') as handle:
        return json.load(handle)


# Read the train / dev / test splits into memory, in that order.
train = _read_json('../datasets/2WikiMultihopQA/new/train.json')
dev = _read_json('../datasets/2WikiMultihopQA/new/dev.json')
test = _read_json('../datasets/2WikiMultihopQA/new/test.json')

In [4]:
# Number of examples in the dev split.
len(dev)

12576

In [5]:
# Field names present on the first dev example.
dev[0].keys()

dict_keys(['_id', 'type', 'question', 'context', 'entity_ids', 'supporting_facts', 'evidences', 'answer', 'evidences_id', 'answer_id'])

In [6]:
# Field names present on the first train example (same set as dev above).
train[0].keys()

dict_keys(['_id', 'type', 'question', 'context', 'entity_ids', 'supporting_facts', 'evidences', 'answer', 'evidences_id', 'answer_id'])

In [7]:
# Question text of the first training example.
train[0]['question']

'Are director of film Move (1970 Film) and director of film Méditerranée (1963 Film) from the same country?'

In [8]:
# First three context entries of the first training example; each entry is a
# [title, [sentence, ...]] pair.
train[0]['context'][:3]

[['Stuart Rosenberg',
  ['Stuart Rosenberg (August 11, 1927 – March 15, 2007) was an American film and television director whose motion pictures include "Cool Hand Luke" (1967), "Voyage of the Damned" (1976), "The Amityville Horror" (1979), and "The Pope of Greenwich Village" (1984).',
   'He was noted for his work with actor Paul Newman.']],
 ['Méditerranée (1963 film)',
  ['Méditerranée is a 1963 French experimental film directed by Jean-Daniel Pollet with assistance from Volker Schlöndorff.',
   'It was written by Philippe Sollers and produced by Barbet Schroeder, with music by Antione Duhamel.',
   'The 45 minute film is cited as one of Pollet\'s most influential films, which according to Jonathan Rosenbaum directly influenced Jean-Luc Goddard\'s "Contempt", released later the same year.',
   'Footage for the film was shot around the Mediterranean, including at a Greek temple, a Sicilian garden, the sea, and also features a fisherman, a bullfighter, and a girl on an operating table

In [9]:
# Answer recorded for the first training example.
train[0]['answer']

'no'

# Run eval

## Train Model

In [10]:
# Build the OpenAI client from the key exposed through the environment;
# api_key is None when OPENAI_API_KEY is not set.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
# Entity type labels handed to CurveRAG, together with the OpenAI client.
entity_types = [
    'people',
    'locations',
    'entities',
    'movies',
    'directors',
]
rag = CurveRAG(openai_client=client, entity_types=entity_types)

In [None]:
# Flatten each dev example's `context` -- a list of [title, [sentence, ...]]
# pairs -- into one plain-text document.
# The strings are joined directly instead of stringifying the nested list and
# stripping '[', ']' and quote characters: that approach also deleted every
# apostrophe/bracket belonging to the text itself (e.g. "Pollet's" -> "Pollets")
# and left repr artefacts such as separator commas, backslash escapes and
# doubled full stops in the documents.
new_docs = []
for d in dev:
    passages = [
        f"{title}. {' '.join(sentences)}"
        for title, sentences in d['context']
    ]
    new_docs.append(' '.join(passages))

In [None]:
# Number of flattened documents built from the dev split (one per example).
len(new_docs)

In [None]:
# Fit the model on the first `dataset_size` flattened documents only.
dataset_size = 1000
training_docs = new_docs[:dataset_size]
rag.fit(training_docs, dataset_name='movies')

In [None]:
# Persist the fitted model. The target directory is created first so a missing
# ./models/... folder cannot make the save fail after the expensive fit
# (harmless if rag.save already creates it or it exists: exist_ok=True).
model_path = "./models/250824_model_dev_1/model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
rag.save(path=model_path)

In [13]:
# Restore the saved model from disk and attach the live OpenAI client to it.
# Alternative to openai_client: pass llm=llm, outlines_llm=outlines_llm.
# NOTE(review): the .pkl suffix suggests pickle -- only load trusted files; confirm.
saved_model_path = "./models/250824_model_dev_1/model.pkl"
rag = CurveRAG.load(saved_model_path, openai_client=client)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Number of nodes in the loaded model's graph. The original referenced `rag2`,
# which is never defined in this notebook (NameError on a fresh kernel); the
# loaded model is bound to `rag`.
len(rag.graph.nodes)

In [None]:
# Number of nodes in rag.graph.
len(rag.graph.nodes)

In [None]:
# Preview only the first few edges instead of dumping the whole collection
# into the notebook output.
# NOTE(review): assumes rag.graph.edges is an iterable of edges -- confirm.
list(rag.graph.edges)[:10]

## Eval Model

In [None]:
# Evaluate the model on `dataset_size` examples and unpack the five results.
dataset_size = 1000
eval_results = evaluation(rag, dataset_size=dataset_size)
ems, f1, qs, preds, answers = eval_results

In [None]:
# Number of predictions returned by the evaluation run.
len(preds)

In [None]:
# Show the `ems` and `f1` values returned by evaluation().
ems, f1

In [None]:
# Print question, prediction, answer and F1 value for every evaluated item,
# followed by a separator line.
row_labels = ('Q: ', 'P: ', 'A: ', 'F:')
for f, q, p, a in zip(f1, qs, preds, answers):
    for label, value in zip(row_labels, (q, p, a, f)):
        print(label, value)
    print('-------------------------------------------')

In [None]:
# Mean of the per-question F1 values; raises ZeroDivisionError if f1 is empty.
sum(f1) / len(f1)

In [None]:
# Load the first `dataset_size` dev examples and the alias table.
dataset_path: str = '../datasets/2WikiMultihopQA/new/dev.json'
dataset_size: int = 1000
alias_path: str = '../datasets/2WikiMultihopQA/new/id_aliases.json'

with open(dataset_path, 'rb') as f:
    eval_dataset = json.load(f)[:dataset_size]

# Maps each Q_id to its aliases plus demonyms; an id that appears on several
# lines accumulates all of its entries. The alias file holds one JSON record
# per line. (defaultdict comes from collections, imported in the first cell.)
aliases = defaultdict(list)
with open(alias_path, 'rb') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            continue
        record = json.loads(line)
        aliases[record['Q_id']].extend(record['aliases'] + record['demonyms'])

In [None]:
# Repeat the evaluation with hyperbolic traversal on a 100-example sample.
# The result is unpacked into five values, matching the earlier evaluation
# cell (ems, f1, qs, preds, answers); the previous three-name unpacking could
# not succeed against the same return shape.
# NOTE(review): assumes evaluation()'s return shape does not depend on
# model_traversal -- confirm against curverag.eval.
h_ems, h_f1, h_qs, h_preds, h_answers = evaluation(rag, dataset_size=100, model_traversal='hyperbolic')

In [None]:
# Show the `h_ems` and `h_f1` values from the hyperbolic-traversal evaluation.
h_ems, h_f1