In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the local `curverag` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import json

import llama_cpp
import torch

from dotenv import load_dotenv
from openai import OpenAI

from curverag import utils
from curverag.curverag import CurveRAG, DEFAULT_ENTITY_TYPES, DEFAULT_GLINER_MODEL, DEFAULT_SENTENCE_TRANSFORMER_MODEL
from curverag.graph import KnowledgeGraph
from curverag.atth.kg_dataset import KGDataset
from curverag.atth.models.hyperbolic import AttH
from curverag.eval import evaluation, queries, context, expected_output

# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read from the environment further down.
load_dotenv() 

# Explore dataset

In [None]:
def _read_json(path):
    """Return the decoded contents of the JSON file at *path*."""
    with open(path, 'rb') as handle:
        return json.load(handle)


# The three 2WikiMultihopQA splits, each parsed fully into memory.
train = _read_json('../datasets/2WikiMultihopQA/train.json')
dev = _read_json('../datasets/2WikiMultihopQA/dev.json')
test = _read_json('../datasets/2WikiMultihopQA/test.json')

In [None]:
# Show which fields the first training example provides.
train[0].keys()

In [None]:
# Show the 'question' field of the first training example.
train[0]['question']

In [None]:
# Show the 'answer' field of the first training example.
train[0]['answer']

In [None]:
# Show the 'context' field of the first training example; this is the field
# that is later flattened into the documents the model is fitted on.
train[0]['context']

# Run eval

## Train Model

In [None]:
# Build the OpenAI client from the key found in the environment
# (api_key is None when OPENAI_API_KEY is not set).
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
# Entity types handed to CurveRAG for this dataset.
entity_types = [
    'people',
    'locations',
    'entities',
    'movies',
    'directors',
]

# RAG pipeline wired to the OpenAI client created above.
rag = CurveRAG(openai_client=client, entity_types=entity_types)

In [None]:
# Flatten each example's context into one plain string: take the str() form
# of the context, drop every '[' and single quote, and turn every ']' into
# '. '. A single translate pass applies all three substitutions.
_context_cleanup = str.maketrans({'[': None, ']': '. ', '\'': None})
new_docs = [str(example['context']).translate(_context_cleanup) for example in train]

In [None]:
# Number of leading documents to fit on; the query cells further down use the
# same value to decide how many training questions to ask.
dataset_size = 100
# Fit on the first `dataset_size` flattened documents.
rag.fit(new_docs[:dataset_size], dataset_name='movies')

In [None]:
# Inspect the nodes of the graph held by `rag` after fitting.
rag.graph.nodes

In [None]:
# Inspect the edges of the graph held by `rag` after fitting.
rag.graph.edges

## Test Model

In [None]:
# Ask every question whose context was part of the fitted documents, using the
# 'pp' traversal. Iterating the slice `train[:dataset_size]` mirrors the
# `new_docs[:dataset_size]` slice passed to `rag.fit`, so the set of questions
# always matches the set of fitted documents (a slice never indexes past the
# end of `train`, unlike `train[i]` over `range(dataset_size)`).
responses_pp = [
    rag.query(example['question'], traversal='pp')
    for example in train[:dataset_size]
]

In [None]:
# Ask the same questions again with the 'hyperbolic' traversal. As above, the
# questions come from `train[:dataset_size]`, the same leading slice that the
# fitted documents were taken from, so both response lists are aligned with
# the fitted examples and with each other.
responses_ = [
    rag.query(example['question'], traversal='hyperbolic')
    for example in train[:dataset_size]
]