In [1]:
import os, getpass

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

_set_env("OPENAI_API_KEY")
_set_env("LANGCHAIN_API_KEY")
_set_env('LANGCHAIN_TRACING_V2')
_set_env('LANGCHAIN_ENDPOINT')
os.environ["LANGCHAIN_PROJECT"] = "problem_decomposition"

# Test config file
import os
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', type = str, default = 'openai')

config = parser.parse_args(args=['--model', 'openai'])


# Test run

In [2]:
from datasets import load_dataset
from graph_builder import build_graph

# Load Graph
ExperimentName = 'gpt3.5_turbo_test1' # external memory
graph = build_graph(config.model, ExperimentName)

# Prep data
# train_set = load_dataset("hotpot_qa", 'distractor', split='train', trust_remote_code=True)
dev_set = load_dataset("hotpot_qa", 'distractor', split = 'validation', trust_remote_code = True)
test_set = dev_set.shuffle(seed = 42) # Use as test set

In [None]:
# Run
iteration = 100

for i in range(iteration):
    data = test_set[i]

    # Extract data info
    unique_id = data['id']
    question = data['question']
    context = [
        "{}: {}".format(title, " ".join(sentences)) 
        for title, sentences in zip(data['context']['title'], data['context']['sentences'])
    ]

    # Invoke the graphs
    thread = {"configurable": {"thread_id": unique_id}}
    prompt = {
        'messages': question,
        'context': context
    }
    response = graph.invoke(prompt, thread)

In [None]:
# states
# instantiate states
states = {
    'id': [],
    'question': [],
    'answer': [],
    'gt_answer': [],
    'problem': [],
    'sub_problems': [],
    'dependencies': [],
    'sub_problem_solutions': [],
    'sub_problem_reasoning': [],
    'final_reasoning': [],
    'context': []
}

# Get responses
for i in range(iteration):
    # Extract data info
    unique_id = test_set[i]['id']
    gt_answer = test_set[i]['answer']
    
    # load response 
    thread = {"configurable": {"thread_id": unique_id}}
    curr_state = graph.get_state(thread).values

    # Save state
    for j in states.keys():
        if j in curr_state.keys():
            states[j].append(curr_state[j])
    states['question'].append(curr_state['messages'][0].content)
    states['answer'].append(curr_state['messages'][1].content)
    states['gt_answer'].append(gt_answer)
    states['id'].append(unique_id)

json.dump(states, open(f'states_{ExperimentName}.json', 'w'), indent = 4)

# Evaluation

In [None]:
# llm based evaluation
from openai import OpenAI
import logging
from typing import List, Dict
import json
from prompt import PromptTemplate

class HotpotEvaluator:
    def __init__(self, openai_key: str):
        """Initialize evaluator with OpenAI key."""
        self.client = OpenAI(api_key=openai_key)
        self.prompt_template = PromptTemplate.hotpot_evaluation
        
        self.yes_count = 0
        self.no_count = 0
        self.total = 0
        self.correctness = {'id': [], 'list': []}

    def evaluate_single(self, question: str, correct_answer: str, model_answer: str) -> str:
        """Evaluate a single answer using GPT-4o."""
        prompt = self.prompt_template.format(
            question = question,
            correct_answer = correct_answer,
            answer = model_answer
        )
        
        try:
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )
            return response.choices[0].message.content
        except Exception as e:
            logging.error(f"Evaluation failed: {e}")
            return "ERROR"

    def evaluate_dataset(self, data: Dict) -> Dict:
        """
        Evaluate a dataset of hotpot QA pairs.
        
        Args:
            data: Dict with keys 'question', 'gt_answer' (correct), and 'answer' (model response) and values as lists
        
        Returns:
            Dict with evaluation metrics
        """
        self.total = len(data['id'])

        for i in range(self.total):
            result = self.evaluate_single(
                data['question'][i],
                data['gt_answer'][i],
                data['answer'][i]
            )
            
            if 'yes' in result.lower():
                self.yes_count += 1                
                self.correctness['list'].append('yes')
            elif 'no' in result.lower():
                self.no_count += 1
                self.correctness['list'].append('no')
            self.correctness['id'].append(data['id'][i])

        return {
            'correct': self.yes_count,
            'incorrect': self.no_count,
            'accuracy': self.yes_count / self.total if self.total > 0 else 0,
            'correctness': self.correctness
        }

# Example usage:
evaluator = HotpotEvaluator(openai_key = os.environ["OPENAI_API_KEY"])

states = json.load(open('states_{ExperimentName}.json', 'r'))
results = evaluator.evaluate_dataset(states)
print(results)  # {'correct': 1, 'incorrect': 0, 'accuracy': 1.0}

{'correct': 3, 'incorrect': 2, 'accuracy': 0.6}


# Benchmarking

In [None]:
# prompt