In [1]:
import os, getpass

def _set_env(var: str):
    """Ensure the environment variable ``var`` holds a non-empty value.

    When ``var`` is absent from ``os.environ`` (or set to an empty string),
    the value is requested through ``getpass.getpass`` using the prompt
    ``"<var>: "`` and stored under that name. A value that is already set
    and non-empty is left untouched and no prompt is shown.
    """
    # Guard clause: nothing to do when a usable value already exists.
    if os.environ.get(var):
        return
    os.environ[var] = getpass.getpass(f"{var}: ")

# Prompt (in this order) for every OpenAI / LangChain setting that is not
# already present in the environment.
for _required_var in (
    "OPENAI_API_KEY",
    "LANGCHAIN_API_KEY",
    'LANGCHAIN_TRACING_V2',
    'LANGCHAIN_ENDPOINT',
):
    _set_env(_required_var)

# The project name is always overwritten, even when it was set beforehand.
os.environ["LANGCHAIN_PROJECT"] = "problem_decomposition"

# Test config file
import os
import argparse

# Notebook stand-in for a command-line configuration: a single `--model`
# option (string, defaulting to "openai").
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model",
    type=str,
    default="openai",
)

# An explicit argument list is supplied so that parse_args does not fall back
# to reading sys.argv.
config = parser.parse_args(args=["--model", "openai"])


# Test run

In [None]:
from datasets import load_dataset
from problem_decomposition import build_graph

# Load Graph
# ExperimentName is passed to build_graph and is also embedded in the
# states/metrics file names written by later cells.
# Alternative used previously: 'gpt3.5_turbo_test1' (external memory).
ExperimentName = 'prompt_3'
graph = build_graph(config.model, ExperimentName)

# Prep data
# Only the validation split is loaded; the train split
# (split='train', same arguments) is not needed here.
dev_set = load_dataset(
    "hotpot_qa",
    'distractor',
    split='validation',
    # NOTE(review): trust_remote_code=True presumably lets the dataset's own
    # loading script run locally; confirm this is acceptable.
    trust_remote_code=True,
)

# Shuffle with a fixed seed; the shuffled validation split is used as the test set.
test_set = dev_set.shuffle(seed=42)

In [5]:
# Run
# Number of examples (taken from the front of the shuffled test set) to
# process. Later cells reuse `iteration` to collect and inspect the results.
iteration = 100

# Start from the first example instead of a hard-coded resume index: the
# collection cell reads saved state for every index in range(iteration), so
# every one of them must have been run. Resuming after an interruption is
# handled by the per-thread check below.
for i in range(iteration):
    data = test_set[i]

    # Extract data info
    unique_id = data['id']
    question = data['question']
    # One string per document: "<1-based number>. <title>: <sentences joined by spaces>"
    context = [
        "{}. {}: {}".format(doc_number, title, " ".join(sentences))
        for doc_number, (title, sentences) in enumerate(
            zip(data['context']['title'], data['context']['sentences']),
            start=1,
        )
    ]

    # Each example gets its own thread, keyed by the dataset id.
    thread = {"configurable": {"thread_id": unique_id}}

    # Resume support: skip an example whose thread already holds at least two
    # messages (the collection cell reads messages[0] as the question and
    # messages[1] as the answer). Re-invoking such a thread would spend
    # another LLM call on an example that is already done.
    # NOTE(review): assumes a thread with no saved state exposes empty
    # `.values` - confirm against the checkpointer used by build_graph.
    saved_state = graph.get_state(thread).values
    if len(saved_state.get('messages', [])) >= 2:
        continue

    # Invoke the graphs
    prompt = {
        'messages': question,
        'context': context
    }
    response = graph.invoke(prompt, thread)

In [6]:
# states
import json
import os

# Column-oriented record of the run: one list per field, with one entry
# appended per processed example.
states = {
    'id': [],
    'question': [],
    'answer': [],
    'gt_answer': [],
    'supporting_documents': [],
    'supporting_facts': [],
    'problem': [],
    'sub_problems': [],
    'dependencies': [],
    'sub_problem_solutions': [],
    'sub_problem_reasoning': [],
    'final_reasoning': [],
    'context': []
}

# Get responses
for i in range(iteration):
    # Fetch the row once instead of re-reading it for every field.
    example = test_set[i]

    # Extract data info
    unique_id = example['id']
    gt_answer = example['answer']
    supp_facts = example['supporting_facts']['title']
    context_titles = example['context']['title']
    # Supporting titles -> sorted, de-duplicated 1-based positions in the
    # context list (the same numbering the run loop uses for its documents).
    supp_facts_idx = sorted({
        context_titles.index(supp_fact_i) + 1
        for supp_fact_i in supp_facts
    })

    # load response
    thread = {"configurable": {"thread_id": unique_id}}
    curr_state = graph.get_state(thread).values
    # De-duplicate and order the documents cited in the saved state.
    curr_state['supporting_documents'] = sorted(set(curr_state['supporting_documents']))

    # Save state: copy every tracked field that the graph state provides.
    # NOTE(review): a field missing from one example's state is silently
    # skipped, which would leave that column shorter than the others.
    for key in states:
        if key in curr_state:
            states[key].append(curr_state[key])
    # The remaining fields come from the message history and the dataset row.
    states['question'].append(curr_state['messages'][0].content)
    states['answer'].append(curr_state['messages'][1].content)
    states['gt_answer'].append(gt_answer)
    states['supporting_facts'].append(supp_facts_idx)
    states['id'].append(unique_id)

# Create the output directory if needed and close the file deterministically
# (the handle was previously left open, relying on garbage collection to flush).
os.makedirs('states', exist_ok=True)
with open(f'states/states_{ExperimentName}.json', 'w') as states_file:
    json.dump(states, states_file, indent = 4)

# Evaluation

In [7]:
from evaluator import HotpotEvaluator

# Instantiate evaluator
evaluator = HotpotEvaluator(openai_key = os.environ["OPENAI_API_KEY"])

# Load states
# Read back from disk (rather than reusing the in-memory dict) inside a
# context manager so the file handle is closed right away.
with open(f'states/states_{ExperimentName}.json', 'r') as states_file:
    states = json.load(states_file)

# Calculate metrics
results = evaluator.calculate_metrics_answer(states)
results_docs = evaluator.calculate_metrics_supporting_docs(states)

# Save results
# Make sure the output directory exists, then write each metrics dict through
# a context manager so the data is flushed and the handle closed.
os.makedirs('metrics', exist_ok=True)
with open(f'metrics/metrics_answer_{ExperimentName}.json', 'w') as answer_file:
    json.dump(results, answer_file, indent = 4)
with open(f'metrics/metrics_docs_{ExperimentName}.json', 'w') as docs_file:
    json.dump(results_docs, docs_file, indent = 4)

In [None]:
# Prompt 3!
# Headline numbers, in order: supporting-document avg F1, supporting-document
# exact-match ratio, answer avg F1, answer accuracy.
(
    results_docs['avg_f1'],
    results_docs['exact_match_ratio'],
    results['avg_f1'],
    results['accuracy'],
)

(0.7557142857142856, 0.45, 0.5159900344715932, 0.75)

In [None]:
# Check the EM and F1 scores are good to use
# For each example, print the ground truth next to the produced answer,
# followed by the per-example correctness, evaluator response, F1 and
# exact-match values, and then a blank separator line.
for idx in range(iteration):
    print(states['gt_answer'][idx], states['answer'][idx], sep=' ,  ')
    corr, res = results['correctness'][idx], results['response'][idx]
    f1score, em = results['f1'][idx], results['exact_match'][idx]
    print(f'corr: {corr}, res: {res}, f1: {f1score}, em: {em}', end='\n\n')

Prussian ,  Otto von Bismarck
corr: no, res: NO

The answer "Otto von Bismarck" refers to a character's name, not the nationality. The question asks for the nationality, which is "Prussian." Therefore, the provided answer is not factually correct or complete., f1: 0, em: False

Kurt Julian Weill ,  Kurt Weill
corr: yes, res: YES, f1: 0.8, em: False

U2 ,  U2 released the song 'With or Without You' first.
corr: yes, res: YES, f1: 0.2222222222222222, em: False

Oldham County ,  Oldham County, Kentucky
corr: yes, res: YES, f1: 0.8, em: False

138,535 ,  138535
corr: yes, res: YES

The answer "138535" is factually correct and complete as it matches the correct answer "138,535" in terms of numerical value, despite the lack of a comma., f1: 1.0, em: True

1959 ,  1959
corr: yes, res: YES, f1: 1.0, em: True

October 25, 1881 ,  Thomas Fitch defended John Henry Holliday after the gunfight at the O.K. Corral in 1881.
corr: no, res: NO

The evaluated answer is incomplete because it does not spec

# Benchmarking

In [None]:
# prompt
from pydantic import BaseModel

from prompt import PromptTemplate
from utils import load_llm

class SingleRes(BaseModel):
    # Response schema handed to `llm.with_structured_output` for the
    # single-call baseline in this section.
    # NOTE(review): documented with `#` comments on purpose - a docstring on a
    # pydantic model may be included in the schema sent to the LLM; confirm
    # before adding one.
    answer: str  # the answer text
    reasoning: str  # the accompanying explanation
    # presumably the 1-based document numbers used in the context listing - TODO confirm
    supporting_documents: list[int]

# Prompt
# One-message prompt built from the project's `direct_solution` template.
# NOTE(review): `SystemMessage` is not imported in any cell visible in this
# notebook, and `contexts` is not assigned anywhere visible (the run loop
# defines `context`). `question` is whatever the run loop assigned last.
# Confirm the intended import and variable names before running this cell.
single_prompt = [
    SystemMessage(
        PromptTemplate.direct_solution.format(
            Problem = question,
            Context = contexts
        )
    )
]


# Load LLM
llm = load_llm(config.model)
# Ask the model for output following the SingleRes schema
# (answer, reasoning, supporting_documents).
structured_llm = llm.with_structured_output(SingleRes)

# Single call with the prompt built above; no graph or thread is involved here.
response = structured_llm.invoke(single_prompt)