In [8]:
from langfuse_config import langfuse, langfuse_handler
from llm import llm
from graph import graph


In [9]:
import pprint
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage, AIMessage

# Smoke test: stream one sample question through the graph and pretty-print
# what every node emits along the way.
inputs = {
    "messages": [
        HumanMessage('what was the percentage change in the net cash from operating activities from 2008 to 2009'),
    ]
}

# Each streamed event is a dict keyed by node name. The loop variables are named
# for what they hold; in particular `score` is not used here, because that name
# holds the numeric evaluation score in the evaluation cell further down.
for event in graph.stream(inputs, config={"callbacks": [langfuse_handler]}):
    for node_name, node_output in event.items():
        print(f"Output from node '{node_name}':")
        print("---")
        pprint.pprint(node_output, indent=2, width=80, depth=None)
    # Visual separator between consecutive stream events.
    print()
    print("---")
    print()

Output from node 'extract_question':
---
{ 'question': 'what was the percentage change in the net cash from operating '
              'activities from 2008 to 2009',
  'steps': ['extract_question']}

---

Output from node 'cheating_retriever':
---
{ 'documents': [ Document(id='Single_JKHY/2009/page_28.pdf-3', metadata={'qa': "{'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1%', 'explanation': '', 'ann_table_rows': [6], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '206588', 'arg2': '181001', 'res': '25587'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '181001', 'res': '14.1%'}], 'program': 'subtract(206588, 181001), divide(#0, 181001)', 'gold_inds': {'table_6': '2008 the net cash from operating activities of year ended june 30 2009 2008 is $ 206588 ; the net cash from operating activities of year ended june 30 2009 2008 is $ 181001 ; the net cash from operating activities of year ended june 30 2009 is $ 

In [13]:
# Fetch the Langfuse dataset named "convfinqa-train"; the evaluation loop below iterates over its items.
dataset = langfuse.get_dataset("convfinqa-train")
from datetime import datetime
from prompts import eval_prompt_template

from tqdm.auto import tqdm

def _numeric_readings(text):
    """Return the set of floats that `text` may denote, ignoring any '$'.

    Two readings are produced: one where '%' means "divide by 100"
    ('14.1%' -> 0.141) and one where '%' is simply dropped ('14.1%' -> 14.1).

    Raises:
        ValueError: if either reading is not a valid float literal.
        AttributeError: if `text` has no `.replace` (i.e. is not a string).
    """
    bare = text.replace("$", "")
    return {float(bare.replace('%', 'e-2')), float(bare.replace('%', ''))}


def check_error_llm(input, actual, expected):
    """Score the answer `actual` against `expected` for the question `input`.

    Cheap deterministic checks run first; the LLM judge is only consulted when
    none of them decides the outcome.

    Args:
        input: The question text (only used to build the judge prompt).
        actual: The answer produced by the application.
        expected: The reference answer.

    Returns:
        0 if `actual` is empty but `expected` is not;
        -1 if `actual` is non-empty but `expected` is empty;
        1 if both are identical, or share a numeric reading once '$' is
        stripped and '%' is read either as "/100" or as a plain suffix;
        otherwise the float parsed from the LLM judge's completion.

    Raises:
        ValueError: if the judge's completion text cannot be parsed as a float.
    """
    # Base cases, we don't need to use LLM for that
    if actual == "" and expected != "":
        return 0
    if actual != "" and expected == "":
        return -1
    if actual == expected:
        return 1

    try:
        # Set literals / set operators are required here: `set(a, b)` raises
        # TypeError (set() accepts at most one iterable), and the previous bare
        # `except` swallowed it, so this numeric shortcut never fired.
        if _numeric_readings(actual) & _numeric_readings(expected):
            return 1
    except (AttributeError, TypeError, ValueError):
        # Not plain numbers (or not strings at all): let the LLM judge decide.
        pass

    # Otherwise, use LLM
    prompt = eval_prompt_template.format(question=input, actual_answer=actual, expected_answer=expected)
    out = llm.completions.create(model="llama3.1", prompt=prompt, max_tokens=5)
    return float(out.choices[0].text)

# Evaluation run: push the first few dataset items through the graph, score each
# answer with `check_error_llm`, and attach the score to the item's Langfuse trace.

# Only the first MAX_EVAL_ITEMS items are evaluated.
MAX_EVAL_ITEMS = 10

# Compute the run name once. Building the timestamp inside the loop gave items
# processed in different seconds different run names instead of one shared name.
run_name = datetime.now().strftime('%Y%m%d%H%M%S')

scores = []
for item in tqdm(dataset.items[:MAX_EVAL_ITEMS]):
    # `item.observe` yields the trace id that the score below is attached to.
    with item.observe(
        run_name=run_name,
        run_description="My first run",
        run_metadata={"model": "llama3.1"},
    ) as trace_id:
        # Run the graph on the dataset item input.
        inputs = {
            "messages": [
                HumanMessage(item.input),
            ]
        }

        output = graph.invoke(inputs, config={"callbacks": [langfuse_handler]})
        answer = output['answer']
        generation = output['generation']

        # Evaluate the output to compare different runs more easily
        score = check_error_llm(item.input, answer, item.expected_output)

        # Print input, answer, expected output, and the score in a more readable format
        print(f"Input: {item.input}")
        print(f"Actual Answer: {answer}")
        print(f"Expected Answer: {item.expected_output}")
        print(f"Score: {score}\n" + "-"*50)
        # Show the full generation only for low scores, to help debugging.
        if score < .5:
            print(f"Generation: {generation}")

        # NOTE(review): the score is named after the question text, so every item
        # gets its own score name — confirm this is intended rather than a single
        # fixed metric name.
        langfuse.score(
            trace_id=trace_id,
            name=item.input,
            value=score,
            comment=generation,  # optional, useful to add reasoning
        )

        scores.append(score)

# Print the final average score in a formatted way.
# Guard against an empty dataset, where the division would raise ZeroDivisionError.
average_score = sum(scores) / len(scores) if scores else float("nan")
print(f"\n{'='*50}\nAverage Score: {average_score:.2f}\n{'='*50}")

# Flush the langfuse client to ensure all data is sent to the server at the end of the experiment run
langfuse.flush()


  0%|          | 0/10 [00:00<?, ?it/s]

Input: what is the percentage change in the total debt from 2013 to 2014?
Actual Answer: 
Expected Answer: 4.2%
Score: 0
--------------------------------------------------
Generation: To determine the percentage change in the total debt from 2013 to 2014, I need to:

1. Look for the relevant data:
   The total debt in 2013 was $1662.5 and in 2014 was $1732.8.

2. Calculate the difference between the two values:
    $1732.8 ( total debt in 2014 ) -  $1662.5 ( total debt in 2013 ) = $70.3

3. Determine the percentage increase:
   To find the percentage change, I'll divide the change by the original value and multiply by 100: 

    (($73.1 / $1662.5) * 100) %
Input: what is the net margin for 2006?
Actual Answer: 12.32%
Expected Answer: 10.5%
Score: 0.96
--------------------------------------------------
Input: in ( millions of barrels ) , what was the average of beginning and end of year reserves?
Actual Answer: 
Expected Answer: 404.5
Score: 0
-------------------------------------------