In [1]:
from langfuse_config import langfuse, langfuse_handler
from llm import llm, MODEL_NAME
from graph import graph
import dotenv

import os

dotenv.load_dotenv()



True

In [2]:
# Apply Llama3.1 chat-template
def format_prompt(user_query):
    template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""
    return template.format(user_query)

llm.completions.create(model="models/Llama-3.1-Storm-8B.Q8_0.gguf", prompt=format_prompt("Tell a story about a guy"), max_tokens=1000)

Completion(id='cmpl-7bf96f506ea84353a4bc52e5c5ab7873', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text="Once upon a time, in a small yet vibrant town, there lived a young man named Jack. Jack was known for his zest for life and his passion for adventure. He had an innate ability to see the best in people and the world around him, which made him beloved by all who knew him. Jack's heart was always open to new things and he never missed an opportunity to step out of his comfort zone and explore. Despite his outgoing personality, Jack also harbored a quiet creative side, capturing the beauty of the world through his photography.\n\nOne sunny summer morning, Jack woke up with a peculiar sense of restlessness. He couldn't shake off the feeling that he needed to do something significant before the summer season ended. So, with an adventurous glint in his eye, Jack decided he would embark on a journey across the country. He bought a rugged backpack, loaded his cam

In [3]:
import pprint
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage, AIMessage

inputs = {
    "messages": [
        HumanMessage('what was the percentage change in the net cash from operating activities from 2008 to 2009'),
    ]
}

for error in graph.stream(inputs, config={"callbacks": [langfuse_handler]}):
    for key, value in error.items():
        print(f"Output from node '{key}':")
        print("---")
        pprint.pprint(value, indent=2, width=80, depth=None)
    print()
    print("---")
    print()

Langfuse client is disabled since no public_key was provided as a parameter or environment variable 'LANGFUSE_PUBLIC_KEY'. See our docs: https://langfuse.com/docs/sdk/python/low-level-sdk#initialize-client


Output from node 'extract_question':
---
{ 'question': 'what was the percentage change in the net cash from operating '
              'activities from 2008 to 2009',
  'steps': ['extract_question']}

---

Output from node 'retriever':
---
{ 'documents': [ Document(id='Single_JKHY/2009/page_28.pdf-3', metadata={'id': 'Single_JKHY/2009/page_28.pdf-3', 'qa': "{'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1%', 'explanation': '', 'ann_table_rows': [6], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '206588', 'arg2': '181001', 'res': '25587'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '181001', 'res': '14.1%'}], 'program': 'subtract(206588, 181001), divide(#0, 181001)', 'gold_inds': {'table_6': '2008 the net cash from operating activities of year ended june 30 2009 2008 is $ 206588 ; the net cash from operating activities of year ended june 30 2009 2008 is $ 181001 ; the net cash from operating activities o

In [4]:
dataset = langfuse.get_dataset("convfinqa-train")
from datetime import datetime
from nodes import CHEATING_RETRIEVAL, DISABLE_GENERATION
from prompts import eval_prompt_template

from tqdm.auto import tqdm

MODEL_NAME = os.getenv("LLM_MODEL", "llama3.1")

def relative_score(a, b):
    """
    Relative difference between two numbers
    
    We also apply a quadratic penalty to penalize larger differences more.
    """
    if a == b:
        return 1.0
    else:
        return 1 - ((abs(a - b) / max(abs(a), abs(b))) ** 2)
    
def retrieval_precision_score(predicted: list[str], expected: str) -> float:
    """
    Number of relevant documents retrieved / number of documents retrieved    
    In case of ConvFinQA, we only have 1 expected document. 
    """

    return float(expected in predicted) / len(predicted)

def retrieval_recall_score(predicted: list[str], expected: str) -> float:
    """
    Number of relevant documents retrieved / number of relevant documents
    
    In case of ConvFinQA, we only have 1 expected document. 
    So if the document is in the predicted set, we get a recall of 1
    Otherwise, we get a recall of 0
    """
    return float(expected in predicted)


def correctness_score(input, predicted, expected):
    predicted = predicted.lower().strip()
    expected = expected.lower().strip()

    # Base cases, we don't need to use LLM for that 
    if predicted == "" and expected != "":
        return 0
    if predicted != "" and expected == "":
        return None
    if predicted == expected:
        return 1
    
    # Compare numbers, allow for percentages, dollars signs
    try:
        expected_parsed = float(expected.replace('%', 'e-2').replace("$", "").replace(",", "").replace(" ", ""))
        expected_parsed_2 = float(expected.replace('%', '').replace("$", "").replace(",", "").replace(" ", ""))
        predicted_parsed = float(predicted.replace('%', 'e-2').replace("$", "").replace(",", "").replace(" ", ""))
        predicted_parsed_2 = float(predicted.replace('%', '').replace("$", "").replace(",", "").replace(" ", ""))
        return max(
            relative_score(predicted_parsed, expected_parsed),
            relative_score(predicted_parsed_2, expected_parsed_2),
            relative_score(predicted_parsed, expected_parsed_2),
            relative_score(predicted_parsed_2, expected_parsed),
        )
    except Exception as e:
        pass

    # Otherwise, use LLM
    print("Using LLM for score generation...")
    prompt = eval_prompt_template.format(question=input, actual_answer=predicted, expected_answer=expected)
    out = llm.completions.create(model=MODEL_NAME, prompt=format_prompt(prompt), max_tokens=10, temperature=0)
    out_text = out.choices[0].text
    out_text = out_text.replace("<OUTPUT>", "").replace("</OUTPUT>", "")
    try:
        float_out = float(out_text)
    except:
        float_out = None # Error
        print(f"Error generating score (generated: {out_text})")
    return float_out

answer_correctness_scores = []
retrieval_precision_scores = []
retrieval_recall_scores = []
run_name = f"{datetime.now().strftime('%Y%m%d%H%M%S')}"
for item in tqdm(dataset.items[:1000]):
    handler = item.get_langchain_handler(
        run_name=run_name,
        run_description="RAG evaluation with ground-truth documents",
        run_metadata={"model": MODEL_NAME, "cheating_retrieval": CHEATING_RETRIEVAL, "disable_generation": DISABLE_GENERATION,
                    #    "graph": graph.get_graph().to_json()
                       },
    )

    # Make sure your application function is decorated with @observe decorator to automatically link the trace
    # run your @observe() decorated application on the dataset item input
    inputs = {
        "messages": [
            HumanMessage(item.input),
        ]
    }

            
    output = graph.invoke(inputs, config={"callbacks": [handler]})

    question = output['question']
    answer = output['answer']
    generation = output['generation']

    retrieved_doc_ids = [doc.metadata['id'] for doc in output['documents']]
    assert all(retrieved_doc_ids), "Invalid document IDs"
    expected_doc_id = item.metadata['document']['id']

    retrieval_precision = retrieval_precision_score(retrieved_doc_ids, expected_doc_id)
    retrieval_recall = retrieval_recall_score(retrieved_doc_ids, expected_doc_id)

    retrieval_precision_scores.append(retrieval_precision)
    retrieval_recall_scores.append(retrieval_recall)

    # Evaluate the output to compare different runs more easily
    correctness = correctness_score(item.input, answer, item.expected_output)

    # Print input, answer, expected output, and the score in a more readable format
    print(f"Input: {item.input}")
    print(f"Predicted Answer: {answer}")
    print(f"Expected Answer: {item.expected_output}")
    print(f"Retrieved Documents: {retrieved_doc_ids}")
    print(f"Expected Document: {expected_doc_id}")
    print(f"Retrieval Precision: {retrieval_precision}")
    print(f"Retrieval Recall: {retrieval_recall}")
    print(f"Correctness: {correctness}\n" + "-"*50)

    # Show generation for debugging, when retrieval was correct but answer was not
    if not correctness or ((correctness < .5) and (retrieval_recall > 0)):
        print(f"Generation: {generation}")

    print("-"*50)
    handler.trace.update(name="eval", 
                         input=question,
                         output=answer,
                         metadata={"generation": generation,
                                    "documents": output["documents"]})
    handler.trace.score(
        name="correctness",
        data_type="NUMERIC",
        value=correctness,
        comment=generation,  # reasoning
    )

    # make list of retrieved docs
    # add '+' prefix if the doc was the correct one
    # add no prefix if it was incorrect
    # if no correct docs identfied, add the correct one at the end with '-' prefix 
    doc_selection_display = ""
    for doc in retrieved_doc_ids:
        if doc == expected_doc_id:
            doc_selection_display += '+' + doc
        else:
            doc_selection_display += doc
        doc_selection_display += ", "
    if expected_doc_id not in retrieved_doc_ids:
        doc_selection_display = '-' + doc   
    
    handler.trace.score(
        name="retrieval_precision",
        data_type="NUMERIC",
        value=retrieval_precision,
        comment=doc_selection_display,
    )

    handler.trace.score(
        name="retrieval_recall",
        data_type="BOOLEAN",
        value=retrieval_recall,
        comment=doc_selection_display,
    )

    answer_correctness_scores.append(correctness)
    retrieval_precision_scores.append(retrieval_precision)
    retrieval_recall_scores.append(retrieval_recall)    

# Print the final average score in a formatted way
answer_correctness_scores_non_none = [c for c in answer_correctness_scores if c is not None]
mean_correctness_score = sum(answer_correctness_scores_non_none) / len(answer_correctness_scores_non_none)
mean_retrieval_precision_score = sum(retrieval_precision_scores) / len(retrieval_precision_scores)
mean_retrieval_recall_score = sum(retrieval_recall_scores) / len(retrieval_recall_scores)

print(f"{'='*50}")
print(f"\n{'='*50}\nAverage Correctness: {mean_correctness_score:.2f}")
print(f"Mean Retrieval Precision: {mean_retrieval_precision_score:.2f}")
print(f"Mean Retrieval Recall: {mean_retrieval_recall_score:.2f}")
print(f"{'='*50}")

# Flush the langfuse client to ensure all data is sent to the server at the end of the experiment run
langfuse.flush()


  0%|          | 0/124 [00:00<?, ?it/s]

Input: what was the difference in percentage cumulative total shareowners 2019 returns for united parcel service inc . versus the standard & poor 2019s 500 index for the five years ended 12/31/10?
Predicted Answer: 1.42%
Expected Answer: -1.42%
Retrieved Documents: ['Single_UPS/2010/page_33.pdf-4']
Expected Document: Single_UPS/2010/page_33.pdf-4
Retrieval Precision: 1.0
Retrieval Recall: 1.0
Correctness: -0.020100000000000007
--------------------------------------------------
Generation: Step 1: Identify the relevant data for United Parcel Service Inc. and the Standard & Poor's 500 index for the five years ended 12/31/10 from the provided documents.
The relevant data is as follows:
- United Parcel Service Inc.: $100.00 (12/31/05), $101.76 (12/31/06), $98.20 (12/31/07), $78.76 (12/31/08), $84.87 (12/31/09), $110.57 (12/31/10)
- Standard & Poor's 500 index: $100.00 (12/31/05), $115.79 (12/31/06), $122.16 (12/31/07), $76.96 (12/31/08), $97.33 (12/31/09), $111.99 (12/31/10)

Step 2: Calcu

1 validation error for ScoreBody
value
  none is not an allowed value (type=type_error.none.not_allowed)
Traceback (most recent call last):
  File "/home/yustee/.cache/pypoetry/virtualenvs/tomoroai-engineer-assignment-I67eNnCV-py3.12/lib/python3.12/site-packages/langfuse/client.py", line 2260, in score
    request = ScoreBody(**new_dict)
              ^^^^^^^^^^^^^^^^^^^^^
  File "/home/yustee/.cache/pypoetry/virtualenvs/tomoroai-engineer-assignment-I67eNnCV-py3.12/lib/python3.12/site-packages/pydantic/v1/main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 1 validation error for ScoreBody
value
  none is not an allowed value (type=type_error.none.not_allowed)


Error generating score (generated: <INPUT>
    <QUESTION>What is the)
Input: by how much did total other income and expense decrease from 2008 to 2009?
Predicted Answer: $294 million
Expected Answer: 47.4%
Retrieved Documents: ['Single_AAPL/2010/page_42.pdf-2', 'Double_AAPL/2010/page_42.pdf']
Expected Document: Single_AAPL/2010/page_42.pdf-2
Retrieval Precision: 0.5
Retrieval Recall: 1.0
Correctness: None
--------------------------------------------------
Generation: To calculate the decrease in total other income and expense from 2008 to 2009, we need to find the difference between the total other income and expense in 2008 and 2009.

From the table of contents, we can see that the total other income and expense for the three years ended September 25, 2010, are as follows (in millions):

20 10 2009 2008
interest income $ 311 $ 407 $ 653
other income ( expense ) net -156 ( 156 ) -81 ( 81 ) -33 ( 33 )
total other income and expense $ 155 $ 326 $ 620

The total other income and expense i

1 validation error for ScoreBody
value
  none is not an allowed value (type=type_error.none.not_allowed)
Traceback (most recent call last):
  File "/home/yustee/.cache/pypoetry/virtualenvs/tomoroai-engineer-assignment-I67eNnCV-py3.12/lib/python3.12/site-packages/langfuse/client.py", line 2260, in score
    request = ScoreBody(**new_dict)
              ^^^^^^^^^^^^^^^^^^^^^
  File "/home/yustee/.cache/pypoetry/virtualenvs/tomoroai-engineer-assignment-I67eNnCV-py3.12/lib/python3.12/site-packages/pydantic/v1/main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 1 validation error for ScoreBody
value
  none is not an allowed value (type=type_error.none.not_allowed)


Error generating score (generated: <INPUT>
    <QUESTION>What is the)
Input: by how much did asset retirement obligations decrease from 2007 to 2008?
Predicted Answer: $10,169 million
Expected Answer: -14.9%
Retrieved Documents: ['Single_MRO/2008/page_135.pdf-1', 'Double_MRO/2008/page_135.pdf']
Expected Document: Single_MRO/2008/page_135.pdf-1
Retrieval Precision: 0.5
Retrieval Recall: 1.0
Correctness: None
--------------------------------------------------
Generation: To calculate the decrease in asset retirement obligations from 2007 to 2008, we need to find the difference between the asset retirement obligations as of December 31, 2007, and the asset retirement obligations as of January 1, 2008.

From the provided table, we can see that the asset retirement obligations as of December 31, 2007, were $11,134 million, and as of January 1, 2008, they were $11,344 million.

However, we are interested in the decrease from 2007 to 2008, so we need to look at the change from December 31, 20

In [5]:
output

{'messages': [HumanMessage(content='what was the percentage change in the net cash from operating activities from 2008 to 2009', id='f9a73b7b-74a5-4ff8-a57e-48035eb68088'),
  AIMessage(content='Step 1: Identify the relevant data for the question\nThe relevant data for the question is the net cash from operating activities for the years 2008 and 2009, which is provided in the table at the end of the document.\n\nStep 2: Extract the data for 2008 and 2009\nFrom the table, the net cash from operating activities for 2008 is $18,001 and for 2009 is $20,658.\n\nStep 3: Calculate the percentage change\nTo calculate the percentage change, we use the formula: ((New Value - Old Value) / Old Value) * 100.\nPlugging in the values, we get: ((20,658 - 18,001) / 18,001) * 100.\n\nStep 4: Perform the calculation\nPerforming the calculation, we get: (2,657 / 18,001) * 100 = 14.75%.\n\nStep 5: Round the answer\nRounding the answer to two decimal places, we get: 14.75%.\n\n<ANSWER>14.75%</ANSWER>', id='f