In [1]:
import os

import dotenv

# Load variables from .env BEFORE importing the project modules below.
# NOTE(review): langfuse_config / llm / graph are not visible here; if they
# read API keys or endpoints from the environment at import time, loading
# .env after importing them (the previous order) would be too late — confirm.
dotenv.load_dotenv()

from langfuse_config import langfuse, langfuse_handler
from llm import llm
from graph import graph



True

In [2]:
import pprint
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage, AIMessage

# Smoke test: push one question through the graph and dump what each node
# emits, so the intermediate state of the pipeline can be inspected by eye.
question_message = HumanMessage(
    'what was the percentage change in the net cash from operating activities from 2008 to 2009'
)
inputs = {"messages": [question_message]}
stream_config = {"callbacks": [langfuse_handler]}

for event in graph.stream(inputs, config=stream_config):
    # Each streamed event is a mapping of node name -> that node's output.
    for node_name in event:
        node_output = event[node_name]
        print(f"Output from node '{node_name}':")
        print("---")
        pprint.pprint(node_output, indent=2, width=80, depth=None)
    # Visual separator between consecutive events.
    print()
    print("---")
    print()

Output from node 'extract_question':
---
{ 'question': 'what was the percentage change in the net cash from operating '
              'activities from 2008 to 2009',
  'steps': ['extract_question']}

---

Output from node 'retriever':
---
{ 'documents': [ Document(metadata={'id': 'Single_INTC/2018/page_48.pdf-3', 'qa': "{'question': 'what was the percentage change in net cash provided by operating activities between 2017 and 2018?', 'answer': '33%', 'explanation': '', 'ann_table_rows': [1], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '29432', 'arg2': '22110', 'res': '7322'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '22110', 'res': '33%'}], 'program': 'subtract(29432, 22110), divide(#0, 22110)', 'gold_inds': {'table_1': 'years ended ( in millions ) the net cash provided by operating activities of dec 292018 is $ 29432 ; the net cash provided by operating activities of dec 302017 is $ 22110 ; the net cash provided by operating activities of dec 312016 is $ 21808 ;'}, 'exe_

In [3]:
from datetime import datetime

from tqdm.auto import tqdm

from prompts import eval_prompt_template

# Model used by the LLM-based answer grader; taken from the LLM_MODEL
# environment variable, falling back to "llama3.1" when it is unset.
MODEL_NAME = os.getenv("LLM_MODEL", "llama3.1")

# Langfuse dataset whose items are evaluated further down in this cell.
dataset = langfuse.get_dataset("convfinqa-train")

def retrieval_precision_score(predicted: list[str], expected: str) -> float:
    """
    Retrieval precision: relevant documents retrieved / documents retrieved.

    Only a single expected document is supported (the ConvFinQA setting), so
    the numerator is 1 when `expected` appears in `predicted` and 0 otherwise.

    Args:
        predicted: IDs of the retrieved documents.
        expected: ID of the single relevant document.

    Returns:
        A value in [0, 1]. 0.0 when nothing was retrieved (previously this
        case raised ZeroDivisionError).
    """
    if not predicted:
        # Nothing retrieved: precision is undefined; report 0.0 rather than crash.
        return 0.0

    return float(expected in predicted) / len(predicted)

def retrieval_recall_score(predicted: list[str], expected: str) -> float:
    """
    Retrieval recall: relevant documents retrieved / relevant documents.

    With exactly one relevant document (the ConvFinQA setting) this collapses
    to a hit-or-miss indicator.

    Args:
        predicted: IDs of the retrieved documents.
        expected: ID of the single relevant document.

    Returns:
        1.0 if `expected` is among `predicted`, otherwise 0.0.
    """
    hit = expected in predicted
    return 1.0 if hit else 0.0


def check_error_llm(input, predicted, expected):
    """
    Score a predicted answer against the expected answer.

    Cheap deterministic checks run first; the LLM grader is only consulted
    when they cannot decide.

    Args:
        input: The question that was asked (passed to the grading prompt).
        predicted: The answer produced by the application.
        expected: The reference answer.

    Returns:
        0 if `predicted` is empty but an answer was expected,
        -1 if an answer was given although `expected` is empty,
        1 if the answers are identical or numerically equal,
        otherwise the float parsed from the LLM grader's completion text.

    Raises:
        ValueError: if the LLM completion cannot be parsed as a float.
    """
    # Base cases, we don't need to use LLM for that
    if predicted == "" and expected != "":
        return 0
    if predicted != "" and expected == "":
        return -1
    if predicted == expected:
        return 1

    # Compare numbers, allowing for percent and dollar signs. Each answer is
    # read two ways — '%' as a x0.01 scale ("33%" -> 0.33) and '%' dropped
    # ("33%" -> 33.0) — so that "33%" matches both "0.33" and "33".
    try:
        expected_values = {
            float(expected.replace('%', 'e-2').replace("$", "")),
            float(expected.replace('%', '').replace("$", "")),
        }
        predicted_values = {
            float(predicted.replace('%', 'e-2').replace("$", "")),
            float(predicted.replace('%', '').replace("$", "")),
        }
    except (AttributeError, TypeError, ValueError):
        # Not plain numbers (or not strings): leave the decision to the LLM.
        pass
    else:
        # The previous code called set(a, b), which always raised TypeError
        # and was silently swallowed, so this shortcut never fired.
        if predicted_values & expected_values:
            return 1

    # Otherwise, use LLM
    prompt = eval_prompt_template.format(question=input, actual_answer=predicted, expected_answer=expected)
    out = llm.completions.create(model=MODEL_NAME, prompt=prompt, max_tokens=5)
    return float(out.choices[0].text)

# Per-item metric values, averaged once the loop has finished.
answer_correctness_scores = []
retrieval_precision_scores = []
retrieval_recall_scores = []

# Compute the run name ONCE so every dataset item is linked to the same run.
# Evaluating datetime.now() per item yields a new name whenever the second
# ticks over, scattering the items over many separate runs.
experiment_run_name = datetime.now().strftime('%Y%m%d%H%M%S')

for item in tqdm(dataset.items[:10]):
    # Make sure your application function is decorated with @observe decorator to automatically link the trace
    with item.observe(
        run_name=experiment_run_name,
        run_description="My first run",
        run_metadata={"model": MODEL_NAME},
    ) as trace_id:
        # run your @observe() decorated application on the dataset item input
        inputs = {
            "messages": [
                HumanMessage(item.input),
            ]
        }

        output = graph.invoke(inputs, config={"callbacks": [langfuse_handler]})
        answer = output['answer']
        generation = output['generation']

        retrieved_doc_ids = [doc.metadata['id'] for doc in output['documents']]
        assert all(retrieved_doc_ids), "Invalid document IDs"
        expected_doc_id = item.metadata['document']['id']

        retrieval_precision = retrieval_precision_score(retrieved_doc_ids, expected_doc_id)
        retrieval_recall = retrieval_recall_score(retrieved_doc_ids, expected_doc_id)

        # Evaluate the output to compare different runs more easily
        correctness = check_error_llm(item.input, answer, item.expected_output)

        # Print input, answer, expected output, and the score in a more readable format
        print(f"Input: {item.input}")
        # print(f"Predicted Answer: {answer}")
        print(f"Expected Document: {expected_doc_id}")
        print(f"Retrieved Documents: {retrieved_doc_ids}")
        # print(f"Expected Answer: {item.expected_output}")
        print(f"Retrieval Precision: {retrieval_precision}")
        print(f"Retrieval Recall: {retrieval_recall}")
        # print(f"Score: {correctness}\n" + "-"*50)

        # Show generation for debugging, when retrieval was correct but answer was not
        if (correctness < .5) and (retrieval_recall > 0):
            print(f"Generation: {generation}")

        # Log each metric under its own name AND its own value. The retrieval
        # scores were previously logged with the correctness value.
        for score_name, score_value in (
            ("correctness", correctness),
            ("retrieval_precision", retrieval_precision),
            ("retrieval_recall", retrieval_recall),
        ):
            langfuse.score(
                trace_id=trace_id,
                name=score_name,
                value=score_value,
                comment=generation,  # optional, useful to add reasoning
            )

        # Record each metric exactly once per item (the retrieval metrics used
        # to be appended twice).
        answer_correctness_scores.append(correctness)
        retrieval_precision_scores.append(retrieval_precision)
        retrieval_recall_scores.append(retrieval_recall)


def _mean(scores: list) -> float:
    """Arithmetic mean of `scores`; NaN when no items were evaluated."""
    return sum(scores) / len(scores) if scores else float("nan")


# Print the final average score in a formatted way
mean_correctness_score = _mean(answer_correctness_scores)
mean_retrieval_precision_score = _mean(retrieval_precision_scores)
mean_retrieval_recall_score = _mean(retrieval_recall_scores)

print(f"{'='*50}")
print(f"\n{'='*50}\nAverage Correctness: {mean_correctness_score:.2f}")
print(f"Mean Retrieval Precision: {mean_retrieval_precision_score:.2f}")
print(f"Mean Retrieval Recall: {mean_retrieval_recall_score:.2f}")
print(f"{'='*50}")

# Flush the langfuse client to ensure all data is sent to the server at the end of the experiment run
langfuse.flush()


  0%|          | 0/10 [00:00<?, ?it/s]

Input: what was the percentage decline in the operating earnings in 2007 of $ 37 million declined from $ 41
Expected Document: Single_IP/2007/page_32.pdf-3
Retrieved Documents: ['Single_RSG/2008/page_56.pdf-1', 'Single_INTC/2018/page_48.pdf-3', 'Single_IPG/2017/page_38.pdf-2', 'Single_IPG/2014/page_36.pdf-1', 'Single_IPG/2014/page_36.pdf-4']
Retrieval Precision: 0.0
Retrieval Recall: 0.0
Input: what was the percentage change in the net cash used in investing activities from 2006 to 2007
Expected Document: Double_MMM/2007/page_38.pdf
Retrieved Documents: ['Single_INTC/2018/page_48.pdf-3', 'Double_GIS/2018/page_39.pdf', 'Single_CB/2008/page_243.pdf-3', 'Single_SWKS/2010/page_105.pdf-2', 'Single_IPG/2014/page_36.pdf-4']
Retrieval Precision: 0.0
Retrieval Recall: 0.0
Input: in 2006 what was the ratio of the increase in tax payments in 2005 and 2006 to the decrease in cash
Expected Document: Double_MMM/2007/page_38.pdf
Retrieved Documents: ['Single_INTC/2018/page_48.pdf-3', 'Double_GIS/2018