# Summary

In [1]:
from pathlib import Path
import json
import pandas as pd

In [2]:
def load_evidence_index(dataset_path: Path) -> dict:
    """Build a lookup of gold evidence from a JSON-lines dataset file.

    Each non-blank line of ``dataset_path`` is parsed as one JSON object.
    The result maps ``(doc_name, question)`` to that row's ``evidence``
    value, or to an empty list when ``evidence`` is missing or falsy.
    If the same key appears more than once, the last row wins.

    Args:
        dataset_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        Dict keyed by ``(doc_name, question)`` tuples.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    idx = {}
    with dataset_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at EOF) are not records;
            # passing them to json.loads would raise.
            if not line:
                continue
            row = json.loads(line)
            key = (row.get("doc_name"), row.get("question"))
            idx[key] = row.get("evidence") or []
    return idx

# Build the (doc_name, question) -> evidence lookup from the dataset JSONL file.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists. Consider a configurable data directory.
dataset_path = Path("/home/moon/Desktop/Financial_Document_Analyzer/data/financebench/financebench_open_source.jsonl")
evidence_idx = load_evidence_index(dataset_path)

In [3]:

# Run log to analyse: one JSON object per line.
LOG_PATH = Path("/home/moon/Desktop/Financial_Document_Analyzer/data/logs/v1.jsonl")
assert LOG_PATH.exists(), f"Log file not found: {LOG_PATH}"

# Parse every non-blank line into a record, then build the frame in one go.
with LOG_PATH.open("r", encoding="utf-8") as fh:
    rows = [json.loads(entry) for entry in (raw.strip() for raw in fh) if entry]
df = pd.DataFrame(rows)
df

Unnamed: 0,doc_name,question,ground_truth,question_type,question_reasoning,answer,citations,hits,eval_classification,eval_reasoning
0,AMCOR_2022_8K_dated-2022-07-01,What was the key agenda of the AMCOR's 8k fili...,"Amcor Finance (USA), Inc. and Amcor Flexibles ...",novel-generated,,The key agenda of AMCOR's 8K filing dated 1st ...,"[{'i': 5, 'source_doc': 'AMCOR_2022_8K_dated-2...","[{'chunk_id': '28', 'source_doc': 'AMCOR_2022_...",CORRECT,The model answer accurately captures the suppl...
1,AMERICANEXPRESS_2022_10K,Which debt securities are registered to trade ...,There are none,domain-relevant,Information extraction,The provided sources do not explicitly list sp...,"[{'i': 1, 'source_doc': 'AMERICANEXPRESS_2022_...","[{'chunk_id': '943', 'source_doc': 'AMERICANEX...",CORRECT,Model answer correctly indicates that no debt ...
2,AMERICANEXPRESS_2022_10K,What are the geographies that American Express...,"United States, EMEA, APAC, and LACC",domain-relevant,Information extraction,American Express primarily operates in the Uni...,"[{'i': 1, 'source_doc': 'AMERICANEXPRESS_2022_...","[{'chunk_id': '1197', 'source_doc': 'AMERICANE...",DIFFERENT_ANSWER,The answer lists individual countries and the ...
3,AMERICANEXPRESS_2022_10K,Does AMEX have an improving operating margin p...,Performance is not measured through operating ...,domain-relevant,Numerical reasoning OR information extraction,The provided sources do not include the necess...,"[{'i': 8, 'source_doc': 'AMERICANEXPRESS_2022_...","[{'chunk_id': '492', 'source_doc': 'AMERICANEX...",CORRECT,The model answer correctly states that operati...
4,AMERICANEXPRESS_2022_10K,What drove gross margin change as of the FY202...,Performance is not measured through gross margin,domain-relevant,Logical reasoning (based on numerical reasonin...,The provided sources do not explicitly discuss...,"[{'i': 1, 'source_doc': 'AMERICANEXPRESS_2022_...","[{'chunk_id': '492', 'source_doc': 'AMERICANEX...",CORRECT,Model answer correctly states that gross margi...
5,AMERICANEXPRESS_2022_10K,How much has the effective tax rate of America...,The effective tax rate for American Express ha...,domain-relevant,Numerical reasoning,The effective tax rate of American Express dec...,"[{'i': 1, 'source_doc': 'AMERICANEXPRESS_2022_...","[{'chunk_id': '275', 'source_doc': 'AMERICANEX...",CORRECT,The model answer correctly states the 3‑point ...
6,AMERICANEXPRESS_2022_10K,What was the largest liability in American Exp...,Customer deposits,novel-generated,,The provided sources do not explicitly state t...,"[{'i': 6, 'source_doc': 'AMERICANEXPRESS_2022_...","[{'chunk_id': '24', 'source_doc': 'AMERICANEXP...",INCORRECT,The model answer incorrectly claims the larges...
7,AMERICANEXPRESS_2022_10K,Was American Express able to retain card membe...,Yes,novel-generated,,American Express reported having 76.7 million ...,"[{'i': 1, 'source_doc': 'AMERICANEXPRESS_2022_...","[{'chunk_id': '24', 'source_doc': 'AMERICANEXP...",PARTIALLY_CORRECT,The model answer hints at retention but does n...
8,BESTBUY_2024Q2_10Q,Was there any drop in Cash & Cash equivalents ...,"Yes, there was a decline of ~42% between FY202...",novel-generated,,The provided sources do not include the cash a...,[],"[{'chunk_id': '165', 'source_doc': 'BESTBUY_20...",INCORRECT,The model answer incorrectly states that the d...
9,BESTBUY_2024Q2_10Q,Was there any change in the number of Best Buy...,"Yes, there is decline in number stores by 1.32...",novel-generated,,There was a decrease of 11 Best Buy stores bet...,"[{'i': 2, 'source_doc': 'BESTBUY_2024Q2_10Q', ...","[{'chunk_id': '151', 'source_doc': 'BESTBUY_20...",INCORRECT,The model provides incorrect store counts (966...


In [4]:
# preview_cols = ['doc_name', 'question', 'question_type', 'ground_truth', 'answer', 'eval_classification', 'reasoning', 'evidence', 'citations']
# df_preview = df[preview_cols].copy()
# df_preview['question_preview'] = df_preview['question'].astype(str).str.slice(0, 200)
# df_preview['answer_preview'] = df_preview['answer'].astype(str).str.slice(0, 200)
# df_preview = df_preview[['doc_name', 'question', 'question_type', 'ground_truth', 'answer', 'eval_classification', 'reasoning', 'evidence', 'citations']]
# html_path = LOG_PATH.with_suffix('.html')
# df_preview.to_html(html_path, index=False, escape=False)
# print(f'Saved preview table to {html_path}')

In [5]:
# Evaluation breakdown: share of each evaluator verdict overall,
# followed by a question_type x verdict count table.
total = len(df)
cat_counts = df['eval_classification'].value_counts().to_dict()
print(f'Total questions: {total}')
for cat in ('CORRECT', 'PARTIALLY_CORRECT', 'DIFFERENT_ANSWER', 'INCORRECT', 'NO_ANSWER'):
    count = cat_counts.get(cat, 0)
    # Guard against an empty log so the percentage never divides by zero.
    if total:
        pct = count / total * 100
    else:
        pct = 0
    print(f'{cat:<20}: {count:<3} ({pct:.2f}%)')

print('\nDistribution by question_type and eval_category:')
pivot = df.pivot_table(
    values='question',
    index='question_type',
    columns='eval_classification',
    aggfunc='count',
    fill_value=0,
    margins=True,
).reset_index()
pivot.columns.name = None   # drop the column-axis header label
display(pivot)


Total questions: 30
CORRECT             : 14  (46.67%)
PARTIALLY_CORRECT   : 3   (10.00%)
DIFFERENT_ANSWER    : 1   (3.33%)
INCORRECT           : 11  (36.67%)
NO_ANSWER           : 1   (3.33%)

Distribution by question_type and eval_category:


Unnamed: 0,question_type,CORRECT,DIFFERENT_ANSWER,INCORRECT,NO_ANSWER,PARTIALLY_CORRECT,All
0,domain-relevant,7,1,4,0,0,12
1,metrics-generated,0,0,2,0,0,2
2,novel-generated,7,0,5,1,3,16
3,All,14,1,11,1,3,30


# CORRECT

In [6]:
# Rows the evaluator labelled CORRECT, shown with the key comparison columns.
correct = df.loc[df['eval_classification'].eq("CORRECT")]
print(f'CORRECT: {len(correct)}')
correct[['doc_name', 'question', 'question_type', 'answer', 'ground_truth']]

CORRECT: 14


Unnamed: 0,doc_name,question,question_type,answer,ground_truth
0,AMCOR_2022_8K_dated-2022-07-01,What was the key agenda of the AMCOR's 8k fili...,novel-generated,The key agenda of AMCOR's 8K filing dated 1st ...,"Amcor Finance (USA), Inc. and Amcor Flexibles ..."
1,AMERICANEXPRESS_2022_10K,Which debt securities are registered to trade ...,domain-relevant,The provided sources do not explicitly list sp...,There are none
3,AMERICANEXPRESS_2022_10K,Does AMEX have an improving operating margin p...,domain-relevant,The provided sources do not include the necess...,Performance is not measured through operating ...
4,AMERICANEXPRESS_2022_10K,What drove gross margin change as of the FY202...,domain-relevant,The provided sources do not explicitly discuss...,Performance is not measured through gross margin
5,AMERICANEXPRESS_2022_10K,How much has the effective tax rate of America...,domain-relevant,The effective tax rate of American Express dec...,The effective tax rate for American Express ha...
10,BESTBUY_2024Q2_10Q,Which Best Buy product category performed the ...,novel-generated,The Entertainment product category performed t...,The entertainment segment experienced the high...
14,BOEING_2022_10K,Who are the primary customers of Boeing as of ...,domain-relevant,The primary customers of Boeing as of FY2022 a...,Boeing's primary customers as of FY2022 are a ...
16,BOEING_2022_10K,What production rate changes is Boeing forecas...,novel-generated,Boeing forecasts production rate changes for s...,Boeing forecasts an increase in the production...
18,JOHNSON_JOHNSON_2023_8K_dated-2023-08-30,Which business segment of JnJ will be treated ...,novel-generated,The Consumer Health business segment of Johnso...,The Consumer Health business segment will be t...
19,JOHNSON_JOHNSON_2023_8K_dated-2023-08-30,What is the amount of the gain accruing to JnJ...,novel-generated,The gain accruing to Johnson & Johnson as a re...,JnJ will make a gain of approximately $20 bill...


In [7]:
# Inspect the first CORRECT record: question, both answers, gold evidence,
# the chunks the answer cited, and the retrieved top-k hits.
idx = correct.index[0] if len(correct) else None
if idx is not None:
    rec = correct.loc[idx]
    print(f"Doc          : {rec['doc_name']}")
    print(f"Question     : {rec['question']}")
    print('\nGround Truth :', rec['ground_truth'])
    print('\nGenerated Answer :', rec['answer'])

    # Evidence from dataset, looked up by the (doc_name, question) key.
    evid_key = (rec.get("doc_name"), rec.get("question"))
    evidences = evidence_idx.get(evid_key, [])
    print("\nEvidence:")
    for i, ev in enumerate(evidences, start=1):
        text = ev.get("evidence_text")
        print(f"Evidence {i}: {text}\n")

    print('\nChunk Used:')
    for citation in rec['citations']:
        # `!s` converts to str before applying the width, so a missing or
        # None index prints a placeholder instead of raising TypeError.
        print(f"  {citation.get('i', '-')!s:<2} | {citation.get('text')}")

    print('\nTop-K:')
    for i, hit in enumerate(rec['hits'], 1):
        chunk_type = hit.get('type')
        # Flatten newlines so each hit stays on a single output line.
        text_preview = (hit.get('text') or '').replace('\n', ' ')
        print(f"  {i:<2} | {text_preview} [{chunk_type}]")
else:
    print('No CORRECT cases available.')


Doc          : AMCOR_2022_8K_dated-2022-07-01
Question     : What was the key agenda of the AMCOR's 8k filing dated 1st July 2022?

Ground Truth : Amcor Finance (USA), Inc. and Amcor Flexibles North America, Inc., entered into supplemental indentures relating to Guaranteed Senior Notes due 2026 and 2028. This involved the substitution of the Substitute Issuer (Amcor Flexibles North America) for the Former Issuer (Amcor Finance) and the assumption of covenants under the indentures. (In essence a novation agreement)

Generated Answer : The key agenda of AMCOR's 8K filing dated 1st July 2022 was to report the execution of the Second Supplemental Indenture and First Supplemental Indenture, which facilitated the substitution of Amcor Flexibles North America, Inc. (the Substitute Issuer) for Amcor Finance (USA), Inc. (the Former Issuer) as the issuer of the Existing Notes. This involved the Substitute Issuer assuming the covenants of the Former Issuer under the Indentures governing the 2026 

# PARTIALLY_CORRECT

In [8]:
# Rows the evaluator labelled PARTIALLY_CORRECT, shown with the key comparison columns.
partial = df.loc[df['eval_classification'].eq("PARTIALLY_CORRECT")]
print(f'PARTIALLY_CORRECT: {len(partial)}')
partial[['doc_name', 'question', 'question_type', 'answer', 'ground_truth']]

PARTIALLY_CORRECT: 3


Unnamed: 0,doc_name,question,question_type,answer,ground_truth
7,AMERICANEXPRESS_2022_10K,Was American Express able to retain card membe...,novel-generated,American Express reported having 76.7 million ...,Yes
15,BOEING_2022_10K,Is Boeing's business subject to cyclicality?,novel-generated,The provided sources do not explicitly state w...,"Yes, Boeing's business is subject to cyclicali..."
27,ULTABEAUTY_2023Q4_EARNINGS,What drove the increase in Ulta Beauty's merch...,novel-generated,The increase in Ulta Beauty's merchandise inve...,Increase in Merchandise inventories balance wa...


In [9]:
# Inspect the first PARTIALLY_CORRECT record: question, both answers, gold
# evidence, the chunks the answer cited, and the retrieved top-k hits.
idx = partial.index[0] if len(partial) else None
if idx is not None:
    rec = partial.loc[idx]
    print(f"Doc          : {rec['doc_name']}")
    print(f"Question     : {rec['question']}")
    print('\nGround Truth :', rec['ground_truth'])
    print('\nGenerated Answer :', rec['answer'])

    # Evidence from dataset, looked up by the (doc_name, question) key.
    evid_key = (rec.get("doc_name"), rec.get("question"))
    evidences = evidence_idx.get(evid_key, [])
    print("\nEvidence:")
    for i, ev in enumerate(evidences, start=1):
        text = ev.get("evidence_text")
        print(f"Evidence {i}: {text}\n")

    print('\nChunk Used:')
    for citation in rec['citations']:
        # `!s` converts to str before applying the width, so a missing or
        # None index prints a placeholder instead of raising TypeError.
        print(f"  {citation.get('i', '-')!s:<2} | {citation.get('text')}")

    print('\nTop-K:')
    for i, hit in enumerate(rec['hits'], 1):
        chunk_type = hit.get('type')
        # Flatten newlines so each hit stays on a single output line.
        text_preview = (hit.get('text') or '').replace('\n', ' ')
        print(f"  {i:<2} | {text_preview} [{chunk_type}]")
else:
    # Name the category this cell actually inspects.
    print('No PARTIALLY_CORRECT cases available.')


Doc          : AMERICANEXPRESS_2022_10K
Question     : Was American Express able to retain card members during 2022?

Ground Truth : Yes

Generated Answer : American Express reported having 76.7 million proprietary cards in force worldwide as of December 31, 2022 [1], which suggests they maintained their card member base during the year. However, the sources do not explicitly state retention rates or attrition metrics, nor do they confirm whether the number of card members increased, decreased, or remained stable compared to prior periods. The company did mention retaining existing card members as part of its growth initiatives [6], but this is a strategic goal rather than a confirmed outcome.

Evidence:
Evidence 1: Net card fees increased 17 percent year over-year, as new card acquisitions reached record levels in 2022 and Card Member
retention remained high, demonstrating the impact of investments we have made in our premium value propositions


Chunk Used:
  1  | For the year ended 

# DIFFERENT_ANSWER

In [10]:
# Rows the evaluator labelled DIFFERENT_ANSWER, shown with the key comparison columns.
different = df.loc[df['eval_classification'].eq("DIFFERENT_ANSWER")]
print(f'DIFFERENT_ANSWER: {len(different)}')
different[['doc_name', 'question', 'question_type', 'answer', 'ground_truth']]

DIFFERENT_ANSWER: 1


Unnamed: 0,doc_name,question,question_type,answer,ground_truth
2,AMERICANEXPRESS_2022_10K,What are the geographies that American Express...,domain-relevant,American Express primarily operates in the Uni...,"United States, EMEA, APAC, and LACC"


In [11]:
# Inspect the first DIFFERENT_ANSWER record: question, both answers, gold
# evidence, the chunks the answer cited, and the retrieved top-k hits.
idx = different.index[0] if len(different) else None
if idx is not None:
    rec = different.loc[idx]
    print(f"Doc          : {rec['doc_name']}")
    print(f"Question     : {rec['question']}")
    print('\nGround Truth :', rec['ground_truth'])
    print('\nGenerated Answer :', rec['answer'])

    # Evidence from dataset, looked up by the (doc_name, question) key.
    evid_key = (rec.get("doc_name"), rec.get("question"))
    evidences = evidence_idx.get(evid_key, [])
    print("\nEvidence:")
    for i, ev in enumerate(evidences, start=1):
        text = ev.get("evidence_text")
        print(f"Evidence {i}: {text}\n")

    print('\nChunk Used:')
    for citation in rec['citations']:
        # `!s` converts to str before applying the width, so a missing or
        # None index prints a placeholder instead of raising TypeError.
        print(f"  {citation.get('i', '-')!s:<2} | {citation.get('text')}")

    print('\nTop-K:')
    for i, hit in enumerate(rec['hits'], 1):
        chunk_type = hit.get('type')
        # Flatten newlines so each hit stays on a single output line.
        text_preview = (hit.get('text') or '').replace('\n', ' ')
        print(f"  {i:<2} | {text_preview} [{chunk_type}]")
else:
    # Name the category this cell actually inspects.
    print('No DIFFERENT_ANSWER cases available.')


Doc          : AMERICANEXPRESS_2022_10K
Question     : What are the geographies that American Express primarily operates in as of 2022?

Ground Truth : United States, EMEA, APAC, and LACC

Generated Answer : American Express primarily operates in the United States, Mexico, Ireland, Jersey, Netherlands Antilles, Japan, Malaysia, Argentina, France, Netherlands, Sweden, Italy, Canada, United Kingdom, Spain, India, Switzerland, China, Brazil, Indonesia, Chile, Australia, New Zealand, Bahrain, Saudi Arabia, Oman, Qatar, Egypt, Germany, Poland, Austria, and other countries and territories. As of 2022, they operated in approximately 103 countries and territories through their payments network [5].

Evidence:
Evidence 1: (Millions)
United States
EMEA
APAC
LACC
Other Unallocated
Consolidated
2022
Total revenues net of interest expense
$
41,396 
$
4,871 
$
3,835 
$
2,917 
$
(157)
$
52,862 
Pretax income (loss) from continuing operations
10,383 
550 
376 
500 
(2,224)
9,585 
2021
Total revenues n

# INCORRECT

In [12]:
# Rows the evaluator labelled INCORRECT, shown with the key comparison columns.
incorrect = df.loc[df['eval_classification'].eq("INCORRECT")]
print(f'INCORRECT: {len(incorrect)}')
incorrect[['doc_name', 'question', 'question_type', 'answer', 'ground_truth']]

INCORRECT: 11


Unnamed: 0,doc_name,question,question_type,answer,ground_truth
6,AMERICANEXPRESS_2022_10K,What was the largest liability in American Exp...,novel-generated,The provided sources do not explicitly state t...,Customer deposits
8,BESTBUY_2024Q2_10Q,Was there any drop in Cash & Cash equivalents ...,novel-generated,The provided sources do not include the cash a...,"Yes, there was a decline of ~42% between FY202..."
9,BESTBUY_2024Q2_10Q,Was there any change in the number of Best Buy...,novel-generated,There was a decrease of 11 Best Buy stores bet...,"Yes, there is decline in number stores by 1.32..."
11,BOEING_2022_10K,Are there any product categories / service cat...,domain-relevant,"Yes, Boeing's Commercial Airplanes segment rep...",Yes. Boeing has product and service categories...
12,BOEING_2022_10K,Has Boeing reported any materially important o...,domain-relevant,"Based on the provided sources, there is no exp...",Yes. Multiple lawsuits have been filed against...
13,BOEING_2022_10K,Does Boeing have an improving gross margin pro...,domain-relevant,The provided sources do not include gross marg...,Yes. Boeing has an improving gross margin prof...
17,BOEING_2022_10K,How does Boeing's effective tax rate in FY2022...,novel-generated,Boeing's effective tax rate in FY2022 was 0.6%...,"Effective tax rate in FY2022 was 0.62%, compar..."
23,PEPSICO_2022_10K,What is the quantity of restructuring costs di...,domain-relevant,The quantity of restructuring costs directly o...,Pepsico's restructuring costs in FY2022 amount...
24,PEPSICO_2022_10K,What is the FY2022 unadjusted EBITDA less cape...,metrics-generated,The calculation of unadjusted EBITDA requires ...,$9068.00
25,PEPSICO_2022_10K,What is the FY2022 unadjusted EBITDA % margin ...,metrics-generated,The provided sources do not include the necess...,16.5%


In [13]:
# Inspect the first INCORRECT record: question, both answers, gold evidence,
# the chunks the answer cited, and the retrieved top-k hits.
idx = incorrect.index[0] if len(incorrect) else None
if idx is not None:
    rec = incorrect.loc[idx]
    print(f"Doc          : {rec['doc_name']}")
    print(f"Question     : {rec['question']}")
    print('\nGround Truth :', rec['ground_truth'])
    print('\nGenerated Answer :', rec['answer'])

    # Evidence from dataset, looked up by the (doc_name, question) key.
    evid_key = (rec.get("doc_name"), rec.get("question"))
    evidences = evidence_idx.get(evid_key, [])
    print("\nEvidence:")
    for i, ev in enumerate(evidences, start=1):
        text = ev.get("evidence_text")
        print(f"Evidence {i}: {text}\n")

    print('\nChunk Used:')
    for citation in rec['citations']:
        # `!s` converts to str before applying the width, so a missing or
        # None index prints a placeholder instead of raising TypeError.
        print(f"  {citation.get('i', '-')!s:<2} | {citation.get('text')}")

    print('\nTop-K:')
    for i, hit in enumerate(rec['hits'], 1):
        chunk_type = hit.get('type')
        # Flatten newlines so each hit stays on a single output line.
        text_preview = (hit.get('text') or '').replace('\n', ' ')
        print(f"  {i:<2} | {text_preview} [{chunk_type}]")
else:
    # Name the category this cell actually inspects.
    print('No INCORRECT cases available.')


Doc          : AMERICANEXPRESS_2022_10K
Question     : What was the largest liability in American Express's Balance Sheet in 2022?

Ground Truth : Customer deposits

Generated Answer : The provided sources do not explicitly state the largest liability in American Express's balance sheet for 2022. While sources mention financial liabilities and capital ratios [6, 9], they do not detail specific liability line items or their magnitudes. The absence of a visible table or numerical breakdown of liabilities prevents identifying the largest liability [9].

Evidence:
Evidence 1: CONSOLIDATED BALANCE SHEETS
December 31 (Millions, except share data)
2022
2021
Assets
Cash and cash equivalents
Cash and due from banks (includes restricted cash of consolidated variable interest entities: 2022, $5; 2021, $11)
$
5,510 
$
1,292 
Interest-bearing deposits in other banks (includes securities purchased under resale agreements: 2022, $318; 2021, $463)
28,097 
20,548 
Short-term investment securities (incl

# NO_ANSWER

In [14]:
# Rows the evaluator labelled NO_ANSWER, shown with the key comparison columns.
no_answer = df.loc[df['eval_classification'].eq("NO_ANSWER")]
print(f'NO_ANSWER: {len(no_answer)}')
no_answer[['doc_name', 'question', 'question_type', 'answer', 'ground_truth']]

NO_ANSWER: 1


Unnamed: 0,doc_name,question,question_type,answer,ground_truth
29,ULTABEAUTY_2023Q4_EARNINGS,Did Ulta Beauty's wages expense as a percent o...,novel-generated,The necessary information to calculate Ulta Be...,Wages expense as a percent of net sales increa...


In [15]:
# Inspect the first NO_ANSWER record: question, both answers, gold evidence,
# the chunks the answer cited, and the retrieved top-k hits.
idx = no_answer.index[0] if len(no_answer) else None
if idx is not None:
    rec = no_answer.loc[idx]
    print(f"Doc          : {rec['doc_name']}")
    print(f"Question     : {rec['question']}")
    print('\nGround Truth :', rec['ground_truth'])
    print('\nGenerated Answer :', rec['answer'])

    # Evidence from dataset, looked up by the (doc_name, question) key.
    evid_key = (rec.get("doc_name"), rec.get("question"))
    evidences = evidence_idx.get(evid_key, [])
    print("\nEvidence:")
    for i, ev in enumerate(evidences, start=1):
        text = ev.get("evidence_text")
        print(f"Evidence {i}: {text}\n")

    print('\nChunk Used:')
    for citation in rec['citations']:
        # `!s` converts to str before applying the width, so a missing or
        # None index prints a placeholder instead of raising TypeError.
        print(f"  {citation.get('i', '-')!s:<2} | {citation.get('text')}")

    print('\nTop-K:')
    for i, hit in enumerate(rec['hits'], 1):
        chunk_type = hit.get('type')
        # Flatten newlines so each hit stays on a single output line.
        text_preview = (hit.get('text') or '').replace('\n', ' ')
        print(f"  {i:<2} | {text_preview} [{chunk_type}]")
else:
    # Name the category this cell actually inspects.
    print('No NO_ANSWER cases available.')


Doc          : ULTABEAUTY_2023Q4_EARNINGS
Question     : Did Ulta Beauty's wages expense as a percent of net sales increase or decrease in FY2023?

Ground Truth : Wages expense as a percent of net sales increased in FY2023. The answer here assumes FY2023 refers to the 12 months ended on January 28, 2023 (although the company refers to this period as its fiscal 2022.

Generated Answer : The necessary information to calculate Ulta Beauty's wages expense as a percentage of net sales for FY2023 is not available in the provided sources. The sources mention financial milestones (e.g., revenue, net income) but do not include specific figures for wages expense or net sales required for the calculation.

Evidence:
Evidence 1: For the Full Year of Fiscal 2022
Net sales increased 18.3% to $10.2 billion compared to $8.6 billion in fiscal 2021,
primarily due to the favorable impact from the continued resilience of the beauty
category, retail price increases, the impact of new brands and product inn