# Build and evaluate document extraction 🦜⛓️



### Install dependencies

In [1]:
%%capture --no-stderr
%pip install langsmith langchain-openai langchain-core langchain-community pydantic python-dotenv openai
%pip install --upgrade langsmith

In [None]:
import langsmith

# Show which langsmith version is active in this kernel (the install cell
# above runs `%pip install --upgrade langsmith`).
print(f"\nCurrent langsmith version: {langsmith.__version__}")

### Load environment variables

In [None]:
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI and LangSmith API keys
# used by the clients created below -- confirm against your .env.
load_dotenv()

### Load the 10-K

In [4]:
from langchain_community.document_loaders import PyPDFLoader

def load_pdf(path="./aapl.pdf"):
    """Load a PDF with ``PyPDFLoader`` and return the loaded documents.

    Args:
        path: Location of the PDF to read. Defaults to ``"./aapl.pdf"`` so
            existing ``load_pdf()`` calls keep loading the same file.

    Returns:
        Whatever ``PyPDFLoader(path).load()`` returns (per the LangChain
        documentation, a list of ``Document`` objects, one per page).
    """
    loader = PyPDFLoader(path)
    return loader.load()

### Perform extraction

In [None]:
from pydantic import BaseModel, Field
from langsmith import wrappers, Client
from openai import OpenAI
# Shared OpenAI client, passed through LangSmith's `wrap_openai` wrapper
# (per the LangSmith docs this makes calls through the client traceable).
openai_client = wrappers.wrap_openai(OpenAI())

# Structured-output schema handed to the OpenAI `parse` calls as
# `response_format`. Every field is a list of strings -- including the
# single-valued ones (EIN, address, EPS, net income) -- which matches the
# extraction prompt's rule that all fields must be lists.
# NOTE: intentionally documented with `#` comments rather than a class
# docstring; pydantic would copy a docstring into the generated JSON schema.
class UsefulInformation(BaseModel):
    # Company offerings (the extraction prompt caps this at 10 items).
    products_and_services: list[str] = Field(description="A list of products and services provided by the company")
    # Risks named in the filing (the extraction prompt caps this at 10 items).
    risk_factors: list[str] = Field(description="A list of risk factors described in the document")
    # Employer Identification Number(s) found in the text.
    irs_employer_id_number: list[str] = Field(description="The IRS Employer Identification Number of the company")
    # Company address as stated in the filing.
    company_address: list[str] = Field(description="The address of the company")
    # Basic EPS figure(s), kept as strings.
    earnings_per_share_basic: list[str] = Field(description="The basic earnings per share of the company")
    # Net income figure(s), kept as strings.
    net_income: list[str] = Field(description="The net income of the company")

def extract_information(doc):
    """Extract structured 10-K facts from ``doc`` with one structured-output call.

    Args:
        doc: Text of the 10-K excerpt. It is interpolated into the prompt
            with an f-string, so a non-string value is rendered via ``str()``.

    Returns:
        The content of the first completion choice, i.e. the JSON text the
        model produced for the ``UsefulInformation`` schema. If the call
        raises, the error is printed and an all-empty ``UsefulInformation``
        is returned *serialized as JSON*, so callers receive a string on
        both the success and the failure path.
    """
    prompt = f"""
    The text below is an excerpt from a 10-K report. You must extract specific information and return it in a structured format.
    
    CRITICAL INSTRUCTIONS:
    1. AVOID DUPLICATES: Never include duplicate items in any list
    2. BE CONCISE: Keep each item brief and to the point
    3. VALIDATE: Each piece of information must be explicitly stated in the text, do not make assumptions
    4. FORMAT: All fields must be lists, even if empty or single item
    
    Examples of GOOD responses:
    - Products: ["Google Search", "Google Cloud", "Android"]
    - Address: ["1600 Amphitheatre Parkway, Mountain View, CA 94043"]
    - Phone: ["+1 650-253-0000"]
    
    Examples of BAD responses (avoid these):
    - Duplicates: ["Google Search", "Search by Google", "Google Search Engine"]
    - Too verbose: ["Google Search is a web search engine that allows users to search the World Wide Web..."]
    - Made up data: Do not include information unless explicitly found in the text
    
    Please extract:
    1. Products and Services: List unique products/services (max 10 items)
    2. Risk Factors: List unique, critical risks (max 10 items)
    3. IRS Employer ID Number: List any EIN found
    4. Company Address: List primary address of the company
    5. Earnings Per Share (Basic): List basic EPS figure
    6. Net Income: List net income figure

    Text from the 10-K report:
    {doc}
    """
    try:
        response = openai_client.beta.chat.completions.parse(
            model="o1-2024-12-17",
            messages=[
                {"role": "user", "content": prompt},
            ],
            response_format=UsefulInformation,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: report the failure and fall back to an
        # empty extraction instead of aborting the notebook run.
        print(f"Error in structured output LLM call: {e}")
        print(f"Error type: {type(e)}")
        empty_result = UsefulInformation(
            products_and_services=[],
            risk_factors=[],
            irs_employer_id_number=[],
            company_address=[],
            earnings_per_share_basic=[],
            net_income=[],
        )
        # Serialize so the fallback has the same type (JSON text) as the
        # success path, which returns the message content string.
        return empty_result.model_dump_json()

def process_all_docs():
    """Load the PDF, run the extraction over its text, and return the result.

    ``load_pdf()`` returns loader documents rather than plain text. Passing
    that list straight to ``extract_information`` would embed the Python
    repr of the objects (metadata, escaped newlines) in the prompt, so the
    page texts are joined into a single string first.

    Returns:
        Whatever ``extract_information`` returns for the combined text.
    """
    pages = load_pdf()
    # One blank line between pages keeps page boundaries visible in the prompt.
    full_text = "\n\n".join(page.page_content for page in pages)
    results = extract_information(full_text)
    print("processed all docs...")
    return results

# Run the extraction once at cell execution time and display what came back.
aggregated_info = process_all_docs()
print(aggregated_info)

### Evaluate extraction

<img src="./extraction-eval.png" alt="extraction-eval" width="600">

##### Load existing dataset

In [6]:
# Name of the existing LangSmith dataset that holds the evaluation examples.
dataset_name = "10-k extraction"

##### Define application logic to be evaluated

In [7]:
from langsmith import traceable

# LangSmith client; its `evaluate` method drives the experiment further down.
client = Client()

@traceable
def target(inputs: dict) -> dict:
    """Run the extraction model on one dataset example.

    Args:
        inputs: Example inputs; the prompt text is read from
            ``inputs["input"][0]["content"]``.

    Returns:
        ``{"response": <content of the first completion choice>}``.
    """
    user_text = inputs["input"][0]["content"]
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[{"role": "user", "content": user_text}],
        response_format=UsefulInformation,
    )
    first_choice = completion.choices[0]
    return {"response": first_choice.message.content}

##### Define evaluator

In [8]:
import json

def format_objects_for_llm_judge(obj1, obj2):
    """Render two extraction results as readable text for the LLM judge.

    Args:
        obj1: The actual output, either a mapping or a JSON string of one.
        obj2: The reference output, either a mapping or a JSON string of one.

    Returns:
        A two-item list ``[actual_text, reference_text]``. Keys are emitted
        in sorted order with underscores replaced by spaces. A one-item list
        is shown inline, a longer list as ``  - `` bullets, and an empty
        list as ``(none)``.
    """
    def format_single_object(obj, object_name):
        # Model output arrives as JSON text; reference outputs may already
        # be parsed mappings.
        if isinstance(obj, str):
            obj = json.loads(obj)
        sections = [f"\n{object_name} contains the following information:"]
        for key in sorted(obj):
            values = obj[key]
            readable_key = key.replace("_", " ").capitalize()
            if not isinstance(values, list):
                sections.append(f"\n{readable_key}: {values}")
            elif not values:
                # Avoid a dangling "  - " bullet for empty lists.
                sections.append(f"\n{readable_key}: (none)")
            elif len(values) == 1:
                sections.append(f"\n{readable_key}: {values[0]}")
            else:
                # str() each item so numeric (or other non-string) values
                # do not make str.join raise TypeError.
                items = "\n  - ".join(str(value) for value in values)
                sections.append(f"\n{readable_key}:\n  - {items}")
        return "\n".join(sections)

    return [
        format_single_object(obj1, "Actual Output"),
        format_single_object(obj2, "Reference Output"),
    ]

@traceable(run_type="llm")
def run_llm_judge(formatted_text):
    """Ask gpt-4o whether the model's extraction matches the reference.

    Args:
        formatted_text: Two-item sequence; index 0 is the model output text
            and index 1 is the reference ground-truth text.

    Returns:
        ``{"response": verdict}`` where ``verdict`` is the judge's message
        content parsed from JSON (the ``Score`` fields ``accuracy`` and
        ``reason``).
    """
    class Score(BaseModel):
        """Evaluate how well an extracted output matches a reference ground truth for 10-K document information."""
        accuracy: bool = Field(
            description=(
                "A binary score (0 or 1) that indicates whether the model's extraction adequately matches the reference ground truth. "
                "Score 1 if the model's output captures the same essential business information as the reference extraction, even if "
                "expressed differently. The goal is to verify that the model successfully extracted similar key business information "
                "as found in the reference ground truth, not to ensure identical representation."
            )
        )
        reason: str = Field(
            description=(
                "An explanation of how well the model's extraction aligns with the reference ground truth. Consider how effectively "
                "the model captured the same key business information, financial data, and risk factors as the reference output. "
                "Acknowledge that variations in expression are acceptable as long as the same core information is captured."
            )
        )

    model_output_text = formatted_text[0]
    reference_text = formatted_text[1]

    # The whole rubric, plus both texts to compare, goes into one system message.
    system_prompt = (
        "You are evaluating how well a model's extraction of 10-K document information matches a reference ground truth output. "
        "Your task is to determine if the model successfully captured the same essential business information as the reference, "
        "understanding that similar concepts may be expressed differently.\n\n"
        "Context:\n"
        "- The reference output represents the ground truth extraction from a 10-K document\n"
        "- The model's output is being evaluated against this reference for accuracy and completeness\n"
        "- Both extractions contain key business information like products/services, risk factors, and financial metrics\n"
        "- The goal is to verify the model captured similar information as the reference, not identical expression\n\n"
        "Evaluation Guidelines:\n"
        "- Score 1 (true) if the model's output:\n"
        "  * Captures the same core business information as the reference\n"
        "  * Identifies similar risk factors, even if described differently\n"
        "  * Extracts matching or equivalent financial metrics\n"
        "  * Contains consistent company identifiers\n"
        "  * May include additional valid information beyond the reference\n\n"
        "- Score 0 (false) only if the model's output:\n"
        "  * Misses or contradicts critical information from the reference\n"
        "  * Shows fundamental misunderstanding of the business details\n"
        "  * Contains irreconcilable differences in key metrics\n"
        "  * Fails to capture the essential information found in the reference\n\n"
        "Remember: The reference output is our ground truth. Evaluate how well the model's extraction "
        "captures the same essential business information, allowing for variations in expression.\n\n"
        "Outputs to Evaluate:\n"
        f"- **Model Output:** {model_output_text}\n"
        f"- **Reference Ground Truth:** {reference_text}\n"
    )
    judge_messages = [{"role": "system", "content": system_prompt}]

    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=judge_messages,
        response_format=Score,
    )
    verdict = json.loads(completion.choices[0].message.content)
    return {"response": verdict}

@traceable
def evaluate_accuracy(outputs: dict, reference_outputs: dict) -> dict:
    """Score one run by having the LLM judge compare it with the reference.

    Args:
        outputs: Target output; the extraction is read from ``outputs["response"]``.
        reference_outputs: Dataset reference; the expected extraction is read
            from ``reference_outputs["output"]``.

    Returns:
        A dict with ``key`` fixed to ``"accuracy"``, ``score`` set to the
        judge's ``accuracy`` value, and ``reason`` set to its explanation.
    """
    judge_input = format_objects_for_llm_judge(
        outputs["response"],
        reference_outputs["output"],
    )
    verdict = run_llm_judge(judge_input)["response"]

    feedback = {"key": "accuracy"}
    feedback["score"] = verdict["accuracy"]
    feedback["reason"] = verdict["reason"]
    return feedback

##### Run evaluation

In [None]:
# Run `target` over the dataset and grade each run with `evaluate_accuracy`.
# The dataset is referenced through `dataset_name` (defined in the dataset
# cell) instead of repeating the literal, so the name lives in one place.
# NOTE(review): per the LangSmith docs, `num_repetitions=3` runs every example
# three times and `max_concurrency=5` caps parallel runs -- confirm against
# the installed langsmith version.
experiment_results = client.evaluate(
    target,
    data=dataset_name,
    evaluators=[evaluate_accuracy],
    experiment_prefix="10-k-extraction-gpt-4o",
    max_concurrency=5,
    num_repetitions=3,
)

# Last expression of the cell: shows the results as a pandas DataFrame.
experiment_results.to_pandas()