source: https://langchain-ai.github.io/langchain-benchmarks/notebooks/retrieval/semi_structured_benchmarking/ss_eval_chunk_sizes.html

### Pre-requisites

In [2]:
import getpass
import os

os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
env_vars = ["LANGCHAIN_API_KEY", "OPENAI_API_KEY", "FIREWORKS_API_KEY"]
for var in env_vars:
    if var not in os.environ:
        os.environ[var] = getpass.getpass(prompt=f"Enter your {var}: ")

Enter your LANGCHAIN_API_KEY: ········
Enter your FIREWORKS_API_KEY: ········


### Dataset
Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion.

In [4]:
import os

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task
task = registry["Semi-structured Reports"]

# Files used
paths = list(get_file_names())
files = [str(p) for p in paths]

File /Users/r337555/anaconda3/envs/langchain/lib/python3.11/site-packages/langchain_benchmarks/rag/tasks/semi_structured_reports/indexing/semi_structured_earnings.zip does not exist. Downloading from GCS...
File https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/semi_structured_earnings.zip downloaded.


Clone the dataset so that it’s available in our LangSmith datasets.



In [5]:
clone_public_dataset(task.dataset_id, dataset_name=task.name)

  0%|          | 0/30 [00:00<?, ?it/s]

Finished fetching examples. Creating dataset...
New dataset created you can access it at https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d.
Done creating dataset.


### Load and index
We load each file, split it, embed with OpenAIEmbeddings, and create an index with Chroma vectorstore.

In [6]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.chat_models import ChatFireworks
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


def load_and_split(file, token_count, split_document=True):
    """
    Load and optionally split PDF files.

    Args:
        file (str): File path.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting or returning pages.
    """

    loader = PyPDFLoader(file)
    pdf_pages = loader.load()

    if split_document:
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=token_count, chunk_overlap=50
        )

        docs = text_splitter.split_documents(pdf_pages)
        texts = [d.page_content for d in docs]
    else:
        texts = [d.page_content for d in pdf_pages]

    print(f"There are {len(texts)} text elements")
    return texts


def load_files(files, token_count, split_document):
    """
    Load files.

    Args:
        files (list): List of file names.
        dir (str): Directory path.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting documents.
    """

    texts = []
    for fi in files:
        texts.extend(load_and_split(fi, token_count, split_document))
    return texts


def make_retriever(texts, expt):
    """
    Make vector store.

    Args:
        texts (list): List of texts.
        expt (str): Experiment name.
    """
    vectorstore = Chroma.from_texts(
        texts=texts, collection_name=expt, embedding=OpenAIEmbeddings()
    )
    retriever = vectorstore.as_retriever()
    return retriever


def rag_chain(retriever, llm):
    """
    RAG chain.

    Args:
        retriever: The retriever to use.
        llm: The llm to use.
    """

    # Prompt template
    template = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    # LLM
    if llm == "mixtral":
        model = ChatFireworks(
            model="accounts/fireworks/models/mixtral-8x7b-instruct", temperature=0
        )
    else:
        model = ChatOpenAI(temperature=0, model="gpt-4")

    # RAG pipeline
    chain = (
        {
            "context": retriever | (lambda x: "\n\n".join([i.page_content for i in x])),
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )
    return chain


# Experiment configurations
experiments = [
    (None, False, "page_split-oai", "oai"),
    (50, True, "50_tok_split-oai", "oai"),
    (100, True, "100_tok_split-oai", "oai"),
    (250, True, "250_tok_split-oai", "oai"),
    (250, True, "250_tok_split-mixtral", "mixtral"),
]

# Run
stor_chain = {}
for token_count, split_document, expt, llm in experiments:
    texts = load_files(files, token_count, split_document)
    retriever = make_retriever(texts, expt)
    stor_chain[expt] = rag_chain(retriever, llm)

There are 3 text elements
There are 15 text elements
There are 3 text elements
There are 16 text elements
There are 11 text elements
There are 11 text elements
There are 181 text elements
There are 1454 text elements
There are 197 text elements
There are 521 text elements
There are 284 text elements
There are 287 text elements
There are 59 text elements
There are 77 text elements
There are 67 text elements
There are 285 text elements
There are 146 text elements
There are 130 text elements
There are 15 text elements
There are 27 text elements
There are 17 text elements
There are 74 text elements
There are 41 text elements
There are 38 text elements
There are 15 text elements
There are 27 text elements
There are 17 text elements
There are 74 text elements
There are 41 text elements
There are 38 text elements


### Eval
Run eval onm our dataset, Semi-structured Reports.

In [8]:
import uuid
from langsmith.client import Client
from langchain.smith import RunEvalConfig

# Config
client = Client()
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)

# Experiments
chain_map = {
    "page_split": stor_chain["page_split-oai"],
    "baseline-50-tok": stor_chain["50_tok_split-oai"],
    "baseline-100-tok": stor_chain["100_tok_split-oai"],
    "baseline-250-tok": stor_chain["250_tok_split-oai"],
    "baseline-250-tok-mixtral": stor_chain["250_tok_split-mixtral"],
}

# Run evaluation
run_id = uuid.uuid4().hex[:4]
test_runs = {}
for project_name, chain in chain_map.items():
    test_runs[project_name] = client.run_on_dataset(
        dataset_name=task.name,
        llm_or_chain_factory=lambda: (lambda x: x["question"]) | chain,
        evaluation=eval_config,
        verbose=True,
        project_name=f"{run_id}-{project_name}",
        project_metadata={"chain": project_name},
    )

View the evaluation results for project 'aa93-page_split' at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d/compare?selectedSessions=aeee4da8-db04-4e63-ac87-6dfa9c26787e

View all tests for Dataset Semi-structured Reports at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d
[>                                                 ] 0/28

Chain failed for example dbde8588-8b03-41e2-aa09-31ab0320ea88 with inputs {'Question': 'What was Datadog’s current and non-current deferred revenue as of September 30, 2023?'}
Error Type: KeyError, Message: 'question'


[->                                                ] 1/28

Chain failed for example 45c1960f-674f-4fa8-8f2e-f04170dff3c8 with inputs {'Question': 'How much capitalized software development costs did Datadog report for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 351d2a74-07f2-43b9-8077-a78106e21ae0 with inputs {'Question': 'Can you calculate the year-over-year percentage change in Datadog’s research and development spending for the three months ended September 30,2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 80216a8b-cae9-43e3-9be6-57a6bfbaaa25 with inputs {'Question': 'How much did Datadog spend on research and development for the three months ended September 30,2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 813ea2d1-4316-4bfb-b1b2-43b96d545cdf with inputs {'Question': "What was Datadog's net income for the three months that ended September 30 in 2023 and 2022?"}
Error Type: KeyError, Message: 'question'


[--->                                              ] 2/28[---->                                             ] 3/28[------>                                           ] 4/28[-------->                                         ] 5/28

Chain failed for example bfc0311a-6d35-4b1c-bae0-d0e46707f849 with inputs {'Question': 'How many bank failures occurred between 2021 and 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example f274198c-baa5-4eaa-b8d8-427f486024f8 with inputs {'Question': 'What is the total amount of underreported federal tax income from 2011-2013?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 7e5c821c-4a5c-4382-a5c6-8ba381ea49a0 with inputs {'Question': 'What were the deposits from bank failures for 2001-2020 and 2021-2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 5f0e62da-7f5f-4151-bb5f-491de60be2de with inputs {'Question': 'What factors contributed to the federal income tax gap in 2011-2013, both in percentage and dollar terms?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3b721362-fc29-4d14-9fa3-5aaffddf2e6d with inputs {'Question': 'Can you calculate the year-over-year percentage change in net sales for the

[---------->                                       ] 6/28[----------->                                      ] 7/28[------------->                                    ] 8/28[--------------->                                  ] 9/28[----------------->                                ] 10/28

Chain failed for example c541c59a-cc42-4a06-8085-d44102ce2398 with inputs {'Question': 'For the three months that ended September 30, 2023, what percentage of total net sales did AWS contribute?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 46f2dd15-c2ab-4d0a-bf5d-4c172d3a3ae0 with inputs {'Question': 'Can you calculate the year-over-year percentage change in AWS operating income for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 26f099ce-6672-4fa2-aaa4-3cd65f4a3627 with inputs {'Question': 'Can you calculate the year-over-year percentage change in Amazon net product sales from 2022 to 2023 for the nine months ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example f412803c-ca62-47e4-bb72-7b657d13addc with inputs {'Question': "What was the year-over-year percentage change in revenue growth for Microsoft's Intelligent Cloud segment from 2022 to 2023 for the three

[------------------->                              ] 11/28[-------------------->                             ] 12/28[---------------------->                           ] 13/28[------------------------>                         ] 14/28[-------------------------->                       ] 15/28

Chain failed for example d0916586-8647-4d98-bc7d-3f599ed785ea with inputs {'Question': 'Can you calculate the year-over-year percentage change in revenue for Google Cloud from 2022 to 2023 for the Quarter Ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3518405e-b07b-4c2f-a473-fa0045c16f26 with inputs {'Question': 'For the three months ended September 30, 2022, how much revenue did Google advertising and Google Cloud account for?'}
Error Type: KeyError, Message: 'question'
Chain failed for example e0fb68ab-1374-49fd-87bd-5e700b2b3439 with inputs {'Question': 'For the three months ended September 30, 2023, what percentage of total Alphabet revenues does Google Cloud comprise?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 1c579a74-21b6-49f1-9a17-2dc9c5c4de80 with inputs {'Question': 'What were the revenues for AWS, Google Cloud, and Microsoft Intelligent Cloud for the three months that ended September 30, 2023?'}
Error Type: 

[---------------------------->                     ] 16/28[----------------------------->                    ] 17/28[------------------------------->                  ] 18/28[--------------------------------->                ] 19/28[----------------------------------->              ] 20/28

Chain failed for example 652d39b8-06c6-47a1-8368-9768a12805b3 with inputs {'Question': "What is Amazon's Diluted earnings per share for the Nine Months that Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'
Chain failed for example e21e7e71-e68c-4bbb-a30a-b35f7a4978ec with inputs {'Question': "What is Alphabet's Research and development expense for the Quarter Ended September 30, 2022?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 70045d2c-bfd8-4292-ab5f-3c5c396e3632 with inputs {'Question': 'What is the Microsoft Research and development expense for the Three Months Ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'


[------------------------------------->            ] 21/28[-------------------------------------->           ] 22/28[---------------------------------------->         ] 23/28

Chain failed for example 771e5284-f845-4e48-9cd8-c46bff13964c with inputs {'Question': 'How many viewers did Amazon Prime attract to the Thursday Night Football season opener?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 83d89a67-3f81-4a4b-be67-ba5e56c070be with inputs {'Question': "What is Datadog's Non-GAAP gross margin for the Nine Months Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'


[------------------------------------------>       ] 24/28[-------------------------------------------->     ] 25/28

Chain failed for example 4e8d1c00-ebf7-4ce4-b17a-88eb152ca20e with inputs {'Question': "Based on Datadog's report 10 Insights on Real-World Container Use, what percentage of container organizations now run serverless containers and how does this compare to two years ago?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 0f1ea0a0-eab1-4c50-90c6-b25ee35e9767 with inputs {'Question': 'How long has the IRS has been estimating the size and composition of the tax gap?'}
Error Type: KeyError, Message: 'question'
Chain failed for example bed70228-f017-4f73-98de-3327e32eb651 with inputs {'Question': 'What are the two main resolution methods that the FDIC normally uses for failing banks?'}
Error Type: KeyError, Message: 'question'


[------------------------------------------------->] 28/28View the evaluation results for project 'aa93-baseline-50-tok' at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d/compare?selectedSessions=5be23d62-31b2-4af9-86a8-71ae265e4a36

View all tests for Dataset Semi-structured Reports at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d
[>                                                 ] 0/28

Chain failed for example 45c1960f-674f-4fa8-8f2e-f04170dff3c8 with inputs {'Question': 'How much capitalized software development costs did Datadog report for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 813ea2d1-4316-4bfb-b1b2-43b96d545cdf with inputs {'Question': "What was Datadog's net income for the three months that ended September 30 in 2023 and 2022?"}
Error Type: KeyError, Message: 'question'
Chain failed for example dbde8588-8b03-41e2-aa09-31ab0320ea88 with inputs {'Question': 'What was Datadog’s current and non-current deferred revenue as of September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 80216a8b-cae9-43e3-9be6-57a6bfbaaa25 with inputs {'Question': 'How much did Datadog spend on research and development for the three months ended September 30,2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 351d2a74-07f2-43b9-8077-a78106e21ae0 with inputs

[->                                                ] 1/28[--->                                              ] 2/28[---->                                             ] 3/28[------>                                           ] 4/28[-------->                                         ] 5/28

Chain failed for example 7e5c821c-4a5c-4382-a5c6-8ba381ea49a0 with inputs {'Question': 'What were the deposits from bank failures for 2001-2020 and 2021-2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example f274198c-baa5-4eaa-b8d8-427f486024f8 with inputs {'Question': 'What is the total amount of underreported federal tax income from 2011-2013?'}
Error Type: KeyError, Message: 'question'
Chain failed for example bfc0311a-6d35-4b1c-bae0-d0e46707f849 with inputs {'Question': 'How many bank failures occurred between 2021 and 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 5f0e62da-7f5f-4151-bb5f-491de60be2de with inputs {'Question': 'What factors contributed to the federal income tax gap in 2011-2013, both in percentage and dollar terms?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3b721362-fc29-4d14-9fa3-5aaffddf2e6d with inputs {'Question': 'Can you calculate the year-over-year percentage change in net sales for the

[---------->                                       ] 6/28[----------->                                      ] 7/28[------------->                                    ] 8/28[--------------->                                  ] 9/28[----------------->                                ] 10/28

Chain failed for example 26f099ce-6672-4fa2-aaa4-3cd65f4a3627 with inputs {'Question': 'Can you calculate the year-over-year percentage change in Amazon net product sales from 2022 to 2023 for the nine months ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example c541c59a-cc42-4a06-8085-d44102ce2398 with inputs {'Question': 'For the three months that ended September 30, 2023, what percentage of total net sales did AWS contribute?'}
Error Type: KeyError, Message: 'question'
Chain failed for example f412803c-ca62-47e4-bb72-7b657d13addc with inputs {'Question': "What was the year-over-year percentage change in revenue growth for Microsoft's Intelligent Cloud segment from 2022 to 2023 for the three months that ended September 30?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 46f2dd15-c2ab-4d0a-bf5d-4c172d3a3ae0 with inputs {'Question': 'Can you calculate the year-over-year percentage change in AWS operating income for the three month

[------------------->                              ] 11/28[-------------------->                             ] 12/28[---------------------->                           ] 13/28[------------------------>                         ] 14/28[-------------------------->                       ] 15/28

Chain failed for example d0916586-8647-4d98-bc7d-3f599ed785ea with inputs {'Question': 'Can you calculate the year-over-year percentage change in revenue for Google Cloud from 2022 to 2023 for the Quarter Ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example e0fb68ab-1374-49fd-87bd-5e700b2b3439 with inputs {'Question': 'For the three months ended September 30, 2023, what percentage of total Alphabet revenues does Google Cloud comprise?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 69d28718-2fd1-4bb8-845d-2de2af664e27 with inputs {'Question': 'What percentage of total revenues do AWS, Google Cloud, and Microsoft Intelligent Cloud contribute for the three months ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3518405e-b07b-4c2f-a473-fa0045c16f26 with inputs {'Question': 'For the three months ended September 30, 2022, how much revenue did Google advertising and Google Cloud account for

[---------------------------->                     ] 16/28[----------------------------->                    ] 17/28[------------------------------->                  ] 18/28[--------------------------------->                ] 19/28[----------------------------------->              ] 20/28

Chain failed for example 652d39b8-06c6-47a1-8368-9768a12805b3 with inputs {'Question': "What is Amazon's Diluted earnings per share for the Nine Months that Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 70045d2c-bfd8-4292-ab5f-3c5c396e3632 with inputs {'Question': 'What is the Microsoft Research and development expense for the Three Months Ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example e21e7e71-e68c-4bbb-a30a-b35f7a4978ec with inputs {'Question': "What is Alphabet's Research and development expense for the Quarter Ended September 30, 2022?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 83d89a67-3f81-4a4b-be67-ba5e56c070be with inputs {'Question': "What is Datadog's Non-GAAP gross margin for the Nine Months Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 771e5284-f845-4e48-9cd8-c46bff13964c with inputs {'Question': 'How man

[------------------------------------->            ] 21/28[-------------------------------------->           ] 22/28[---------------------------------------->         ] 23/28[------------------------------------------>       ] 24/28[-------------------------------------------->     ] 25/28

Chain failed for example 4e8d1c00-ebf7-4ce4-b17a-88eb152ca20e with inputs {'Question': "Based on Datadog's report 10 Insights on Real-World Container Use, what percentage of container organizations now run serverless containers and how does this compare to two years ago?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 0f1ea0a0-eab1-4c50-90c6-b25ee35e9767 with inputs {'Question': 'How long has the IRS has been estimating the size and composition of the tax gap?'}
Error Type: KeyError, Message: 'question'
Chain failed for example bed70228-f017-4f73-98de-3327e32eb651 with inputs {'Question': 'What are the two main resolution methods that the FDIC normally uses for failing banks?'}
Error Type: KeyError, Message: 'question'


[------------------------------------------------->] 28/28View the evaluation results for project 'aa93-baseline-100-tok' at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d/compare?selectedSessions=4d940f36-17b2-4bc9-9e88-4d6b780aaf46

View all tests for Dataset Semi-structured Reports at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d
[>                                                 ] 0/28

Chain failed for example dbde8588-8b03-41e2-aa09-31ab0320ea88 with inputs {'Question': 'What was Datadog’s current and non-current deferred revenue as of September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 80216a8b-cae9-43e3-9be6-57a6bfbaaa25 with inputs {'Question': 'How much did Datadog spend on research and development for the three months ended September 30,2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 45c1960f-674f-4fa8-8f2e-f04170dff3c8 with inputs {'Question': 'How much capitalized software development costs did Datadog report for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 813ea2d1-4316-4bfb-b1b2-43b96d545cdf with inputs {'Question': "What was Datadog's net income for the three months that ended September 30 in 2023 and 2022?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 351d2a74-07f2-43b9-8077-a78106e21ae0 with inputs

[->                                                ] 1/28[--->                                              ] 2/28[---->                                             ] 3/28[------>                                           ] 4/28[-------->                                         ] 5/28

Chain failed for example bfc0311a-6d35-4b1c-bae0-d0e46707f849 with inputs {'Question': 'How many bank failures occurred between 2021 and 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 5f0e62da-7f5f-4151-bb5f-491de60be2de with inputs {'Question': 'What factors contributed to the federal income tax gap in 2011-2013, both in percentage and dollar terms?'}
Error Type: KeyError, Message: 'question'
Chain failed for example f274198c-baa5-4eaa-b8d8-427f486024f8 with inputs {'Question': 'What is the total amount of underreported federal tax income from 2011-2013?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 7e5c821c-4a5c-4382-a5c6-8ba381ea49a0 with inputs {'Question': 'What were the deposits from bank failures for 2001-2020 and 2021-2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3b721362-fc29-4d14-9fa3-5aaffddf2e6d with inputs {'Question': 'Can you calculate the year-over-year percentage change in net sales for the

[---------->                                       ] 6/28[----------->                                      ] 7/28[------------->                                    ] 8/28[--------------->                                  ] 9/28[----------------->                                ] 10/28

Chain failed for example c541c59a-cc42-4a06-8085-d44102ce2398 with inputs {'Question': 'For the three months that ended September 30, 2023, what percentage of total net sales did AWS contribute?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 46f2dd15-c2ab-4d0a-bf5d-4c172d3a3ae0 with inputs {'Question': 'Can you calculate the year-over-year percentage change in AWS operating income for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 26f099ce-6672-4fa2-aaa4-3cd65f4a3627 with inputs {'Question': 'Can you calculate the year-over-year percentage change in Amazon net product sales from 2022 to 2023 for the nine months ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example f412803c-ca62-47e4-bb72-7b657d13addc with inputs {'Question': "What was the year-over-year percentage change in revenue growth for Microsoft's Intelligent Cloud segment from 2022 to 2023 for the three

[------------------->                              ] 11/28[-------------------->                             ] 12/28[---------------------->                           ] 13/28[------------------------>                         ] 14/28[-------------------------->                       ] 15/28

Chain failed for example d0916586-8647-4d98-bc7d-3f599ed785ea with inputs {'Question': 'Can you calculate the year-over-year percentage change in revenue for Google Cloud from 2022 to 2023 for the Quarter Ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example e0fb68ab-1374-49fd-87bd-5e700b2b3439 with inputs {'Question': 'For the three months ended September 30, 2023, what percentage of total Alphabet revenues does Google Cloud comprise?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3518405e-b07b-4c2f-a473-fa0045c16f26 with inputs {'Question': 'For the three months ended September 30, 2022, how much revenue did Google advertising and Google Cloud account for?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 1c579a74-21b6-49f1-9a17-2dc9c5c4de80 with inputs {'Question': 'What were the revenues for AWS, Google Cloud, and Microsoft Intelligent Cloud for the three months that ended September 30, 2023?'}
Error Type: 

[---------------------------->                     ] 16/28[----------------------------->                    ] 17/28[------------------------------->                  ] 18/28[--------------------------------->                ] 19/28

Chain failed for example 69d28718-2fd1-4bb8-845d-2de2af664e27 with inputs {'Question': 'What percentage of total revenues do AWS, Google Cloud, and Microsoft Intelligent Cloud contribute for the three months ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'


[----------------------------------->              ] 20/28

Chain failed for example 652d39b8-06c6-47a1-8368-9768a12805b3 with inputs {'Question': "What is Amazon's Diluted earnings per share for the Nine Months that Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'
Chain failed for example e21e7e71-e68c-4bbb-a30a-b35f7a4978ec with inputs {'Question': "What is Alphabet's Research and development expense for the Quarter Ended September 30, 2022?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 70045d2c-bfd8-4292-ab5f-3c5c396e3632 with inputs {'Question': 'What is the Microsoft Research and development expense for the Three Months Ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 83d89a67-3f81-4a4b-be67-ba5e56c070be with inputs {'Question': "What is Datadog's Non-GAAP gross margin for the Nine Months Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'


[------------------------------------->            ] 21/28[-------------------------------------->           ] 22/28[---------------------------------------->         ] 23/28[------------------------------------------>       ] 24/28

Chain failed for example 771e5284-f845-4e48-9cd8-c46bff13964c with inputs {'Question': 'How many viewers did Amazon Prime attract to the Thursday Night Football season opener?'}
Error Type: KeyError, Message: 'question'


[-------------------------------------------->     ] 25/28

Chain failed for example 4e8d1c00-ebf7-4ce4-b17a-88eb152ca20e with inputs {'Question': "Based on Datadog's report 10 Insights on Real-World Container Use, what percentage of container organizations now run serverless containers and how does this compare to two years ago?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 0f1ea0a0-eab1-4c50-90c6-b25ee35e9767 with inputs {'Question': 'How long has the IRS has been estimating the size and composition of the tax gap?'}
Error Type: KeyError, Message: 'question'
Chain failed for example bed70228-f017-4f73-98de-3327e32eb651 with inputs {'Question': 'What are the two main resolution methods that the FDIC normally uses for failing banks?'}
Error Type: KeyError, Message: 'question'


[------------------------------------------------->] 28/28View the evaluation results for project 'aa93-baseline-250-tok' at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d/compare?selectedSessions=a6dce677-9118-4ba2-bb60-9120ff852c2c

View all tests for Dataset Semi-structured Reports at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d
[>                                                 ] 0/28

Chain failed for example 80216a8b-cae9-43e3-9be6-57a6bfbaaa25 with inputs {'Question': 'How much did Datadog spend on research and development for the three months ended September 30,2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 351d2a74-07f2-43b9-8077-a78106e21ae0 with inputs {'Question': 'Can you calculate the year-over-year percentage change in Datadog’s research and development spending for the three months ended September 30,2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 45c1960f-674f-4fa8-8f2e-f04170dff3c8 with inputs {'Question': 'How much capitalized software development costs did Datadog report for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 813ea2d1-4316-4bfb-b1b2-43b96d545cdf with inputs {'Question': "What was Datadog's net income for the three months that ended September 30 in 2023 and 2022?"}
Error Type: KeyError, Message: 'question'
Chain faile

[->                                                ] 1/28[--->                                              ] 2/28[---->                                             ] 3/28[------>                                           ] 4/28[-------->                                         ] 5/28

Chain failed for example 3b721362-fc29-4d14-9fa3-5aaffddf2e6d with inputs {'Question': 'Can you calculate the year-over-year percentage change in net sales for the AWS segment from 2022 to 2023 for the three months ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example f274198c-baa5-4eaa-b8d8-427f486024f8 with inputs {'Question': 'What is the total amount of underreported federal tax income from 2011-2013?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 5f0e62da-7f5f-4151-bb5f-491de60be2de with inputs {'Question': 'What factors contributed to the federal income tax gap in 2011-2013, both in percentage and dollar terms?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 7e5c821c-4a5c-4382-a5c6-8ba381ea49a0 with inputs {'Question': 'What were the deposits from bank failures for 2001-2020 and 2021-2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example bfc0311a-6d35-4b1c-bae0-d0e46707f849 with inpu

[---------->                                       ] 6/28[----------->                                      ] 7/28[------------->                                    ] 8/28[--------------->                                  ] 9/28[----------------->                                ] 10/28

Chain failed for example c541c59a-cc42-4a06-8085-d44102ce2398 with inputs {'Question': 'For the three months that ended September 30, 2023, what percentage of total net sales did AWS contribute?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 46f2dd15-c2ab-4d0a-bf5d-4c172d3a3ae0 with inputs {'Question': 'Can you calculate the year-over-year percentage change in AWS operating income for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 26f099ce-6672-4fa2-aaa4-3cd65f4a3627 with inputs {'Question': 'Can you calculate the year-over-year percentage change in Amazon net product sales from 2022 to 2023 for the nine months ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example bdc56941-33cd-4ae3-94cd-5b0e59af4561 with inputs {'Question': "For the three months ended September 30, 2023, what percentage of total revenue does Intelligent Cloud represent when considering Microso

[------------------->                              ] 11/28[-------------------->                             ] 12/28[---------------------->                           ] 13/28[------------------------>                         ] 14/28[-------------------------->                       ] 15/28

Chain failed for example e0fb68ab-1374-49fd-87bd-5e700b2b3439 with inputs {'Question': 'For the three months ended September 30, 2023, what percentage of total Alphabet revenues does Google Cloud comprise?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 69d28718-2fd1-4bb8-845d-2de2af664e27 with inputs {'Question': 'What percentage of total revenues do AWS, Google Cloud, and Microsoft Intelligent Cloud contribute for the three months ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3518405e-b07b-4c2f-a473-fa0045c16f26 with inputs {'Question': 'For the three months ended September 30, 2022, how much revenue did Google advertising and Google Cloud account for?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 1c579a74-21b6-49f1-9a17-2dc9c5c4de80 with inputs {'Question': 'What were the revenues for AWS, Google Cloud, and Microsoft Intelligent Cloud for the three months that ended September 30, 2023?'}
Err

[---------------------------->                     ] 16/28[----------------------------->                    ] 17/28[------------------------------->                  ] 18/28[--------------------------------->                ] 19/28[----------------------------------->              ] 20/28

Chain failed for example 652d39b8-06c6-47a1-8368-9768a12805b3 with inputs {'Question': "What is Amazon's Diluted earnings per share for the Nine Months that Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 771e5284-f845-4e48-9cd8-c46bff13964c with inputs {'Question': 'How many viewers did Amazon Prime attract to the Thursday Night Football season opener?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 70045d2c-bfd8-4292-ab5f-3c5c396e3632 with inputs {'Question': 'What is the Microsoft Research and development expense for the Three Months Ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example e21e7e71-e68c-4bbb-a30a-b35f7a4978ec with inputs {'Question': "What is Alphabet's Research and development expense for the Quarter Ended September 30, 2022?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 83d89a67-3f81-4a4b-be67-ba5e56c070be with inputs {'Question': "What 

[------------------------------------->            ] 21/28[-------------------------------------->           ] 22/28[---------------------------------------->         ] 23/28[------------------------------------------>       ] 24/28[-------------------------------------------->     ] 25/28

Chain failed for example 4e8d1c00-ebf7-4ce4-b17a-88eb152ca20e with inputs {'Question': "Based on Datadog's report 10 Insights on Real-World Container Use, what percentage of container organizations now run serverless containers and how does this compare to two years ago?"}
Error Type: KeyError, Message: 'question'
Chain failed for example bed70228-f017-4f73-98de-3327e32eb651 with inputs {'Question': 'What are the two main resolution methods that the FDIC normally uses for failing banks?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 0f1ea0a0-eab1-4c50-90c6-b25ee35e9767 with inputs {'Question': 'How long has the IRS has been estimating the size and composition of the tax gap?'}
Error Type: KeyError, Message: 'question'


[------------------------------------------------->] 28/28View the evaluation results for project 'aa93-baseline-250-tok-mixtral' at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d/compare?selectedSessions=2d0d5360-cb9a-4bc6-bc36-04c8153dfeb3

View all tests for Dataset Semi-structured Reports at:
https://smith.langchain.com/o/8088c1e2-0bd7-567f-a9a3-e89380e5cb42/datasets/0aba67b4-6e39-4f9d-b586-57a963cb724d
[>                                                 ] 0/28

Chain failed for example 351d2a74-07f2-43b9-8077-a78106e21ae0 with inputs {'Question': 'Can you calculate the year-over-year percentage change in Datadog’s research and development spending for the three months ended September 30,2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 80216a8b-cae9-43e3-9be6-57a6bfbaaa25 with inputs {'Question': 'How much did Datadog spend on research and development for the three months ended September 30,2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 813ea2d1-4316-4bfb-b1b2-43b96d545cdf with inputs {'Question': "What was Datadog's net income for the three months that ended September 30 in 2023 and 2022?"}
Error Type: KeyError, Message: 'question'
Chain failed for example dbde8588-8b03-41e2-aa09-31ab0320ea88 with inputs {'Question': 'What was Datadog’s current and non-current deferred revenue as of September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 45c1960f-674f-4fa8-

[->                                                ] 1/28[--->                                              ] 2/28[---->                                             ] 3/28[------>                                           ] 4/28[-------->                                         ] 5/28

Chain failed for example bfc0311a-6d35-4b1c-bae0-d0e46707f849 with inputs {'Question': 'How many bank failures occurred between 2021 and 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 7e5c821c-4a5c-4382-a5c6-8ba381ea49a0 with inputs {'Question': 'What were the deposits from bank failures for 2001-2020 and 2021-2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3b721362-fc29-4d14-9fa3-5aaffddf2e6d with inputs {'Question': 'Can you calculate the year-over-year percentage change in net sales for the AWS segment from 2022 to 2023 for the three months ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 5f0e62da-7f5f-4151-bb5f-491de60be2de with inputs {'Question': 'What factors contributed to the federal income tax gap in 2011-2013, both in percentage and dollar terms?'}
Error Type: KeyError, Message: 'question'
Chain failed for example f274198c-baa5-4eaa-b8d8-427f486024f8 with inputs {'Question': 'What 

[---------->                                       ] 6/28[----------->                                      ] 7/28[------------->                                    ] 8/28[--------------->                                  ] 9/28[----------------->                                ] 10/28

Chain failed for example c541c59a-cc42-4a06-8085-d44102ce2398 with inputs {'Question': 'For the three months that ended September 30, 2023, what percentage of total net sales did AWS contribute?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 46f2dd15-c2ab-4d0a-bf5d-4c172d3a3ae0 with inputs {'Question': 'Can you calculate the year-over-year percentage change in AWS operating income for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example bdc56941-33cd-4ae3-94cd-5b0e59af4561 with inputs {'Question': "For the three months ended September 30, 2023, what percentage of total revenue does Intelligent Cloud represent when considering Microsoft's three main business units (Productivity and Business Processes, Intelligent Cloud, and More Personal Computing)?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 26f099ce-6672-4fa2-aaa4-3cd65f4a3627 with inputs {'Question': 'Can you calculate the y

[------------------->                              ] 11/28[-------------------->                             ] 12/28[---------------------->                           ] 13/28[------------------------>                         ] 14/28[-------------------------->                       ] 15/28

Chain failed for example d0916586-8647-4d98-bc7d-3f599ed785ea with inputs {'Question': 'Can you calculate the year-over-year percentage change in revenue for Google Cloud from 2022 to 2023 for the Quarter Ended September 30?'}
Error Type: KeyError, Message: 'question'
Chain failed for example e0fb68ab-1374-49fd-87bd-5e700b2b3439 with inputs {'Question': 'For the three months ended September 30, 2023, what percentage of total Alphabet revenues does Google Cloud comprise?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 1c579a74-21b6-49f1-9a17-2dc9c5c4de80 with inputs {'Question': 'What were the revenues for AWS, Google Cloud, and Microsoft Intelligent Cloud for the three months that ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 3518405e-b07b-4c2f-a473-fa0045c16f26 with inputs {'Question': 'For the three months ended September 30, 2022, how much revenue did Google advertising and Google Cloud account for?'}
Error Type: 

[---------------------------->                     ] 16/28[----------------------------->                    ] 17/28[------------------------------->                  ] 18/28[--------------------------------->                ] 19/28[----------------------------------->              ] 20/28

Chain failed for example e21e7e71-e68c-4bbb-a30a-b35f7a4978ec with inputs {'Question': "What is Alphabet's Research and development expense for the Quarter Ended September 30, 2022?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 652d39b8-06c6-47a1-8368-9768a12805b3 with inputs {'Question': "What is Amazon's Diluted earnings per share for the Nine Months that Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 70045d2c-bfd8-4292-ab5f-3c5c396e3632 with inputs {'Question': 'What is the Microsoft Research and development expense for the Three Months Ended September 30, 2023?'}
Error Type: KeyError, Message: 'question'
Chain failed for example 83d89a67-3f81-4a4b-be67-ba5e56c070be with inputs {'Question': "What is Datadog's Non-GAAP gross margin for the Nine Months Ended September 30, 2023?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 771e5284-f845-4e48-9cd8-c46bff13964c with inputs {'Question': 'How man

[------------------------------------->            ] 21/28[-------------------------------------->           ] 22/28[---------------------------------------->         ] 23/28[------------------------------------------>       ] 24/28[-------------------------------------------->     ] 25/28

Chain failed for example 4e8d1c00-ebf7-4ce4-b17a-88eb152ca20e with inputs {'Question': "Based on Datadog's report 10 Insights on Real-World Container Use, what percentage of container organizations now run serverless containers and how does this compare to two years ago?"}
Error Type: KeyError, Message: 'question'
Chain failed for example 0f1ea0a0-eab1-4c50-90c6-b25ee35e9767 with inputs {'Question': 'How long has the IRS has been estimating the size and composition of the tax gap?'}
Error Type: KeyError, Message: 'question'
Chain failed for example bed70228-f017-4f73-98de-3327e32eb651 with inputs {'Question': 'What are the two main resolution methods that the FDIC normally uses for failing banks?'}
Error Type: KeyError, Message: 'question'


[--------------------------------------------->    ] 26/28[----------------------------------------------->  ] 27/28[------------------------------------------------->] 28/28