# Refer to the notebooks in the following section; it is all covered there.

..\How-to-guides\2.Key features\Structured-Output-from-model\2.OutputParsers(for models not supporting structured_output)

In [2]:
import re

import requests
from langchain_community.document_loaders import BSHTMLLoader

# Download the content. The timeout keeps the cell from hanging indefinitely on
# a stalled connection, and raise_for_status() stops here on an HTTP error
# instead of saving and parsing an error page as if it were the article.
response = requests.get("https://en.wikipedia.org/wiki/Car", timeout=30)
response.raise_for_status()
# Write it to a file
with open("car.html", "w", encoding="utf-8") as f:
    f.write(response.text)
# Load it with an HTML parser, reading the file back with the same encoding it
# was written with rather than the platform default.
loader = BSHTMLLoader("car.html", open_encoding="utf-8")
document = loader.load()[0]
# Clean up the text:
# replace runs of consecutive new lines with a single new line
document.page_content = re.sub("\n\n+", "\n", document.page_content)

In [3]:
# Length of the cleaned page text, in characters.
print(len(document.page_content))

80562


# Define the schema
Following the extraction tutorial, we will use Pydantic to define the schema of information we wish to extract. 

In this case, we will extract a list of "key developments" (e.g., important historical events) that include a year and description.

Note that we also include an `evidence` key and instruct the model to provide, verbatim, the relevant sentences of text from the article. This allows us to compare the extraction results to (the model's reconstruction of) text from the original document.

In [4]:
from typing import List, Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field


# Schema for one extracted item: a year, what happened, and the supporting text.
# NOTE(review): the docstring and the Field descriptions below are presumably
# forwarded to the model as part of the structured-output schema, so treat any
# edit to them as a prompt change — confirm.
class KeyDevelopment(BaseModel):
    """Information about a development in the history of cars."""

    # Required field (`...` means no default): the year, as an integer.
    year: int = Field(
        ..., description="The year when there was an important historic development."
    )
    # Required field: free-text summary of the development.
    description: str = Field(
        ..., description="What happened in this year? What was the development?"
    )
    # Required field: the sentence(s) the model says the year and description
    # came from, for comparison against the original document.
    evidence: str = Field(
        ...,
        description="Repeat in verbatim the sentence(s) from which the year and description information were extracted",
    )


# Top-level schema handed to `with_structured_output`; wraps the list of
# KeyDevelopment items extracted from one piece of text.
class ExtractionData(BaseModel):
    """Extracted information about key developments in the history of cars."""

    # Required (no default given); the recorded outputs show it can be an empty list.
    key_developments: List[KeyDevelopment]


In [6]:
# Custom prompt carrying the extraction instructions and any extra context.
# Ways to extend it:
# 1) Add examples to the prompt template to improve extraction quality.
# 2) Add parameters that bring in context (e.g., metadata about the document
#    from which the text was extracted).
system_instructions = (
    "You are an expert at identifying key historic development in text. "
    "Only extract important historic developments. Extract nothing if no important information can be found in the text."
)

# The human turn is filled from the "text" input variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_instructions), ("human", "{text}")]
)

# Create an extractor
Let's select an LLM. Because we are using tool-calling, we will need a model that supports a tool-calling feature. See this table for available LLMs.

In [8]:
from langchain_openai import ChatOpenAI

# Chat model used for the extraction.
# NOTE(review): presumably needs OpenAI credentials in the environment
# (e.g. OPENAI_API_KEY) — confirm before running.
llm = ChatOpenAI(model="gpt-4o-mini")



In [9]:
# Bind the ExtractionData schema to the model (parsed result only, no raw
# message), then pipe the prompt into it to form the extraction chain.
structured_llm = llm.with_structured_output(schema=ExtractionData, include_raw=False)
extractor = prompt | structured_llm

# Brute force approach

Split the documents into chunks such that each chunk fits into the context window of the LLMs.

In [11]:
from langchain_text_splitters import TokenTextSplitter

# Splitter settings:
#   chunk_size    - controls the size of each chunk
#   chunk_overlap - controls the overlap between chunks
splitter_settings = {"chunk_size": 2000, "chunk_overlap": 20}
text_splitter = TokenTextSplitter(**splitter_settings)


In [12]:
# Display the loaded Document (metadata and page_content).
document

Document(metadata={'source': 'car.html', 'title': 'Car - Wikipedia'}, page_content='\nCar - Wikipedia\nJump to content\nMain menu\nMain menu\nmove to sidebar\nhide\n\t\tNavigation\n\t\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\t\tContribute\n\t\nHelpLearn to editCommunity portalRecent changesUpload file\nSearch\nSearch\nAppearance\nCreate account\nLog in\nPersonal tools\n Create account Log in\n\t\tPages for logged out editors learn more\nContributionsTalk\nContents\nmove to sidebar\nhide\n(Top)\n1\nEtymology\n2\nHistory\n3\nMass production\n4\nComponents and design\nToggle Components and design subsection\n4.1\nPropulsion and fuels\n4.1.1\nFossil fuels\n4.1.2\nBatteries\n4.2\nUser interface\n4.3\nElectronics and interior\n4.4\nLighting\n4.5\nWeight and size\n4.6\nSeating and body style\n5\nSafety\n6\nCosts and benefits\n7\nEnvironmental effects\n8\nSocial issues\n9\nEmerging car technologies\nToggle Emerging car technologies subsection\n9.1\nAutono

In [13]:

# Split the cleaned page text into a list of chunk strings.
texts = text_splitter.split_text(document.page_content)

In [19]:
# Display the chunks produced by the splitter.
texts

['\nCar - Wikipedia\nJump to content\nMain menu\nMain menu\nmove to sidebar\nhide\n\t\tNavigation\n\t\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\t\tContribute\n\t\nHelpLearn to editCommunity portalRecent changesUpload file\nSearch\nSearch\nAppearance\nCreate account\nLog in\nPersonal tools\n Create account Log in\n\t\tPages for logged out editors learn more\nContributionsTalk\nContents\nmove to sidebar\nhide\n(Top)\n1\nEtymology\n2\nHistory\n3\nMass production\n4\nComponents and design\nToggle Components and design subsection\n4.1\nPropulsion and fuels\n4.1.1\nFossil fuels\n4.1.2\nBatteries\n4.2\nUser interface\n4.3\nElectronics and interior\n4.4\nLighting\n4.5\nWeight and size\n4.6\nSeating and body style\n5\nSafety\n6\nCosts and benefits\n7\nEnvironmental effects\n8\nSocial issues\n9\nEmerging car technologies\nToggle Emerging car technologies subsection\n9.1\nAutonomous car\n9.2\nOpen source development\n9.3\nCar sharing\n10\nIndustry\n11\nAltern

In [18]:
# Number of chunks the page was split into.
len(texts)

11

# Extraction in batches:

You can often use .batch() to parallelize the extractions! .batch uses a threadpool under the hood to help you parallelize workloads.

If your model is exposed via an API, this will likely speed up your extraction flow!


In [15]:
# Only the first 3 chunks are processed so the cell can be re-run quickly.
first_few = texts[:3]

# One input mapping per chunk, keyed by the prompt's "text" variable.
batch_inputs = [{"text": chunk} for chunk in first_few]
# Limit how many extractions run concurrently.
batch_config = {"max_concurrency": 5}

extractions = extractor.batch(batch_inputs, batch_config)

In [16]:
# Display the batch results (a list of ExtractionData objects).
extractions

[ExtractionData(key_developments=[]),
 ExtractionData(key_developments=[KeyDevelopment(year=1769, description='The French inventor Nicolas-Joseph Cugnot built the first steam-powered road vehicle.', evidence='The French inventor Nicolas-Joseph Cugnot built the first steam-powered road vehicle in 1769.'), KeyDevelopment(year=1808, description='The Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile.', evidence='the Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile in 1808.'), KeyDevelopment(year=1886, description='The German inventor Carl Benz patented his Benz Patent-Motorwagen, the modern car.', evidence='the modern car—a practical, marketable automobile for everyday use—was invented in 1886, when the German inventor Carl Benz patented his Benz Patent-Motorwagen.'), KeyDevelopment(year=1901, description='The 1901 Oldsmobile Curved Dash is considered one of the fi

# Merge results
After extracting data from across the chunks, we'll want to merge the extractions together.

In [20]:
# Flatten the per-chunk results into a single list of developments,
# keeping the order in which the chunks were processed.
key_developments = [
    development
    for chunk_result in extractions
    for development in chunk_result.key_developments
]

# Preview the first ten merged entries.
key_developments[:10]

[KeyDevelopment(year=1769, description='The French inventor Nicolas-Joseph Cugnot built the first steam-powered road vehicle.', evidence='The French inventor Nicolas-Joseph Cugnot built the first steam-powered road vehicle in 1769.'),
 KeyDevelopment(year=1808, description='The Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile.', evidence='the Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile in 1808.'),
 KeyDevelopment(year=1886, description='The German inventor Carl Benz patented his Benz Patent-Motorwagen, the modern car.', evidence='the modern car—a practical, marketable automobile for everyday use—was invented in 1886, when the German inventor Carl Benz patented his Benz Patent-Motorwagen.'),
 KeyDevelopment(year=1901, description='The 1901 Oldsmobile Curved Dash is considered one of the first mass-produced cars.', evidence='The 1901 Oldsmobile Curved Dash 