In [1]:
import re
import requests
from langchain_community.document_loaders import BSHTMLLoader

In [2]:
# Download the Wikipedia article used as the extraction source.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of letting
# an error page be saved and parsed as if it were the article.
response = requests.get("https://en.wikipedia.org/wiki/Car", timeout=30)
response.raise_for_status()
response

<Response [200]>

In [3]:
# Persist the downloaded HTML to disk (UTF-8) so it can be loaded from a file.
with open("car.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(response.text)

In [7]:
# Parse the saved HTML into a LangChain document. car.html is written as UTF-8,
# so read it back with the same encoding rather than the platform default
# (which is not UTF-8 everywhere and can garble or fail on non-ASCII text).
loader = BSHTMLLoader("car.html", open_encoding="utf-8")
document = loader.load()[0]
# Collapse runs of two or more newlines into a single newline.
document.page_content = re.sub(r"\n\n+", "\n", document.page_content)

In [8]:
from typing import List, Optional

from langchain.chains import create_structured_output_runnable
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI



In [9]:
class KeyDevelopment(BaseModel):
    """Information about a development in the history of cars."""

    # NOTE: the docstring above and the field descriptions below are runtime
    # text carried by the model's schema; their wording is kept unchanged.

    # Year of the development (required).
    year: int = Field(
        ...,
        description="The year when there was an important historic development.",
    )
    # Short summary of the development (required).
    description: str = Field(
        ...,
        description="What happened in this year? What was the development?",
    )
    # Source sentence(s) backing the year/description pair (required).
    evidence: str = Field(
        ...,
        description="Repeat in verbatim the sentence(s) from which the year and description information were extracted",
    )
    

In [11]:
class ExtractionData(BaseModel):
    # Top-level container used as the structured-output schema: a single
    # required field holding a list of KeyDevelopment entries.
    key_developments: List[KeyDevelopment]

In [13]:
# Two-message chat prompt: a fixed system instruction followed by the human
# message, whose {text} placeholder is filled with the passage to analyse.
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an expert at identifying key historic development in text. "
        "Only extract important historic developments. "
        "Extract nothing if no important information can be found in the text.",
    ),
    ("human", "{text}"),
])

In [15]:
import os
from getpass import getpass

# Never hardcode the key in the notebook. Read it from the environment, and
# fall back to an interactive hidden prompt when the variable is unset or empty.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [16]:
# Chat model used for extraction. temperature=0 asks for the least random
# sampling so repeated runs are as consistent as possible; this matters because
# the schema asks the model to quote its evidence verbatim.
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

In [17]:
# Extraction chain: the prompt feeds the model, which is configured to return
# output structured as ExtractionData via function calling. include_raw=False
# requests only the structured result rather than the raw model message as well.
extractor = prompt | llm.with_structured_output(
    schema=ExtractionData,
    method="function_calling",
    include_raw=False,
)

  warn_beta(


In [18]:
from langchain_text_splitters import TokenTextSplitter

# Split the article into token-based chunks: chunk_size sets the size of each
# chunk and chunk_overlap sets how much adjacent chunks overlap.
text_splitter = TokenTextSplitter(chunk_size=2000, chunk_overlap=20)

texts = text_splitter.split_text(document.page_content)

In [19]:
# Only process the first ten chunks.
first_few = texts[:10]

# Run the extractor over every selected chunk, with at most five requests
# in flight at a time.
extractions = extractor.batch(
    [{"text": chunk} for chunk in first_few],
    config={"max_concurrency": 5},
)

In [20]:
# Flatten the per-chunk results into one list of developments, in chunk order.
key_developments = []
for extraction in extractions:
    key_developments += extraction.key_developments

# Preview the first twenty entries.
key_developments[:20]

[KeyDevelopment(year=1966, description="The Toyota Corolla has been in production since 1966 and is recognized as the world's best-selling automobile.", evidence="The Toyota Corolla has been in production since 1966 and is recognized as the world's best-selling automobile."),
 KeyDevelopment(year=1769, description='Nicolas-Joseph Cugnot built the first full-scale, self-propelled mechanical vehicle, a steam-powered tricycle.', evidence='Nicolas-Joseph Cugnot is widely credited with building the first full-scale, self-propelled mechanical vehicle in about 1769; he created a steam-powered tricycle.'),
 KeyDevelopment(year=1886, description='German inventor Carl Benz patented the Benz Patent-Motorwagen, considered the first modern car for everyday use.', evidence='In 1886, the German Carl Benz patented his Benz Patent-Motorwagen; he is generally acknowledged as the inventor of the car.'),
 KeyDevelopment(year=1807, description="Nicéphore Niépce and his brother Claude created what was proba