In [1]:
import os
import bs4
from dotenv import load_dotenv
import logging
from bs4 import BeautifulSoup, SoupStrainer
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Turn on LangSmith / LangChain tracing so runs are logged as detailed traces for debugging.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Configure logging at INFO level and create the logger used throughout this notebook.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# Load environment variables from a .env file.
load_dotenv()

# Read the OpenAI API key from the environment and fail fast if it is missing or empty.
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    log.error("OPENAI_API_KEY not found in environment variables.")
    raise ValueError("OPENAI_API_KEY not set in the environment.")

# The key is already present in os.environ at this point (it was just read from there),
# so it does not need to be written back.
# NOTE(review): ChatOpenAI is expected to pick the key up from OPENAI_API_KEY — confirm
# against the installed langchain_openai version.
llm = ChatOpenAI(model="gpt-4o-mini")


In [2]:
def get_user_agent() -> str:
    """Return the User-Agent string to send with outgoing HTTP requests.

    The value is read from the ``USER_AGENT`` environment variable.

    Returns:
        The configured user agent, or ``"DefaultLangchainUserAgent"`` when the
        variable is unset or empty (a warning is logged in that case).
    """
    configured_agent = os.environ.get("USER_AGENT")
    if configured_agent:
        return configured_agent

    # Nothing usable configured: tell the user, then fall back to a fixed default.
    log.warning(
        "USER_AGENT environment variable not set, "
        "consider setting it to identify your requests."
    )
    return "DefaultLangchainUserAgent"

In [3]:
# Restrict HTML parsing to the parts of the page we need.
# The strainer keeps only elements whose class is "post-title", "post-header",
# or "post-content", so navigation, footers and other page chrome are dropped.
bs4_strainer = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content")
)

# Request settings for the fetch: the blog post to load and the headers to send.
# The User-Agent comes from get_user_agent() (the USER_AGENT environment variable,
# or a fixed default string when that is not set).
post_url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
request_headers = {"User-Agent": get_user_agent()}

# Build the document loader.
#   web_paths       - the page(s) to fetch
#   bs_kwargs       - keyword arguments for BeautifulSoup; `parse_only` applies the strainer
#   requests_kwargs - keyword arguments for the HTTP request (here: custom headers)
loader = WebBaseLoader(
    web_paths=(post_url,),
    bs_kwargs={"parse_only": bs4_strainer},
    requests_kwargs={"headers": request_headers},
)

# Fetch the page and apply the filtering above; the resulting documents are
# kept in `docs` for the following cells.
docs = loader.load()

# Optional sanity check: uncomment to see the size of the extracted text (in
# characters) and its first 500 characters.
# print(len(docs[0].page_content))
# print(docs[0].page_content[:500])


In [4]:
# Chunking parameters handed to the splitter: target chunk size and the overlap
# shared between consecutive chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Split the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # adds a "start_index" entry to each chunk's metadata
)
all_splits = text_splitter.split_documents(docs)

# Only the last expression of a cell is displayed, so intermediate values are
# printed explicitly instead of being left as bare expressions.
print(f"Number of chunks: {len(all_splits)}")
print(f"Length of first chunk: {len(all_splits[0].page_content)}")

# Inspect the metadata (source URL and start index) of one sample chunk.
all_splits[10].metadata

{'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',
 'start_index': 7056}