In [3]:
import getpass
import os
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [8]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Task Decomposition?")

ValueError: Expected IDs to be a non-empty list, got 0 IDs

In [5]:
# cleanup
vectorstore.delete_collection()

In [15]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Only keep post title, headers, and content from the full HTML.
bs4_strainer = bs4.SoupStrainer()
loader = WebBaseLoader(
    web_paths=("https://www.nytimes.com/2024/08/07/us/politics/tim-walz-kamala-harris-2024.html",),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()

print(docs[0].page_content)




Tim Walz’s Rise in the Democratic Party Was No Accident - The New York Times
  




















Skip to contentSkip to site indexPolitics Today’s PaperliveUpdatesAug. 8, 2024, 11:41 a.m. ETPoll TrackerWhy Harris Picked WalzWalz’s Path to ProminenceElection CalendarTimeline: Candidates’ CareersAdvertisementSKIP ADVERTISEMENTSupported bySKIP ADVERTISEMENTTim Walz’s Sudden Rise in the Democratic Party Was No AccidentMore than a year ago, Tim Walz and his aides decided to be ready in case an irresistible opportunity arose. Their tightly held strategy helped them catch political lightning in a bottle.Listen to this article · 10:25 min Learn moreShare full article1060Just months ago, Gov. Tim Walz of Minnesota was little known on the national stage. He was not initially considered a front-runner to become Vice President Kamala Harris’s running mate. Credit...Erin Schaff/The New York TimesBy Reid J. EpsteinLisa LererShane Goldmacher and Theodore SchleiferReid J. Epstein reported from P