In [3]:
import os
from langchain_community.document_loaders import WebBaseLoader
import bs4
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker

In [5]:
import warnings

def simple_warning_format(message, category, filename, lineno, file=None, line=None):
    """Render a warning as a single ``<CategoryName>: <message>`` line.

    Written to be installed as ``warnings.formatwarning``. The location
    arguments (``filename``, ``lineno``, ``file``, ``line``) are accepted so
    the signature stays call-compatible, but they are not used in the output.

    Returns:
        The category's class name, a colon and space, the message text, and a
        trailing newline.
    """
    category_label = category.__name__
    return "{}: {}\n".format(category_label, message)

# Show every UserWarning each time it is raised, and print warnings through
# the compact one-line formatter (simple_warning_format).
warnings.simplefilter(action="always", category=UserWarning)
warnings.formatwarning = simple_warning_format

In [None]:
# CODE FROM CHAPTER 2

#### INDEXING ####
# Restrict HTML parsing to elements carrying one of the post's CSS classes.
post_sections = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

# Load the chapter page from the web, parsing only the selected sections.
loader = WebBaseLoader(
    web_paths=("https://kbourne.github.io/chapter1.html",),
    bs_kwargs={"parse_only": post_sections},
)
docs = loader.load()

# Split the loaded documents with SemanticChunker, which is handed an
# OpenAIEmbeddings instance to work with.
text_splitter = SemanticChunker(OpenAIEmbeddings())
splits = text_splitter.split_documents(docs)

# Store the chunks in a Chroma vector store (embedded with OpenAIEmbeddings)
# and expose that store as a retriever for the chains below.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()


#### RETRIEVAL and GENERATION ####
# Pull the prompt published as "jclemens24/rag-prompt" from the LangChain Hub.
# NOTE(review): the chains in this notebook pipe dicts with "context" and
# "question" keys into this prompt, so those are presumably its input
# variables; confirm against the hub entry.
prompt = hub.pull("jclemens24/rag-prompt")

def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The documents' contents in order, separated by a blank line
        (two newline characters); an empty string when ``docs`` is empty.
    """
    page_texts = []
    for document in docs:
        page_texts.append(document.page_content)
    return "\n\n".join(page_texts)

# Chat model that generates the final answer.
llm = ChatOpenAI(model_name="gpt-4o-mini")

# First stage: send the incoming question to the retriever (flattening the
# retrieved documents with format_docs) while also passing the question
# through unchanged, producing the "context" and "question" entries.
rag_inputs = RunnableParallel(
    context=retriever | format_docs,
    question=RunnablePassthrough(),
)

# Full chain: inputs -> prompt -> chat model -> plain-string output.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

rag_chain.invoke("What are the Advantages of using RAG?")

In [10]:
# Adding sources to your RAG

# rag_chain_from_docs takes an input dict whose "context" entry holds the
# retrieved documents. RunnablePassthrough.assign() overwrites that entry with
# the text produced by format_docs, and the resulting dict is piped through
# prompt, llm, and StrOutputParser().
def _context_to_text(inputs):
    """Format the documents stored under ``inputs["context"]`` as one string."""
    return format_docs(inputs["context"])


rag_chain_from_docs = (
    RunnablePassthrough.assign(context=_context_to_text)
    | prompt
    | llm
    | StrOutputParser()
)


# rag_chain_with_source first runs two steps side by side with
# RunnableParallel: the retriever fills "context" and RunnablePassthrough()
# forwards the input as "question". It then adds an "answer" entry computed
# by rag_chain_from_docs from that dict.
retrieve_with_question = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
)
rag_chain_with_source = retrieve_with_question.assign(answer=rag_chain_from_docs)


In [None]:
# Run the source-aware chain on a sample question. Given how
# rag_chain_with_source is assembled, the displayed result should be a dict
# with "context", "question", and "answer" entries; confirm against the output.
rag_chain_with_source.invoke("What are the advantages of using RAG")