# RAG setup

## Imports

In [None]:
%pip install langchain_community
%pip install langchain_experimental
%pip install langchain-openai
%pip install langchainhub
%pip install chromadb
%pip install langchain
%pip install beautifulsoup4

In [15]:
# importing the packages
import os
from langchain_community.document_loaders import WebBaseLoader
import bs4
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker

In [9]:
## Setting up your OpenAI key
# Never store a real key in the notebook itself: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise prompt for it without echoing.
# NOTE: the prompt needs an interactive kernel; for unattended runs, export
# OPENAI_API_KEY before starting the kernel.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# Mirror the key onto the openai module as the original cell did.
openai.api_key = os.environ['OPENAI_API_KEY']

## Indexing

1.   Web loading and crawling.
2.   Splitting the data into digestible chunks for the Chroma DB vectorizing algorithm.
3. Embedding and indexing those chunks.
4. Adding those chunks and embeddings to the Chroma DB vector store.



In [4]:
# Web loading and crawling

# Parse only the elements carrying the post's title, header and content
# classes; this filter is handed to BeautifulSoup through `parse_only`.
post_sections = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

# A single page is loaded; web_paths stays a one-element tuple.
loader = WebBaseLoader(
    web_paths=("https://kbourne.github.io/chapter1.html",),
    bs_kwargs={"parse_only": post_sections},
)

docs = loader.load()

## Splitting

In [12]:
# No embedding model is named here, so OpenAIEmbeddings falls back to its
# default (text-embedding-ada-002).
chunk_embedder = OpenAIEmbeddings()

# The semantic chunker uses those embeddings to split the loaded documents.
text_splitter = SemanticChunker(chunk_embedder)
splits = text_splitter.split_documents(docs)

## Embedding and indexing the chunks

In [19]:
# Embed the chunks and load them into a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Expose the store through the retriever interface used by later cells.
retriever = vectorstore.as_retriever()

In [25]:
# Exercise the retriever on its own before it is wired into a chain.
query = "Is Rag really worth it?"

# `invoke` is the supported entry point for retrievers; the older
# `get_relevant_documents` method is deprecated in current langchain-core
# releases and emits a deprecation warning.
relevant_docs = retriever.invoke(query)
relevant_docs

[Document(metadata={'source': 'https://kbourne.github.io/chapter1.html'}, page_content='Can you imagine what you could do with all of the benefits mentioned above, but combined with all of the data within your company, about everything your company has ever done, about your customers and all of their interactions, or about all of your products and services combined with a knowledge of what a specific customer’s needs are? You do not have to imagine it, that is what RAG does! Even smaller companies are not able to access much of their internal data resources very effectively. Larger companies are swimming in petabytes of data that is not readily accessible or is not being fully utilized. Prior to RAG, most of the services you saw that connected customers or employees with the data resources of the company were really just scratching the surface of what is possible compared to if they could access ALL of the data in the company. With the advent of RAG and generative AI in general, corpor

## Retrieval and generation

Take in a user query.

*   Vectorize that user query.
* Perform a similarity search of the vector store to find the closest vectors to the user query vector, as well as their associated content.
* Pass the retrieved content into a prompt template, a process known as hydrating.
* Pass that hydrated prompt to the LLM.
* Once you receive a response from the LLM, present it to the user.


## Prompt templates from the LangChain Hub

In [None]:
# NOTE(review): langchain is already installed by the install cell under
# "Imports", so this repeated install looks redundant — confirm before removing.
%pip install langchain

In [32]:
# LangSmith / LangChain Hub settings.
# Plain Python assignments are never seen by the client libraries, which read
# these settings from the process environment. Export them instead, keeping
# any value the environment already defines, and leave the same-named Python
# variables in place for later cells.
LANGCHAIN_ENDPOINT = os.environ.setdefault(
    "LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com"
)
LANGCHAIN_PROJECT = os.environ.setdefault(
    "LANGCHAIN_PROJECT", "pr-giving-pegboard-42"
)

# Never hardcode the API key in the notebook. It is taken from the environment
# only; an empty string means no key was configured. Set LANGCHAIN_API_KEY
# before starting the kernel to use an authenticated account.
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY", "")

In [None]:
# Pull the RAG prompt template from the LangChain Hub and print it so its
# wording can be inspected before it is used in the chain.
prompt_ref = "rlm/rag-prompt"
prompt = hub.pull(prompt_ref)
print(prompt)

## Defining a document-formatting function

In [34]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in iteration order, separated by a blank
        line. An empty iterable yields an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

## Defining your LLM


In [35]:
# Chat model used for the generation step; temperature is fixed at 0.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
)

## Setting up a LangChain chain using LCEL

In [37]:
# Inputs for the prompt: "context" pipes the retriever's results through
# format_docs, while "question" forwards the caller's input unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Prompt inputs -> prompt template -> chat model -> plain-string output.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

## Submitting a question for RAG

In [None]:
# Run the whole chain on a sample question; the result is the cell's output.
user_question = "what is rag?"
rag_chain.invoke(user_question)