In [2]:
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental beautifulsoup4 langchain-community langchain chromadb beautifulsoup4

# Install compatible versions of langchain-core and langchain-openai
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install beautifulsoup4==4.12.3

Found existing installation: langchain-core 0.3.12
Uninstalling langchain-core-0.3.12:
  Successfully uninstalled langchain-core-0.3.12
Found existing installation: langchain-openai 0.2.1
Uninstalling langchain-openai-0.2.1:
  Successfully uninstalled langchain-openai-0.2.1
Found existing installation: langchain-experimental 0.3.2
Uninstalling langchain-experimental-0.3.2:
  Successfully uninstalled langchain-experimental-0.3.2
Found existing installation: beautifulsoup4 4.12.3
Uninstalling beautifulsoup4-4.12.3:
  Successfully uninstalled beautifulsoup4-4.12.3
Found existing installation: langchain-community 0.3.1
Uninstalling langchain-community-0.3.1:
  Successfully uninstalled langchain-community-0.3.1
Found existing installation: langchain 0.3.1
Uninstalling langchain-0.3.1:
  Successfully uninstalled langchain-0.3.1
Found existing installation: chromadb 0.5.11
Uninstalling chromadb-0.5.11:
  Successfully uninstalled chromadb-0.5.11
Collecting langchain-core==0.3.6
  Using cached 

In [3]:
!pip install langchain-google-genai



In [4]:
import os

# Identify this notebook's HTTP requests with a fixed user agent string.
# NOTE(review): presumably this has to be set before the LangChain imports
# below so the web loader picks it up — confirm.
os.environ["USER_AGENT"] = "RAGUserAgent"

In [6]:
import bs4
import openai
import langchain
import chromadb

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

from langchain_experimental.text_splitter import SemanticChunker

from langchain_openai import ChatOpenAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from langchain import hub
from google.colab import userdata



### Environment

In [16]:
# Copy each API key from the Colab secrets store into the process environment
# under the same name (OpenAI first, then Google).
for secret_name in ('OPENAI_API_KEY', 'GOOGLE_API_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)


In [17]:
# Hand the OpenAI key to the openai module directly as well.
openai.api_key = userdata.get('OPENAI_API_KEY')

# Chat model used for generation, plus the parser applied as the last step
# of the generation chain.
llm = ChatOpenAI(
    model_name='gpt-4o-mini',
    temperature=0,
)
str_output_parser = StrOutputParser()

# Embedding model shared by the semantic splitter and the vector store.
gemini_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

# Question used to exercise the finished RAG chain.
user_query = 'What are the advantages of using RAG'

### LLM

### Source  

In [8]:
# Restrict BeautifulSoup parsing to elements carrying these CSS classes.
bs_kwargs = {
    'parse_only': bs4.SoupStrainer(
        class_=("post-content", "post-title", "post-header"),
    ),
}

# Loader for the single source page, parsed with the filter above.
loader = WebBaseLoader(
    web_paths=('https://kbourne.github.io/chapter1.html',),
    bs_kwargs=bs_kwargs,
)

# Fetch the page and wrap its content as LangChain documents.
docs = loader.load()

### Splitter

In [10]:
# Split the loaded documents into chunks with a splitter driven by the
# Gemini embedding model.
text_splitter = SemanticChunker(embeddings=gemini_embeddings)
splits = text_splitter.split_documents(documents=docs)

### Embedding and Retrieving

In [20]:
# Embed the chunks with the Gemini embeddings and index them in Chroma.
# NOTE(review): no collection name or persist directory is passed here;
# confirm whether re-running this cell adds the same chunks a second time.
vector_store = Chroma.from_documents(
    documents=splits,
    embedding=gemini_embeddings,
)

# Retriever view over the store, used as the `context` step of the chain.
retriever = vector_store.as_retriever()

### Generation

In [14]:
# Fetch the RAG prompt template published on the LangChain hub; it is piped
# into the LLM when the generation chain is assembled.
prompt = hub.pull(
    'jclemens24/rag-prompt'
)



### Post Processing

In [23]:
def format_docs(docs):
    """Join the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by a blank
        line (two newlines). An empty input yields an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

### Chaining

In [21]:
# Generation chain: replace the retrieved documents under 'context' with
# their joined text, fill the prompt, call the LLM, and parse the reply with
# the string output parser.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda x: format_docs(x['context']))
    | prompt
    | llm
    | str_output_parser
)

# Full chain: feed the input to the retriever ('context') and pass it through
# unchanged ('question'), then attach the generated text under 'answer'.
rag_chain_from_source = RunnableParallel(
    {
        'context': retriever,
        'question': RunnablePassthrough(),
    }
).assign(answer=rag_chain_from_docs)

### Testing

In [24]:
# Run the full RAG chain on the sample question; the bare name on the last
# line makes the notebook display the returned value.
result = rag_chain_from_source.invoke(
    user_query
)
result

{'context': [Document(metadata={'source': 'https://kbourne.github.io/chapter1.html'}, page_content="In both cases, we are talking about data that was not present during the training of the LLM. You can have the latest LLM trained on the most tokens ever, exceeding 10 trillion, but if that data was not present for the training, then the LLM will be at a disadvantage to help you reach your full productivity. Ultimately, this highlights the fact that for most organizations, connecting to data an LLM is not yet familiar with is a central need for them to fully utilize that LLM. RAG is the most popular paradigm for doing this. This book focuses on showing you how to set up a RAG application with your data, as well as how to get the most out of it in various situations. I intend to give you an in-depth understanding of RAG and its importance in leveraging LLM within the context of a company's private or specific data needs. Advantages of RAG#\nPotential advantages of using RAG include improv