In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ['LANGCHAIN_TRACING_V2'] = "true"
os.environ['LANGCHAIN_PROJECT'] = os.getenv("LANGCHAIN_PROJECT")

In [3]:
from langchain_community.document_loaders import WebBaseLoader #Used for web scraping

In [10]:
loader = WebBaseLoader("https://www.comet.com/site/blog/langchain-document-loaders-for-web-data/")

In [11]:
docs = loader.load()
docs

[Document(metadata={'source': 'https://www.comet.com/site/blog/langchain-document-loaders-for-web-data/', 'title': 'LangChain Document Loaders for Web\xa0Data - Comet', 'description': 'The effectiveness of RAG hinges on the method used to retrieve documents. Explore 3 key LangChain document loaders + how they effect output', 'language': 'en-US'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLangChain Document Loaders for Web\xa0Data - Comet\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n \n\n\n\n\n\nskip to Main Content\n\n\n\n\n\n\n\n\n\nEnterprise\nProducts\n\nExperiment Management\nArtifacts\nModel Registry\nModel Production Monitoring\nLLMOps\n\n\nDocs\nPricing\nCustomers\nLearn\n\nResources\nBlog\nDeep Learning Weekly\nLLM Course\n\n\nCompany\n\nAbout Us\nNews and Events\n\nEvents\nPress Releases\n\n\nCareers\nContact Us\nLeadership\n\n\nLogin\nGet Demo\nTry Comet Free\n\n\n\n\n\n \n\n\n\n\nSearch\n\n\nSubmit\n\n\nHome › Blog › LangChain Document Loaders for Web\xa0D

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter #split the text into documents 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
documents = text_splitter.split_documents(docs)


In [13]:
documents

[Document(metadata={'source': 'https://www.comet.com/site/blog/langchain-document-loaders-for-web-data/', 'title': 'LangChain Document Loaders for Web\xa0Data - Comet', 'description': 'The effectiveness of RAG hinges on the method used to retrieve documents. Explore 3 key LangChain document loaders + how they effect output', 'language': 'en-US'}, page_content='LangChain Document Loaders for Web\xa0Data - Comet\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n \n\n\n\n\n\nskip to Main Content\n\n\n\n\n\n\n\n\n\nEnterprise\nProducts\n\nExperiment Management\nArtifacts\nModel Registry\nModel Production Monitoring\nLLMOps\n\n\nDocs\nPricing\nCustomers\nLearn\n\nResources\nBlog\nDeep Learning Weekly\nLLM Course\n\n\nCompany\n\nAbout Us\nNews and Events\n\nEvents\nPress Releases\n\n\nCareers\nContact Us\nLeadership\n\n\nLogin\nGet Demo\nTry Comet Free\n\n\n\n\n\n \n\n\n\n\nSearch\n\n\nSubmit\n\n\nHome › Blog › LangChain Document Loaders for Web\xa0Data\n\n\n\n\n\n\nLangChain Document 

In [14]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [15]:
from langchain_community.vectorstores import FAISS #vector database
vectorstoredb = FAISS.from_documents(documents, embeddings)

In [16]:
vectorstoredb

<langchain_community.vectorstores.faiss.FAISS at 0x1df8ee4cd50>

In [17]:
query = "When extracting text from websites, the WebBaseLoader" #uses cosine similarity 
result = vectorstoredb.similarity_search(query)
result[0].page_content

'Quality of Extracted Text: WebBaseLoader relies on HTML structure and excels with well-structured websites. However, it might struggle with JavaScript-generated dynamic content, which is increasingly common in modern web design. This means the text it retrieves is as good as the HTML it interprets.\nEfficiency: WebBaseLoader is efficient and fast, handling multiple requests seamlessly. This efficiency translates into quicker embedding of documents into your vector database, which is crucial for large-scale applications.\nRelevance: The relevance of the extracted text can vary. In cases where websites are loaded with ads or unrelated content alongside the main text, WebBaseLoader might fetch some noise and valuable data. This could impact the precision of your RAG system’s outputs.'

In [18]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model = 'gpt-4o')
print(llm)

client=<openai.resources.chat.completions.Completions object at 0x000001DFDB69B510> async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x000001DFDB5C1C90> root_client=<openai.OpenAI object at 0x000001DFDB75D9D0> root_async_client=<openai.AsyncOpenAI object at 0x000001DFDC785C50> model_name='gpt-4o' model_kwargs={} openai_api_key=SecretStr('**********')


In [23]:
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain_core.prompts import ChatPromptTemplate #Using Chat Prompt Template
#using context so that it answers only based on the most similar document(context)
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based only on the provided context:
    <context>
        {context} 
    </context>
"""
)

document_chain = create_stuff_documents_chain(llm,prompt)

In [24]:
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following question based only on the provided context:\n    <context>\n        {context}\n    </context>\n'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x000001DFDB69B510>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x000001DFDB5C1C90>, root_client=<openai.OpenAI object at 0x000001DFDB75D9D0>, root_async_client=<openai.AsyncOpenAI object at 0x000001DFDC785C50>, model_name='gpt-4o', model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser(), kwargs={}, config={'run_

In [26]:
from langchain_core.documents import Document #manually providing context
document_chain.invoke({
    "input":"The SeleniumURLLoader is your toolkit’s Swiss",
    "context": [Document(page_content="The SeleniumURLLoader is your toolkit’s Swiss army knife for dealing with dynamic, JavaScript-heavy websites. It offers a depth of text retrieval unmatched by more straightforward methods but requires more resources and time. A RAG pipeline is the ideal choice when your focus is on comprehensively capturing the essence of modern, interactive web pages.")]
})

'The SeleniumURLLoader is particularly suited for dealing with what type of websites?\n\nBased on the provided context, the SeleniumURLLoader is particularly suited for dealing with dynamic, JavaScript-heavy websites.'

In [29]:
retriever = vectorstoredb.as_retriever() #retriever automatically retrieves the context (no need for cosine similarity)
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever, document_chain)
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001DF8EE4CD50>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following question based only on the provided context:\n    <context>\n        {context}\n    </context>\n'), additional_kwargs={})]

In [30]:
response = retrieval_chain.invoke({
    "input":"SeleniumURLLoader is used when?"
})
response['answer']

'**What are the benefits and drawbacks of using SeleniumURLLoader for text retrieval?**\n\n**Benefits:**\n1. **Comprehensive Text Retrieval:** SeleniumURLLoader can capture text that might be missed by other methods, including content that appears due to user interactions or is dynamically loaded by JavaScript.\n2. **Accuracy and Relevance:** The text retrieved tends to be highly accurate and reflective of the user’s experience on the website, leading to more relevant and context-rich embeddings in a vector database.\n\n**Drawbacks:**\n1. **Performance Considerations:** SeleniumURLLoader is slower and more resource-intensive than straightforward HTTP request methods, which could lead to longer processing times, especially for large volumes of data.'