In [1]:
# %% [Imports]
# from langchain_community.document_loaders import WebBaseLoader
# from langchain_community.vectorstores import FAISS
# from langchain_openai import OpenAIEmbeddings
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.tools.retriever import create_retriever_tool
from langchain_groq import ChatGroq
# from langchain import hub
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.messages import SystemMessage
from bs4 import BeautifulSoup
import requests
import os
from dotenv import load_dotenv

In [2]:
# %% [Load API Keys]
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize LLM
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Llama3-8b-8192")

In [3]:
# %% [Step 1: Crawl All Links in the Website]
def extract_links(url):
    """Extracts all hyperlinks from a given webpage."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # print(soup)

    # Find all hyperlinks on the page
    links = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        # Convert relative links to absolute
        if href.startswith("/"):
            href = url.rstrip("/") + href
        if href.startswith("http"):  # Only store valid URLs
            links.add(href)
    
    return list(links)

# Get all links on the main page
base_url = "https://docs.smith.langchain.com/"
all_links = extract_links(base_url)

print(f"Found {len(all_links)} links to process.")

Found 43 links to process.


In [4]:
for i in range(len(all_links)):
    if "https://docs.smith.langchain.com/observability" in all_links[i]:
        print(all_links[i], type(all_links[i]))

https://docs.smith.langchain.com/observability/how_to_guides/dashboards <class 'str'>
https://docs.smith.langchain.com/observability/how_to_guides/tracing/trace_with_langgraph <class 'str'>
https://docs.smith.langchain.com/observability/how_to_guides/trace_with_langchain <class 'str'>
https://docs.smith.langchain.com/observability <class 'str'>


  results = retriever.get_relevant_documents(query)


Answer: Get started with LangSmith | 🦜️🛠️ LangSmith

Source URL: https://docs.smith.langchain.com/


# Trying to add recursivity

In [None]:
from bs4 import BeautifulSoup
import requests
import time

def extract_links_recursively(url, visited=None, depth=2):
    """
    Recursively extracts all hyperlinks from a given webpage and its subpages.
    
    Args:
        url (str): The starting URL to scrape.
        visited (set): A set to keep track of visited URLs.
        depth (int): How deep to follow links (default = 2).
    
    Returns:
        list: A list of unique URLs found on the site.
    """
    if visited is None:
        visited = set()
    
    # Stop recursion if max depth is reached
    if depth == 0:
        return visited

    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")

        for link in soup.find_all("a", href=True):
            href = link["href"]

            # Convert relative links to absolute
            if href.startswith("/"):
                href = url.rstrip("/") + href
            
            # If it's a valid link and not visited, add to the set
            if href.startswith("http") and href not in visited:
                visited.add(href)
                print(f"Found: {href}")  # Debugging output
                
                # Recursively extract links from this new page
                time.sleep(1)  # Avoid overloading the server
                extract_links_recursively(href, visited, depth - 1)

    except requests.RequestException as e:
        print(f"Skipping {url} due to an error: {e}")

    return visited

# Start crawling from the base URL with recursion (depth 2)
base_url = "https://docs.smith.langchain.com/"
all_links = extract_links_recursively(base_url, depth=2)

print(f"\nTotal Links Found: {len(all_links)}")


# Get the URL's from the metada

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the webpage using WebBaseLoader
url = "https://docs.smith.langchain.com/"
loader = WebBaseLoader(url)
docs = loader.load()

# Store source URL in metadata
for doc in docs:
    doc.metadata["source_url"] = url

# Split documents into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = splitter.split_documents(docs)

# Store in FAISS vector database
vectordb = FAISS.from_documents(documents, OpenAIEmbeddings())
retriever = vectordb.as_retriever()

# Custom function to retrieve answer + source URL
def retrieve_answer_with_source(query):
    results = retriever.get_relevant_documents(query)
    
    if not results:
        return "Not found within the document's scope. Do you want me to look it up within my general knowledge?"
    
    response = f"Answer: {results[0].page_content}\n\nSource URL: {results[0].metadata['source_url']}"
    return response

# Test the retrieval function
query = "What is LangSmith?"
print(retrieve_answer_with_source(query))
