In [1]:
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document

In [2]:
# URLs that have already been requested (stored without their #fragment).
# The set lives at module level, so it is shared by every crawl() call in the
# kernel: a URL fetched by an earlier call is skipped by later ones. Call
# `visited.clear()` before repeating a crawl, otherwise it returns [].
visited = set()

# Browser-like User-Agent sent with every request; built once instead of on
# every recursive call.
_CRAWL_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    )
}


def _internal_links(soup, page_url, base_netloc):
    """Return the unique http(s) links on ``soup`` whose host is ``base_netloc``.

    Links are resolved against ``page_url``, stripped of their ``#fragment``
    and returned in first-seen order.
    """
    found = {}  # dict used as an insertion-ordered set
    for anchor in soup.find_all("a", href=True):
        try:
            target = urljoin(page_url, anchor["href"]).partition("#")[0]
            parts = urlparse(target)
        except ValueError:
            # Malformed href: drop this link only, keep the rest of the page.
            continue
        if parts.scheme in ("http", "https") and parts.netloc == base_netloc:
            found.setdefault(target, None)
    return list(found)


def crawl(url, base_url, depth=0, max_depth=5):
    """Recursively download ``url`` and the same-host pages it links to.

    Args:
        url: Page to fetch. A trailing ``#fragment`` is ignored, so anchors
            pointing into the same page are not fetched again.
        base_url: Only the host (netloc) of this URL is used: a link is
            followed when its host equals the host of ``base_url``. The path
            of ``base_url`` does not restrict the crawl.
        depth: Depth of ``url`` relative to the starting page (0 for the
            initial call).
        max_depth: Pages deeper than this are not fetched.

    Returns:
        A list of ``Document`` objects, one per successfully fetched page,
        with the page text as ``page_content`` and ``{'source': url}`` as
        metadata. Returns ``[]`` when the depth limit is exceeded, the URL was
        already visited, the response status is not 200, or an error occurs.

    Side effects:
        Adds every attempted URL to the module-level ``visited`` set and
        prints one progress line per page.

    Note:
        The traversal is depth-first and a URL is marked visited the first
        time it is reached, whatever its depth at that moment.
    """
    url = url.partition("#")[0]
    if depth > max_depth or url in visited:
        return []

    print(f"Crawling: {url}")
    visited.add(url)

    try:
        response = requests.get(url, headers=_CRAWL_HEADERS, timeout=5)
        if response.status_code != 200:
            print(f"Failed ({response.status_code}): {url}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text(separator="\n", strip=True)
        docs = [Document(page_content=text, metadata={'source': url})]
        links = _internal_links(soup, url, urlparse(base_url).netloc)
    except Exception as e:
        # Deliberately broad: one unreachable or unparsable page must not
        # abort a long crawl; the page is reported and skipped.
        print(f"Error: {e}")
        return []

    for link in links:
        docs.extend(crawl(link, base_url, depth + 1, max_depth))
    return docs

In [18]:
# Start crawling from the PyTorch GitHub organisation page.
# crawl() records fetched URLs in the module-level `visited` set; clear it
# first so that re-running this cell crawls again instead of returning [].
# NOTE: only the host of the second argument limits which links are followed.
visited.clear()
docs1 = crawl("https://github.com/pytorch",
              "https://github.com/pytorch", max_depth=2)

In [19]:
# Start crawling from the PyTorch stable documentation index.
# crawl() records fetched URLs in the module-level `visited` set; clear it
# first so that re-running this cell crawls again instead of returning [].
# NOTE: only the host of the second argument limits which links are followed.
visited.clear()
docs2 = crawl("https://docs.pytorch.org/docs/stable/index.html",
              "https://docs.pytorch.org/docs/stable/index.html", max_depth=2)

Crawling: https://docs.pytorch.org/docs/stable/index.html
Crawling: https://docs.pytorch.org/docs/stable/community/build_ci_governance.html
Crawling: https://docs.pytorch.org/docs/stable/community/contribution_guide.html
Crawling: https://docs.pytorch.org/docs/stable/community/design.html
Crawling: https://docs.pytorch.org/docs/stable/community/governance.html
Crawling: https://docs.pytorch.org/docs/stable/community/persons_of_interest.html
Crawling: https://docs.pytorch.org/docs/stable/notes/amp_examples.html
Crawling: https://docs.pytorch.org/docs/stable/notes/autograd.html
Crawling: https://docs.pytorch.org/docs/stable/notes/broadcasting.html
Crawling: https://docs.pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html
Crawling: https://docs.pytorch.org/docs/stable/notes/cuda.html
Crawling: https://docs.pytorch.org/docs/stable/notes/custom_operators.html
Crawling: https://docs.pytorch.org/docs/stable/notes/ddp.html
Crawling: https://docs.pytorch.org/docs/stable/notes

In [20]:
# Merge both crawl results into one new list: GitHub pages first, then the
# documentation pages.
docs = [*docs1, *docs2]

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the crawled pages into overlapping chunks for embedding: chunk_size
# 2000 with chunk_overlap 200 (measured by the splitter's length function,
# presumably characters - confirm against the installed langchain version).
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000,
                                               chunk_overlap = 200)
# `docs` is the combined crawl result built in an earlier cell.
splits = text_splitter.split_documents(docs)

In [None]:
from langchain.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# The API key is read from the GOOGLE_API_KEY environment variable rather than
# being hard-coded in the notebook; a missing variable raises KeyError here,
# before any request is made.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)
# Embed every chunk in `splits` and index the vectors in a Chroma store.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=embeddings)

In [23]:
# Wrap the vector store as a retriever (default search settings) so it can be
# piped into the RAG chain built in a later cell.
retriever = vectorstore.as_retriever()

In [24]:
from langchain import hub 
# Pull the "rlm/rag-prompt" prompt from the LangChain Hub; the chain built
# later supplies it with `context` and `question` inputs.
prompt = hub.pull("rlm/rag-prompt")



In [25]:
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents in input order, separated by single newlines; an empty
        string when ``docs`` is empty.
    """
    contents = [item.page_content for item in docs]
    return "\n".join(contents)

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# The API key is read from the GOOGLE_API_KEY environment variable rather than
# being hard-coded in the notebook; a missing variable raises KeyError here.
# temperature=0 is set to make the answers as repeatable as possible.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

In [27]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser 

# RAG pipeline, applied left to right to the input question:
#   1. "context": the retriever output is passed through format_docs;
#      "question": the input is forwarded unchanged.
#   2. Both values are fed to the prompt.
#   3. The prompt output is sent to the LLM.
#   4. StrOutputParser turns the model output into a plain string.
rag_chain = ({"context":retriever | format_docs,
              "question":RunnablePassthrough()}
              | prompt 
              | llm 
              | StrOutputParser())

In [28]:
# Ask the chain a sample question; the returned answer string is displayed as
# the cell output.
rag_chain.invoke("use of torch.utils.dlpack")

"`torch.utils.dlpack` facilitates interoperability between PyTorch and other libraries by allowing tensors to share memory. It includes functions like `from_dlpack` to convert tensors from external libraries into PyTorch tensors and `to_dlpack` to create DLPack capsules representing PyTorch tensors. The DLPack capsule shares the tensor's memory."