In [None]:
%load_ext rich

In [None]:
import requests
from rich import print
from pathlib import Path
from urllib.parse import urlparse, urljoin
from langchain.docstore.document import Document
from rich.console import Console
from rich.status import Status
from langchain.document_loaders import AsyncHtmlLoader
import bs4

In [None]:
def get_url_base_path(url: str):
    """Return the "directory" part of ``url``.

    The result is built from the scheme, the host and the path up to (but not
    including) its last ``/``, and always ends with a single ``/``. The query
    string and fragment are not part of the result.
    """
    parts = urlparse(url)
    # Everything before the last '/' of the path, i.e. the path minus the file name.
    directory = parts.path.rsplit('/', 1)[0]
    return f"{parts.scheme}://{parts.netloc}{directory}/"

def find_linked_urls(url: str, tag_class_name: str='right-next', collected_urls=None, timeout: float=30.0):
    """Follow a chain of "next page" links starting at ``url`` and collect the targets.

    Each page is downloaded and parsed, and the first ``<a>`` tag matching
    ``tag_class_name`` is taken as the link to the next page. The chain ends
    when a page has no such tag, the tag has no ``href``, or the target URL
    has already been collected (which prevents endless loops).

    Args:
        url: Page to start from. It is fetched but not itself added to the result.
        tag_class_name: Class used to locate the "next" anchor on each page.
        collected_urls: Optional list of already known URLs. It is extended
            in place and returned; a new list is created when omitted.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the crawl forever.

    Returns:
        The list of collected URLs, in the order they were discovered.
    """
    if collected_urls is None:
        collected_urls = []
    # Set mirror of the list: constant-time "already seen" checks.
    seen = set(collected_urls)

    # Iterate instead of recursing so a long chain of pages cannot exhaust
    # Python's recursion limit.
    current_url = url
    while True:
        # Relative hrefs are resolved against the directory of the current page.
        base_url = get_url_base_path(current_url)

        response = requests.get(current_url, timeout=timeout)
        soup = bs4.BeautifulSoup(response.text, "html.parser")

        tag = soup.find("a", {"class": tag_class_name})
        if tag is None or 'href' not in tag.attrs:
            break

        next_url = urljoin(base_url, tag['href'])
        if next_url in seen:
            # Already collected: stop here to avoid cycling forever.
            break

        seen.add(next_url)
        collected_urls.append(next_url)
        current_url = next_url

    return collected_urls

### Scraping TensorRT documentation

<https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html>

This is a single-page document with no next-links, so it is fetched directly instead of being crawled.

In [None]:
# Build one LangChain Document from the single-page TensorRT developer guide.
tensort_url = "https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html"

metadata = dict()
metadata['source'] = tensort_url
metadata['language'] = 'en'
# Start from empty text so the Document below can always be built, and so a
# re-run never picks up a stale page_content left over in the kernel.
page_content = ""
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
tensorrt_html = requests.get(tensort_url, timeout=30).text
soup = bs4.BeautifulSoup(tensorrt_html, "html.parser")
# soup.title is None when the page has no <title> tag.
title = soup.title.text.strip() if soup.title is not None else ""
if title:
    metadata['title'] = title
# The text is taken from <article id="contents">.
contents_div = soup.find("article", {"id": "contents"})
if contents_div:
    page_content = contents_div.text.strip()

tensort_doc = Document(page_content=page_content, metadata=metadata)
json_obj = tensort_doc.to_json()
# print(json_obj)
# print(tensort_doc)

## Let's scrape the base url and all linked urls

<https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/getting_started/quickstart.html>

In [None]:
initial_url = "https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/getting_started/quickstart.html"
# find_linked_urls only returns the pages reached by following "next" links; the
# starting page is fetched but never added to its result. Put it first so the
# base URL is scraped as well, and drop any repeat in case the chain of links
# leads back to it.
all_linked_urls = [initial_url] + [
    linked_url for linked_url in find_linked_urls(initial_url) if linked_url != initial_url
]
# print(all_linked_urls)

In [None]:
# Write urls to text file
# Persist the collected URLs: one URL per line (newline-joined, no trailing
# newline), UTF-8 encoded, at a path relative to the current working directory.
URLS_FILE = Path("./nvidia_trition_inference_server.urls.txt")
urls_to_write = "\n".join(all_linked_urls)
# write_text returns the number of characters written; as the cell's last
# expression, that count is what the notebook displays.
URLS_FILE.write_text(urls_to_write, encoding="utf-8")

#### Now let's prepare LangChain Documents for each url

Inspecting the web pages shows that all of the main content sits under `<div class="tex2jax_ignore mathjax_ignore section" ...>`.

The function `docs_from_html_div` extracts the text from this div and returns a `Document` whose `metadata` holds the **source** URL, the **language** and, when the page has a non-empty one, the **title**.

In [None]:
# function to extract all text under a div with a class name
def docs_from_html_div(url: str, div_class_name: str, lang: str='en', timeout: float=30.0):
    """Download ``url`` and wrap the text of one of its divs in a ``Document``.

    Args:
        url: Page to download; stored as ``metadata['source']``.
        div_class_name: Class value used to locate the content ``<div>``
            (passed to ``BeautifulSoup.find`` as the class filter).
        lang: Language code stored as ``metadata['language']``.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the call forever.

    Returns:
        A ``Document`` whose ``page_content`` is the stripped text of the first
        matching div, or an empty string when no div matches.
        ``metadata['title']`` is only set when the page has a non-empty title.
    """
    metadata = {'source': url, 'language': lang}
    page_content = ""

    response = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # soup.title is None for pages without a <title> tag; treat that like an
    # empty title instead of failing with AttributeError.
    title_tag = soup.title
    title = title_tag.text.strip() if title_tag is not None else ""
    if title:
        metadata['title'] = title

    # extract all text under the div with the class name
    div = soup.find("div", {"class": div_class_name})
    if div is not None:
        page_content = div.text.strip()
    return Document(page_content=page_content, metadata=metadata)

In [None]:

# Build one Document per collected URL, showing a live spinner while pages load.
console = Console()
all_docs = []
desired_class = "tex2jax_ignore mathjax_ignore section"
# Status is a context manager: entering starts the spinner and leaving stops
# it, so the spinner is also cleaned up when a page fails part-way through.
with Status(console=console, spinner="earth", status="Preparing docs") as status:
    for url in all_linked_urls:
        status.update(status=f"Preparing document for [i green]{url}[/i green]")
        doc = docs_from_html_div(url, desired_class)
        all_docs.append(doc)

In [None]:
# urls = ["https://www.espn.com", "https://lilianweng.github.io/posts/2023-06-23-agent/"]
# loader = AsyncHtmlLoader(urls)
# docs = loader.load()

In [None]:
# urls = ["https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/getting_started/quickstart.html"]
# loader = AsyncHtmlLoader(all_linked_urls)
# html_docs = loader.load()

In [None]:
# from langchain.document_transformers import BeautifulSoupTransformer
# bs_transformer = BeautifulSoupTransformer()
# docs_transformed = bs_transformer.transform_documents(
#     html_docs, tags_to_extract=["div"]
# )
# docs_transformed

In [None]:
# doc = docs_transformed[0]
# doc.page_content.find_all(

In [None]:
# Extract the desired class from the transformed content
# desired_class = "tex2jax_ignore mathjax_ignore section"
# extracted_data = []
# for doc in docs_transformed:
#     for tag in doc.page_content.find_all(class_=desired_class):
#         extracted_data.append(tag.text)

# # Print the extracted data
# for data in extracted_data:
#     print(data)


### HTML2Text Transformer

In [None]:
from langchain.document_transformers import Html2TextTransformer

# html_docs was previously only assigned in a commented-out cell, so this cell
# raised NameError on a fresh kernel. Load the raw HTML for every collected URL
# here so the notebook survives Restart & Run All.
html_docs = AsyncHtmlLoader(all_linked_urls).load()

# Convert the raw HTML of each loaded page into text.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(html_docs)
docs_transformed