# Download AWS documentation for QnA with LLMs

- Download HTML pages
- Convert HTML pages to text and then to LangChain docs
- Split docs into chunks using TextSplitter
- Encode and Ingest chunks with `Cohere-v3` embedding model into a new collection `aws_docs_coherev3` in Qdrant vectordb
- Test retrieval quality

In [17]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/utils")

In [18]:
import argparse
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import boto3
import requests
from bs4 import BeautifulSoup
from rich import print
# from rich.console import Console
# from rich.status import Status
from utils import get_inference_parameters

%load_ext rich

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


### Create directory for downloading html files

In [3]:
DATA_DIR = Path(f"../data/aws")
start_url = "https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html"

# console = Console()
current_url = start_url
failed_downloads = []
scraped_urls = []

root_url = urlparse(start_url)

# Extract topic name from url - https://docs.aws.amazon.com/bedrock/
folder_name = root_url.path.lstrip("/").split("/")[0]  # bedrock

# Root directory for storing downloaded html files
DATA_DIR = DATA_DIR.joinpath(f"{folder_name}/html")
DATA_DIR.mkdir(exist_ok=True, parents=True)
print(f"HTML files will be downloaded to {str(DATA_DIR)}")

### Download linked html docs to directory

In [5]:
# Function to download HTML content from a given URL
def download_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        return None


# Function to find the URL of the next topic in the HTML content
def find_next_url(soup, base_url):
    next_topic = soup.find("div", {"class": "next-link"})
    if next_topic:
        relative_url = next_topic.get("href")
        next_url = urljoin(base_url, relative_url)
        return next_url
    return None


# Function to write error messages to a file
def write_error_to_file(error_message, target_dir):
    error_path = target_dir.joinpath("errors.txt")
    _ = error_path.write_text(error_message, encoding="utf-8")

In [None]:
# status = Status(console=console, status=f"Downloading {start_url}", spinner="earth")
# status.start()

# Extract the base URL for constructing full URLs
base_url = "/".join(start_url.split("/")[:-1]) + "/"

while current_url:
    parsed_url = urlparse(current_url)
    file_name = parsed_url.path.split("/")[-1] or "index"
    # file_name = f"{file_name}.html"
    html_file_path = DATA_DIR.joinpath(file_name)

    # status.update(status=f"Downloading: {base_url}{file_name}")
    html_content = download_page(current_url)
    if html_content:
        html_file_path = DATA_DIR.joinpath(file_name)
        _ = html_file_path.write_text(html_content, encoding="utf-8")

        # add url to scraped urls
        scraped_urls.append(current_url)

        # find link to the next url
        soup = BeautifulSoup(html_content, "html.parser")
        next_url = find_next_url(soup, base_url)

        if next_url:
            current_url = next_url
        else:
            print("No more 'Next Topic' found. Exiting.")
            status.stop()
            break
    else:
        failed_downloads.append(current_url)
        write_error_to_file(f"Failed to download {current_url}")

    time.sleep(1)

if failed_downloads:
    print(f"Failed to download the following URLs: {failed_downloads}")

### Convert HTML2Text using Unstructured

In [9]:
file_urls = list(DATA_DIR.glob("*.html"))
print(f"Found {len(file_urls)} HTML files")

In [10]:
from typing import List

from langchain.docstore.document import Document
from unstructured.cleaners.core import clean_non_ascii_chars
from unstructured.partition.html import partition_html

def extract_text_from_html(file_paths: List[Path], base_url: str) -> List[Document]:
    """
    Function to reformat html_docs from html to plain text
    Input: file_paths List[Path]
    Output: List[Document]
    """
    extracted_docs = []
    print(f"Converting {len(file_paths)} files to text docs ...")
    for _path in file_paths:
        html_content = _path.read_text(encoding="utf-8")
        elements = partition_html(
            text=html_content,
            html_assemble_articles=True,
            skip_headers_and_footers=True,
            chunking_strategy="by_title",
        )
        extracted_text = "".join([e.text for e in elements])
        extracted_text = clean_non_ascii_chars(extracted_text)
        # Create metadata for document
        metadata = dict()
        metadata["language"] = "en"
        # extract title from html
        soup = BeautifulSoup(html_content, "html.parser")
        metadata["title"] = soup.find("title").text
        # set source to page url
        file_name = str(_path).split("/")[-1]
        metadata["source"] = f"{base_url}{file_name}"
        # extract links if available and append to metadata
        extracted_links = []
        for element in elements:
            if element.metadata.links is not None:
                print(element.metadata.links)
                link = element.metadata.links[0]["url"][1:]
                extracted_links.append(link)
        # Add extracted links to metadata as references
        if len(extracted_links) > 0:
            metadata["references"] = extracted_links
        doc = Document(page_content=extracted_text, metadata=metadata)
        extracted_docs.append(doc)
    return extracted_docs

In [12]:
base_url = "/".join(start_url.split("/")[:-1]) + "/"
html_docs = extract_text_from_html(file_paths=file_urls, base_url=base_url)

### Chunk docs and ingest to VectorStore (Qdrant)

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Remember TextSplitter chunk_size is != model max length
splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=0)
doc_chunks = splitter.split_documents(documents=html_docs)
print(len(doc_chunks))

### Instantiate BedrockEmbedding with `Cohere.embed-v3` model

In [19]:
from langchain.embeddings.bedrock import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

region = "us-west-2"
b_client = boto3.client("bedrock-runtime", region_name=region)
model_kwargs = get_inference_parameters(
    "anthropic"
)  # We need pass in model_kwargs for a model
# llm_model_id = "anthropic.claude-v2"
llm_model_id = "anthropic.claude-instant-v1"
embed_model_id = "cohere.embed-english-v3"

llm = Bedrock(
    client=b_client,
    model_kwargs=model_kwargs,
    model_id=llm_model_id,
    region_name=region,
)
embeddings = BedrockEmbeddings(
    client=b_client, model_id=embed_model_id, region_name=region
)

### Add docs to vectorstore (Qdrant)

**NOTE:** Install and run `qdrant` vector store locally using docker. 

Refer here for Installation: <https://qdrant.tech/documentation/quick-start/>


_NOTE:_ Ensure Qdrant is running at port `6333` on localhost. 

You should be able to access dashboard <http://localhost:6333/dashboard>

In [25]:
from langchain.vectorstores.qdrant import Qdrant
from qdrant_client import QdrantClient

collection_name = "aws_docs_coherev3"  # define collection name
qclient = QdrantClient(location="localhost", port=6333)

# Instantiating LangChain Qdrant db object
db = Qdrant(
    client=qclient,
    collection_name=collection_name,
    distance_strategy="cosine",
    embeddings=embeddings,
)

# Check if collection exists if not create from documents
try:
    collection_status = qclient.get_collection(collection_name=collection_name).status
    if collection_status == "green":
        print(f"Connected to collection: [b magenta]{collection_name}[/b magenta] ✅")
except Exception as e:
    # response = json.loads(e.content.decode())
    if e.reason_phrase == "Not Found":
        print(
            f"Creating collection: {collection_name} with [b]{len(doc_chunks)}[/b] doc chunks ..."
        )
        # Add documents to vector db with force_recreate = True for testing
        db = db.from_documents(
            documents=doc_chunks,
            embedding=embeddings,
            collection_name=collection_name,
            force_recreate=True,  # Set this to false in PROD
        )
        print(
            f"Added [b]{len(doc_chunks)}[/b] to collection: [b green]{collection_name}[/b green] ✅"
        )

### Testing Qdrant retriever

For a given query, retrieve **top 5** documents and test if the document chunks returned are relevant to the query.

We set our `search_type` to `similarity`, we can also try with `mmr`

In [28]:
# define retriever args
retriever_kwargs = {"search_type": "similarity", "top_k": 5}
retriever = db.as_retriever(**retriever_kwargs)
# query = "What are some of General guidelines for Amazon Bedrock LLM users"
query = "Which regions are Agents for Amazon Bedrock available in"
retriever.get_relevant_documents(query)


[1m[[0m
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m"AWS[0m[32m\n\nDocumentation\n\nAmazon Bedrock\n\nUser Guide\n\nSupported regions and models\n\nAgents for Amazon Bedrock is supported in the following regions.\n\nUS East [0m[32m([0m[32mN. Virginia[0m[32m)[0m[32m\n\nUS West [0m[32m([0m[32mOregon[0m[32m)[0m[32m\n\nYou can use Agents for Amazon Bedrock with the following models.\n\nAnthropic Claude Instant v1\n\nAnthropic Claude v2.0\n\nJavascript is disabled or is unavailable in your browser.\n\nTo use the Amazon Web Services Documentation, Javascript must be enabled. Please refer to your browser's Help pages for instructions.Document Conventions\n\nAgents for Amazon Bedrock\n\nSet up action groups for your agent\n\nDid this page help you? - Yes\n\nThanks for letting us know we're doing a good job!\n\nIf you've got a moment, please tell us what we did right so we can do more of it.\n\nDid this page help you? - No\n\nThanks for letting us kn