1. SCRAPE AWS DOCS

In [2]:
import requests 
from bs4 import BeautifulSoup
import re
import os

def scrape_aws_docs(url):
    """
    Scrapes an AWS documentation page and returns cleaned text.
    """
    print(f"Scraping: {url}")
    
    response = requests.get(url)
    if response.status_code != 200:
        print("Error fetching page:", response.status_code)
        return ""

    soup = BeautifulSoup(response.text, "lxml")

    possible_selectors = [
        "main","article","section","div.awsdocs-content",
        "div.awsui-util-container","div.awsui-content",
        "div.document-body","div#main-content","div.g-content"
    ]

    doc_text = ""
    for selector in possible_selectors:
        section = soup.select_one(selector)
        if section:
            doc_text = section.get_text(separator="\n")
            break
    
    if not doc_text:
        print("Warning: No content found with known selectors.")
        doc_text = soup.get_text(separator="\n")

    doc_text = re.sub(r"\n\s*\n", "\n\n", doc_text)
    return doc_text.strip()

# List of AWS doc URLs
aws_urls = [
    # Textract
    "https://docs.aws.amazon.com/textract/latest/dg/what-is.html",
    "https://docs.aws.amazon.com/textract/latest/dg/how-it-works.html",
    "https://docs.aws.amazon.com/textract/latest/dg/limits.html",
    "https://docs.aws.amazon.com/textract/latest/dg/API_Operations.html",
    "https://docs.aws.amazon.com/textract/latest/dg/document-understanding.html",

    # Bedrock
    "https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html",
    "https://docs.aws.amazon.com/bedrock/latest/userguide/security.html",
    "https://docs.aws.amazon.com/bedrock/latest/userguide/kb.html",
    "https://docs.aws.amazon.com/bedrock/latest/userguide/embedding-models.html",
    "https://docs.aws.amazon.com/bedrock/latest/userguide/model-inference.html",

    # S3
    "https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html",
    "https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-with-s3-actions.html",
    "https://docs.aws.amazon.com/AmazonS3/latest/userguide/security-best-practices.html",

    # Lambda
    "https://docs.aws.amazon.com/lambda/latest/dg/welcome.html",
    "https://docs.aws.amazon.com/lambda/latest/dg/getting-started.html",
    "https://docs.aws.amazon.com/lambda/latest/dg/lambda-invocation.html",

    # API Gateway
    "https://docs.aws.amazon.com/apigateway/latest/developerguide/welcome.html",
    "https://docs.aws.amazon.com/apigateway/latest/developerguide/how-to-call-api.html",

    # IAM
    "https://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html",
    "https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies.html",
    "https://docs.aws.amazon.com/IAM/latest/UserGuide/best-practices.html",
]

all_text = ""
for url in aws_urls:
    text = scrape_aws_docs(url)
    all_text += "\n\n==============================\n\n"
    all_text += text

# Save raw scraped text
with open("aws_docs_output.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

print("Scraping complete. Saved to aws_docs_output.txt")


Scraping: https://docs.aws.amazon.com/textract/latest/dg/what-is.html
Scraping: https://docs.aws.amazon.com/textract/latest/dg/how-it-works.html
Scraping: https://docs.aws.amazon.com/textract/latest/dg/limits.html
Scraping: https://docs.aws.amazon.com/textract/latest/dg/API_Operations.html
Scraping: https://docs.aws.amazon.com/textract/latest/dg/document-understanding.html
Scraping: https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html
Scraping: https://docs.aws.amazon.com/bedrock/latest/userguide/security.html
Scraping: https://docs.aws.amazon.com/bedrock/latest/userguide/kb.html
Scraping: https://docs.aws.amazon.com/bedrock/latest/userguide/embedding-models.html
Scraping: https://docs.aws.amazon.com/bedrock/latest/userguide/model-inference.html
Scraping: https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html
Scraping: https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-with-s3-actions.html
Scraping: https://docs.aws.amazon.com/AmazonS3/latest/u

2. CLEAN TEXT

In [3]:
input_file = "aws_docs_output.txt"
output_file = "aws_docs_cleaned.txt"

def clean_text(text):
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"(Previous Topic|Next Topic|Feedback|Navigation|©.*AWS.*)", "", text)
    return text.strip()

with open(input_file, "r", encoding="utf-8") as f:
    text = f.read()

cleaned = clean_text(text)

with open(output_file, "w", encoding="utf-8") as f:
    f.write(cleaned)

print("Cleaned text saved to aws_docs_cleaned.txt")



Cleaned text saved to aws_docs_cleaned.txt


3. CHUNKING (sentence-based)

In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

cleaned_file = "aws_docs_cleaned.txt"
CHUNK_DIR = "aws_chunks"
os.makedirs(CHUNK_DIR, exist_ok=True)

chunk_size = 5       # sentences per chunk
chunk_overlap = 2    # overlapping sentences

with open(cleaned_file, "r", encoding="utf-8") as f:
    text = f.read()

sentences = sent_tokenize(text)
chunks = []
i = 0
while i < len(sentences):
    chunk = " ".join(sentences[i:i+chunk_size])
    chunks.append(chunk)
    i += (chunk_size - chunk_overlap)

print(f"Total chunks created: {len(chunks)}")

# Save chunks
for i, chunk in enumerate(chunks):
    chunk_file = os.path.join(CHUNK_DIR, f"chunk_{i}.txt")
    with open(chunk_file, "w", encoding="utf-8") as f:
        f.write(chunk)

print(f"Chunks saved in folder: {CHUNK_DIR}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shadi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total chunks created: 523
Chunks saved in folder: aws_chunks


4. BUILD TF-IDF VECTOR SPACE

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

chunk_files = sorted([os.path.join(CHUNK_DIR, f) for f in os.listdir(CHUNK_DIR)])
texts = []
for f in chunk_files:
    with open(f, "r", encoding="utf-8") as fi:
        texts.append(fi.read())

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X = vectorizer.fit_transform(texts)  # shape = (num_chunks, vocab_size)



5. QUERY / RETRIEVAL FUNCTION

In [6]:
def retrieve(query, top_k=3):
    q_vec = vectorizer.transform([query])
    sims = cosine_similarity(q_vec, X)[0]
    top_indices = sims.argsort()[::-1][:top_k]
    results = [texts[i] for i in top_indices]
    return results

 6. LOCAL RAG CHATBOT

In [13]:
print("AWS Doc Chatbot (type 'exit' to quit)")

while True:
    query = input("\nYou: ")
    if query.lower() in ["exit", "quit"]:
        print("Goodbye!")
        break
    
    results = retrieve(query, top_k=3)
    
    print("\nBot: Here are some relevant document snippets:\n")
    for i, r in enumerate(results):
        print(f"--- Snippet {i+1} ---")
        print(r[:500], "...")  # first 500 chars for brevity


AWS Doc Chatbot (type 'exit' to quit)



You:  aws textract



Bot: Here are some relevant document snippets:

--- Snippet 1 ---
Other quotas, like file size and languages supported by Amazon Textract, cannot be changed. For more information on set quotas, see Set Quotas in Amazon Textract
. First-Time Amazon Textract Users If this is your first time using Amazon Textract, we recommend that you read the following sections in order: Identifying Your Amazon Textract Use Case â This section introduces the Amazon Textract components and how they work together for an end-to-end experience. Getting Started with Amazon Textrac ...
--- Snippet 2 ---
Getting Started with Amazon Textract â In this section, you set up your account and test the Amazon Textract API. Javascript is disabled or is unavailable in your browser. To use the Amazon Web Services Documentation, Javascript must be enabled. Please refer to your browser's Help pages for instructions. Document Conventions
Amazon Textract offers a variety of operations that apply to different documents.


You:  exit


Goodbye!
