### 2_scraper.ipynb  
- Grabs the visible text content from a test website using Python.  
- Strips HTML and returns plain text that represents the body of a webpage.  
- This simulates what the Chrome extension would send.

In [7]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re, os
import time

Launches a real Chrome browser (via `undetected_chromedriver`) to get past anti-bot systems like Cloudflare.

Navigates to the URL, waits 10 seconds so any anti-bot check can finish, captures the full rendered HTML of the page, and closes the browser.

In [8]:
def fetch_html_with_selenium(url, wait_seconds=10, version_main=137):
    """Load ``url`` in a real Chrome window and return the rendered HTML.

    Chrome is started through ``undetected_chromedriver`` with the
    ``AutomationControlled`` blink feature disabled. After navigating, the
    function sleeps for a fixed time before reading the page source, so that
    an interstitial check (e.g. Cloudflare) has time to complete.

    Args:
        url: Address of the page to load.
        wait_seconds: Seconds to sleep after navigation before the HTML is
            captured. Defaults to 10.
        version_main: Major version of the locally installed Chrome; it must
            match your local browser. Defaults to 137.

    Returns:
        The value of ``driver.page_source`` after the wait, as a string.

    Raises:
        Whatever the driver raises on start-up or navigation; the browser is
        still shut down in that case.
    """
    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Uncomment to run without a visible browser window:
    # options.add_argument("--headless=new")

    # version_main must match the major version of your local Chrome.
    driver = uc.Chrome(version_main=version_main, options=options)
    try:
        driver.get(url)

        # Give Cloudflare time to verify you before reading the DOM.
        time.sleep(wait_seconds)

        return driver.page_source
    finally:
        # Always close the browser, even if navigation or the wait failed,
        # so no orphaned Chrome process is left behind.
        driver.quit()

Parses the HTML using BeautifulSoup and removes unwanted tags

Extracts and returns the readable, human-facing text from the page, with every run of whitespace collapsed into a single space.


In [9]:
def extract_visible_text(html):
    """Return the human-readable text contained in an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser``. Elements that
    carry no readable body text (scripts, styles, page header/footer, inline
    graphics) are removed from the tree, then the remaining text is joined
    with spaces and every run of whitespace is collapsed to a single space.

    Args:
        html: HTML markup to parse.

    Returns:
        The remaining text as one whitespace-normalised, stripped string.
    """
    unwanted_tags = ["script", "style", "noscript", "header", "footer", "svg", "img"]

    document = BeautifulSoup(html, "html.parser")
    # Drop each unwanted element together with everything nested inside it.
    for node in document.find_all(unwanted_tags):
        node.decompose()

    raw_text = document.get_text(separator=" ")
    single_spaced = re.sub(r"\s+", " ", raw_text)
    return single_spaced.strip()

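Trims leading/trailing whitespace from the text. (`extract_visible_text` already strips its result, so within this pipeline the step is effectively a no-op.)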

In [10]:
def clean_text(text):
    """Return ``text`` with leading and trailing whitespace removed.

    Args:
        text: The string to tidy up.

    Returns:
        The result of ``text.strip()``; interior whitespace is untouched.
    """
    trimmed = text.strip()
    return trimmed

Combines all three steps: fetches the HTML, extracts visible content, and cleans it

Returns the final plain text ready for semantic search, embedding, or display

In [11]:
def scrape_page_text(url):
    """Fetch a page and return its cleaned, visible text.

    Runs the full pipeline: the page is loaded with
    ``fetch_html_with_selenium``, reduced to readable text by
    ``extract_visible_text``, and finally passed through ``clean_text``.

    Args:
        url: Address of the page to scrape.

    Returns:
        The plain text produced by the last pipeline step.
    """
    return clean_text(extract_visible_text(fetch_html_with_selenium(url)))


In [12]:
# Page to scrape for this test run. Executing this cell opens a Chrome window
# and blocks while the page is fetched.
url = "https://www.science.org/doi/10.1126/scirobotics.adt1591"
page_text = scrape_page_text(url)

In [13]:
# Save the scraped text to data/content.txt, creating the data/ directory
# first if it does not exist. Opening in "w" mode overwrites any previous file.
os.makedirs("data", exist_ok=True)
with open("data/content.txt", "w", encoding="utf-8") as f:
    f.write(page_text)