In [2]:
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
import time
import pickle
import os

BASE_URL = "https://medlineplus.gov"
INDEX_URL_TEMPLATE = "https://medlineplus.gov/ency/encyclopedia_{letter}.htm"
LETTERS = [chr(c) for c in range(ord('A'), ord('Z')+1)] + ['0-9']

def scrape_article_links():
    all_links = []
    for letter in LETTERS:
        url = INDEX_URL_TEMPLATE.format(letter=letter)
        print(f"Scraping index page: {url}")
        resp = requests.get(url)
        if resp.status_code != 200:
            print(f"Failed to load {url}")
            continue
        soup = BeautifulSoup(resp.text, "html.parser")
        anchors = soup.select("div#content a[href^='/ency/article/']")
        links = [BASE_URL + a['href'] for a in anchors]
        print(f"Found {len(links)} article links for {letter}")
        all_links.extend(links)
        time.sleep(1)  # polite delay
    unique_links = list(set(all_links))  # remove duplicates
    print(f"Total unique article links scraped: {len(unique_links)}")
    return unique_links

def load_articles(urls, delay=1.0):
    docs = []
    for i, url in enumerate(urls):
        print(f"Loading article {i+1}/{len(urls)}: {url}")
        try:
            loader = WebBaseLoader(url)
            article_docs = loader.load()
            docs.extend(article_docs)
        except Exception as e:
            print(f"Failed to load {url}: {e}")
        time.sleep(delay)  # polite delay between requests
    print(f"Total documents loaded: {len(docs)}")
    return docs

In [4]:
article_links = scrape_article_links()

# Step 2: Load a few articles (e.g., first 5 for now)
docs = load_articles(article_links[:5])

# Preview first few docs content (for example first 3 docs)
for i, doc in enumerate(docs[:3]):
    print(f"--- Document {i+1} ---")
    print(doc.page_content[:1000])  # print first 1000 characters for preview
    print("\n" + "="*40 + "\n")


Scraping index page: https://medlineplus.gov/ency/encyclopedia_A.htm
Found 0 article links for A
Scraping index page: https://medlineplus.gov/ency/encyclopedia_B.htm
Found 0 article links for B
Scraping index page: https://medlineplus.gov/ency/encyclopedia_C.htm
Found 0 article links for C
Scraping index page: https://medlineplus.gov/ency/encyclopedia_D.htm
Found 0 article links for D
Scraping index page: https://medlineplus.gov/ency/encyclopedia_E.htm
Found 0 article links for E
Scraping index page: https://medlineplus.gov/ency/encyclopedia_F.htm
Found 0 article links for F
Scraping index page: https://medlineplus.gov/ency/encyclopedia_G.htm
Found 0 article links for G
Scraping index page: https://medlineplus.gov/ency/encyclopedia_H.htm
Found 0 article links for H
Scraping index page: https://medlineplus.gov/ency/encyclopedia_I.htm
Found 0 article links for I
Scraping index page: https://medlineplus.gov/ency/encyclopedia_J.htm
Found 0 article links for J
Scraping index page: https://m