In [2]:
pip install requests

Defaulting to user installation because normal site-packages is not writeable
Collecting requests
  Using cached requests-2.32.4-py3-none-any.whl (64 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.10-py3-none-any.whl (70 kB)
Collecting urllib3<3,>=1.21.1
  Using cached urllib3-2.5.0-py3-none-any.whl (129 kB)
Collecting certifi>=2017.4.17
  Using cached certifi-2025.6.15-py3-none-any.whl (157 kB)
Collecting charset_normalizer<4,>=2
  Using cached charset_normalizer-3.4.2-cp39-cp39-macosx_10_9_universal2.whl (201 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2025.6.15 charset-normalizer-3.4.2 idna-3.10 requests-2.32.4 urllib3-2.5.0
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import requests

# URLs of CDC and WHO PDFs (including additional CDC health reports to reach 15 files)
urls = [
    # CDC core documents
    "https://www.cdc.gov/training-publichealth101/media/pdfs/introduction-to-public-health.pdf",
    "https://stacks.cdc.gov/view/cdc/81408/cdc_81408_DS1.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus20-21.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus18.pdf",
    "https://higherlogicdownload.s3.amazonaws.com/APIC/2f36f21f-3978-4294-a698-bfce7ae9143d/UploadedImages/CDC's%20guideline%20for%20Environmental%20Infection%20Control.pdf",
    # WHO core document (corrected URL)
    "https://cdn.who.int/media/docs/default-source/gho-documents/world-health-statistic-reports/2023/world-health-statistics-2023_20230519_.pdf",
    # Additional CDC Health, United States reports (selected recent years and supplements)
    "https://www.cdc.gov/nchs/data/hus/hus19.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus17.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus16.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus15.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus14.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus13.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus12.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus11.pdf",
    "https://www.cdc.gov/nchs/data/hus/hus10.pdf"
]

# Directory to save downloaded files
save_dir = "downloaded_health_pdfs"
os.makedirs(save_dir, exist_ok=True)

def download_file(url, save_path):
    try:
        print(f"Downloading: {url}")
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print(f"Saved to: {save_path}\n")
    except Exception as e:
        print(f"Failed to download {url}: {e}\n")

for url in urls:
    filename = url.split('/')[-1]
    # Some URLs may have query parameters, strip them
    filename = filename.split('?')[0]
    # Ensure filename ends with .pdf
    if not filename.lower().endswith('.pdf'):
        filename += '.pdf'
    save_path = os.path.join(save_dir, filename)
    download_file(url, save_path)




Downloading: https://www.cdc.gov/training-publichealth101/media/pdfs/introduction-to-public-health.pdf
Saved to: downloaded_health_pdfs/introduction-to-public-health.pdf

Downloading: https://stacks.cdc.gov/view/cdc/81408/cdc_81408_DS1.pdf
Saved to: downloaded_health_pdfs/cdc_81408_DS1.pdf

Downloading: https://www.cdc.gov/nchs/data/hus/hus20-21.pdf
Saved to: downloaded_health_pdfs/hus20-21.pdf

Downloading: https://www.cdc.gov/nchs/data/hus/hus18.pdf
Saved to: downloaded_health_pdfs/hus18.pdf

Downloading: https://higherlogicdownload.s3.amazonaws.com/APIC/2f36f21f-3978-4294-a698-bfce7ae9143d/UploadedImages/CDC's%20guideline%20for%20Environmental%20Infection%20Control.pdf
Saved to: downloaded_health_pdfs/CDC's%20guideline%20for%20Environmental%20Infection%20Control.pdf

Downloading: https://cdn.who.int/media/docs/default-source/gho-documents/world-health-statistic-reports/2023/world-health-statistics-2023_20230519_.pdf
Saved to: downloaded_health_pdfs/world-health-statistics-2023_20230

In [14]:
pip install pymupdf


Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Downloading pymupdf-1.26.1-cp39-abi3-macosx_11_0_arm64.whl (22.4 MB)
[K     |████████████████████████████████| 22.4 MB 18.0 MB/s eta 0:00:01
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
import os
import fitz  # PyMuPDF module

PDF_DIR = "downloaded_health_pdfs"
OUTPUT_DIR = "chunked_docs"

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_text(pdf_path):
    """Extract text from a PDF using PyMuPDF."""
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
    return text

def chunk_text(text, chunk_size=300):
    """Splits text into chunks of roughly chunk_size words."""
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield " ".join(words[i:i + chunk_size])

# Main execution
try:
    # Check if PDF directory exists
    if not os.path.exists(PDF_DIR):
        raise FileNotFoundError(f"PDF directory '{PDF_DIR}' does not exist")
    
    # Process each PDF file
    for pdf_file in os.listdir(PDF_DIR):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(PDF_DIR, pdf_file)
            print(f"Processing: {pdf_file}")
            
            text = extract_text(pdf_path)
            
            # Only process if text extraction succeeded
            if text:
                for idx, chunk in enumerate(chunk_text(text)):
                    # Fixed filename generation
                    base_name = os.path.splitext(pdf_file)[0]
                    chunk_filename = f"{base_name}_chunk_{idx}.txt"
                    chunk_file = os.path.join(OUTPUT_DIR, chunk_filename)
                    
                    with open(chunk_file, "w", encoding="utf-8") as f:
                        f.write(chunk)
    
    print(f"Done! Created chunks in '{OUTPUT_DIR}/'.")

except Exception as e:
    print(f"Critical error: {e}")


Processing: introduction-to-public-health.pdf
Processing: hus14.pdf
Processing: hus15.pdf
Processing: hus17.pdf
Processing: hus16.pdf
Processing: hus12.pdf
Processing: hus13.pdf
Processing: hus11.pdf
Processing: hus10.pdf
Processing: CDC's%20guideline%20for%20Environmental%20Infection%20Control.pdf
Processing: hus18.pdf
Processing: hus20-21.pdf
Processing: cdc_81408_DS1.pdf
Processing: world-health-statistics-2023_20230519_.pdf
Done! Created chunks in 'chunked_docs/'.


In [16]:
pip install langchain langchain_community chromadb sentence-transformers pydantic

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 3.5 MB/s eta 0:00:01
[?25hCollecting langchain_community
  Downloading langchain_community-0.3.26-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 13.4 MB/s eta 0:00:01
[?25hCollecting chromadb
  Downloading chromadb-1.0.13-cp39-abi3-macosx_11_0_arm64.whl (17.9 MB)
[K     |████████████████████████████████| 17.9 MB 1.8 MB/s eta 0:00:01
[?25hCollecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Collecting pydantic
  Using cached pydantic-2.11.7-py3-none-any.whl (444 kB)
Collecting PyYAML>=5.3
  Using cached PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl (172 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8
  Using cached langchain_text_splitters-0.3.8-py3-none-any.whl (32 kB)
Collecting langsmith>=0.1.17


In [17]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import os

# Paths
CHUNK_DIR = "chunked_docs"

# Hugging Face Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Read chunks
texts = []
metadatas = []
for chunk_file in os.listdir(CHUNK_DIR):
    if chunk_file.endswith(".txt"):
        with open(os.path.join(CHUNK_DIR, chunk_file), "r") as f:
            texts.append(f.read())
            metadatas.append({"source": chunk_file})

# Create Chroma vectorstore
vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embedding_model,
    metadatas=metadatas,
    persist_directory="./chroma_db"
)

# Done: Embeddings + Index Created


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
  from .autonotebook import tqdm as notebook_tqdm
