# In [1]:
import os
from bs4 import BeautifulSoup
from pathlib import Path

# Input and output locations, both relative to the current working directory
DATA_DIR = Path("data")
EXTRACTED_DIR = DATA_DIR.joinpath("extracted")
# Create the output folder (and any missing parents) up front; a no-op if it is already there
EXTRACTED_DIR.mkdir(exist_ok=True, parents=True)

# Helper Function
def extract_text_from_html(file_path):
    """
    Extracts the text content of an HTML file using BeautifulSoup.

    <script> and <style> elements are removed before the text is collected.
    The remaining text fragments are stripped of surrounding whitespace and
    joined with single spaces.

    Args:
        file_path: Path (str or path-like) of the HTML file to read.

    Returns:
        The extracted text as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    try:
        # Preferred path: decode strictly as UTF-8.
        with open(file_path, "r", encoding="utf-8") as f:
            markup = f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: hand BeautifulSoup the raw bytes so it can detect
        # the encoding itself instead of aborting the whole run.
        with open(file_path, "rb") as f:
            markup = f.read()

    # Parse after the file is closed; the handle is not needed any longer.
    soup = BeautifulSoup(markup, "html.parser")

    # Drop script and style elements so their contents do not end up in the text
    for tag in soup(["script", "style"]):
        tag.decompose()

    # strip=True trims every fragment and skips the ones that are empty
    return soup.get_text(separator=" ", strip=True)

def run_extractor():
    """
    Extracts the text of every *.html file located directly in DATA_DIR and
    saves it as a .txt file with the same base name in EXTRACTED_DIR.

    Every file matching "*.html" is processed (the pattern is not restricted
    to a particular naming scheme), in sorted path order. Each file is passed
    to extract_text_from_html() and the returned text is written out as UTF-8.
    Progress messages and a final count are printed to stdout.

    Returns:
        None.

    Raises:
        OSError: If the output directory cannot be created or a file cannot
            be read or written.
    """
    # Create the directory here too, so the message below holds even if the
    # folder was removed after this module was imported.
    EXTRACTED_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Ensured output directory '{EXTRACTED_DIR.resolve()}' exists.")
    print("\nStarting extraction process...")

    html_files_processed = 0
    # glob() yields entries in arbitrary order; sort for reproducible runs/logs
    for html_file_path in sorted(DATA_DIR.glob("*.html")):
        # with_suffix swaps only the final extension; str.replace would also
        # rewrite any ".html" occurring earlier in the file name.
        output_txt_path = EXTRACTED_DIR / html_file_path.with_suffix(".txt").name

        print(f" Extracting from {html_file_path.name}...")
        clean_text = extract_text_from_html(html_file_path)

        output_txt_path.write_text(clean_text, encoding="utf-8")
        print(f"Saved extracted text to {output_txt_path}")
        html_files_processed += 1

    print(f"\nFinished extraction process. Total HTML files processed: {html_files_processed}")