In [44]:
import pprint
import fitz
import re
from bs4 import BeautifulSoup

In [45]:
def _classify_line(line):
    """Return the structural role of one text line: 'h1', 'h2', 'li' or 'p'."""
    if re.match(r'^\d+\.\s', line):  # Numbered headers, e.g. "2. BACKGROUND"
        return "h1"
    if re.match(r'^\d+\.\d+\.\s', line):  # Numbered subheaders, e.g. "2.1. Scope"
        return "h2"
    # Bullets are tested before the letter-case heuristics so that an
    # ALL-CAPS or Title Case list item is not mistaken for a heading.
    if line.strip().startswith(("-", "*", "•")):
        return "li"
    if line.isupper():  # Assuming headers are in uppercase
        return "h1"
    if line.istitle():  # Assuming subheaders are in title case
        return "h2"
    return "p"


def _group_sections(lines):
    """Fold text lines into a list of header/subheader/body records."""
    sections = []
    header = ""
    subheader = ""
    body = []

    def flush():
        # Emit the pending body (if any) under the heading pair that was
        # active while it was collected, then start a fresh body.
        if body:
            sections.append({
                'header': header,
                'subheader': subheader,
                'body': "\n".join(body)
            })
            body.clear()

    for line in lines:
        # Whitespace-only lines (including the ones created by joining
        # pages) carry no content and must not become body paragraphs.
        if not line.strip():
            continue
        kind = _classify_line(line)
        if kind == "h1":
            flush()
            header = line
            # A new header always starts without a subheader, even when the
            # previous subheader never received any body text.
            subheader = ""
        elif kind == "h2":
            flush()
            subheader = line
        else:
            body.append(line)

    # Store the last section
    flush()
    return sections


def extract_header_subheader_body(pdf_path):
    """Split the text of a PDF into header / subheader / body sections.

    The plain text of every page is read with PyMuPDF and processed line by
    line. Each non-blank line is classified with simple heuristics:

    * ``"<n>. "`` prefix or an all-uppercase line -> header
    * ``"<n>.<m>. "`` prefix or a title-case line -> subheader
    * a line starting with ``-``, ``*`` or ``•``  -> body (list item)
    * anything else                               -> body (paragraph)

    Body lines are collected under the most recent header and subheader.
    A record is emitted whenever a new header or subheader begins (and once
    more at the end of the document), provided some body text was collected.
    Text that precedes the first header is emitted with an empty ``header``.
    Headings that are never followed by body text produce no record.

    Parameters
    ----------
    pdf_path
        Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns
    -------
    list of dict
        One dict per section with the string keys ``'header'``,
        ``'subheader'`` and ``'body'``. Body lines are joined with newlines;
        a missing header or subheader is the empty string.
    """
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text("text") for page in document]

    # Join all pages' text into a single string, then work line by line.
    full_text = "\n".join(page_texts)
    return _group_sections(full_text.split("\n"))

In [46]:
# Example usage: run the extractor on one sample statement-of-work PDF.
# The path is relative, so it is resolved against the notebook's working directory.
pdf_path = "ExampleRFPs/GoodFit/Attachment A - Statement of Work_July 2023.pdf"
# Each element of `content` is a dict with 'header', 'subheader' and 'body' keys.
content = extract_header_subheader_body(pdf_path)
# pprint sorts dict keys by default, which is why 'body' is printed first.
pprint.pprint(content)

[{'body': 'Statement of Work ',
  'header': '',
  'subheader': 'Project Management Support '},
 {'body': '1.', 'header': 'PR 777449 ', 'subheader': 'July 2023 '},
 {'body': 'Battelle Memorial Institute, Pacific Northwest Division, operator '
          'of the Pacific Northwest National \n'
          'Laboratory (PNNL) for the U.S. Department of Energy is working '
          'through a digital transformation \n'
          'through critical strategic objectives against our operating model. ',
  'header': 'OBJECTIVE/PURPOSE',
  'subheader': ''},
 {'body': 'PNNL has a portfolio of strategic initiatives to modernize various '
          'business systems, and operating \n'
          'model improvements initiatives for Assets & Facilities and '
          'Operations. In support of these \n'
          'needs, PNNL needs an agile project manager to successfully lead '
          'these efforts through planning \n'
          'and execution against strategic roadmaps. ',
  'header': '2. BACKGROUND