In [1]:
import fitz
import re
import pprint

In [2]:
# Extract the text of every page of a PDF file and return it as one string.
def pdf_to_string(pdf_path):
    """Return the plain text of the PDF at ``pdf_path`` as a single string.

    Each page is read with ``page.get_text("text")`` and the page texts are
    concatenated in page order, followed by one trailing newline.

    Two clean-ups are applied to every page before concatenation:

    * a run of digits followed by a period at the end of a line (e.g. ``"3."``)
      is joined to the following text with a single space;
    * a bullet character ``●`` at the end of a line is joined to the
      following text with a single space.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        str: The cleaned text of all pages plus a final ``"\n"``.

    Raises:
        Whatever ``fitz.open`` or the page accessors raise; the document is
        closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for page_num in range(len(doc)):
            text = doc.load_page(page_num).get_text("text")
            # Unify lines if they end with something like "number.\n".
            # NOTE(review): this also joins a sentence ending in a number
            # ("... in 2023.\n") with the next line - confirm that is acceptable.
            text = re.sub(r'(\d+\.)\n+', r'\1 ', text)
            # Unify lines if they have "●\n".
            text = re.sub(r'(●)\n+', r'\1 ', text)
            page_texts.append(text)
    finally:
        # Release the file handle even when a page fails to load or parse.
        doc.close()

    return "".join(page_texts) + "\n"

# Extract the lines (or line prefixes) of a text that look like section headers.
def extract_header_patterns(text):
    """Return every header-like match found in ``text``, in document order.

    A match must start at the beginning of a line and have one of these forms:

    * ``"1. Title"``      - digits, a period, whitespace, then the rest of the line;
    * ``"Title:"``        - a capital letter followed by the shortest run of
      characters up to and including the first colon on that line (only this
      leading part is returned, not the rest of the line);
    * ``"1.2 Title"`` / ``"1.2.3. Title"`` - a dotted multi-level number,
      an optional trailing period, whitespace, then the rest of the line.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched header strings; empty if nothing matches.
    """
    # "1. Title" style headers.
    numbered_header_pattern = r'^\d+\.\s.*'
    # Lines starting with a capital letter and containing a colon; the lazy
    # quantifier captures only the part up to the first colon.
    colon_header_pattern = r'^[A-Z].*?:'
    # "1.2 Title" style headers. The dotted levels must be a repeated group:
    # a character class such as [\.\d+] would also accept plain numbers like
    # "2023 ..." or "12 apples" as headers.
    numbered_text_header_pattern = r'^\d+(?:\.\d+)+\.?\s.*'
    # Combine the patterns into one alternation (all groups non-capturing so
    # that findall returns the whole match as a string).
    header_pattern = (
        f'(?:{numbered_header_pattern}'
        f'|{colon_header_pattern}'
        f'|{numbered_text_header_pattern})'
    )
    # MULTILINE makes "^" match at the start of every line.
    matches = re.findall(header_pattern, text, re.MULTILINE)
    # Filter out empty matches.
    return [match for match in matches if match]

In [3]:
# Example usage
# Forward slashes are used as path separators: backslashes in a normal string
# literal form invalid escape sequences ("\G", "\A"), which newer Python
# versions warn about, and they only work as separators on Windows.
pdf_path = "ExampleRFPs/GoodFit/Attachment A - Statement of Work_July 2023.pdf"
# Alternative sample document:
# "ExampleRFPs/GoodFit/RFP+CR-346570+Attachment+A+SOW+SQA+Consulting.pdf"
doc_content = pdf_to_string(pdf_path)
print("doc string content:")
print(doc_content)

doc string content:
Statement of Work 
Project Management Support 
July 2023 
PR 777449 
1. OBJECTIVE/PURPOSE
Battelle Memorial Institute, Pacific Northwest Division, operator of the Pacific Northwest National 
Laboratory (PNNL) for the U.S. Department of Energy is working through a digital transformation 
through critical strategic objectives against our operating model. 
2. BACKGROUND
PNNL has a portfolio of strategic initiatives to modernize various business systems, and operating 
model improvements initiatives for Assets & Facilities and Operations. In support of these 
needs, PNNL needs an agile project manager to successfully lead these efforts through planning 
and execution against strategic roadmaps. 
3. WORK SCOPE
● Under direction of Digital Platform Managers and the PMO Director, lead initiatives
and manage over scope, schedule, and budget from planning through execution.
● Ensure the timely delivery of high-quality technical solutions that drive key business
strategic out

In [4]:
# Use the header extraction function to get the headers (in document order).
headers = extract_header_patterns(doc_content)


def _find_line_start(haystack, needle, start):
    """Return the index of the first occurrence of `needle` at or after
    `start` that begins a line of `haystack`, or -1 if there is none."""
    pos = haystack.find(needle, start)
    # Skip occurrences that sit in the middle of a line: headers are only
    # ever matched at the start of a line.
    while pos > 0 and haystack[pos - 1] != "\n":
        pos = haystack.find(needle, pos + 1)
    return pos


# Locate each header once, scanning forward from the end of the previous one.
# Splitting the whole document on the header text instead would cut at the
# first place that text appears anywhere, even in the middle of a sentence.
header_spans = []  # (header, index where it starts, index just past it)
cursor = 0
for header in headers:
    start = _find_line_start(doc_content, header, cursor)
    if start == -1:
        # Defensive: not expected when `headers` was extracted from `doc_content`.
        continue
    cursor = start + len(header)
    header_spans.append((header, start, cursor))

# Map each header to the text between it and the next header (or the end of
# the document for the last header).
header_dict = {}
for i, (header, _, body_start) in enumerate(header_spans):
    if i < len(header_spans) - 1:
        body_end = header_spans[i + 1][1]
    else:
        body_end = len(doc_content)
    # If the same header text occurs more than once, keep its first section.
    header_dict.setdefault(header, doc_content[body_start:body_end])

# Print the headers and the text.
print("headers and text:")
for header, text in header_dict.items():
    print(f"Header: {header}")
    print(f"Text: {text}")
    print("\n")

headers and text:
Header: 1. OBJECTIVE/PURPOSE
Text: 
Battelle Memorial Institute, Pacific Northwest Division, operator of the Pacific Northwest National 
Laboratory (PNNL) for the U.S. Department of Energy is working through a digital transformation 
through critical strategic objectives against our operating model. 



Header: 2. BACKGROUND
Text: 
PNNL has a portfolio of strategic initiatives to modernize various business systems, and operating 
model improvements initiatives for Assets & Facilities and Operations. In support of these 
needs, PNNL needs an agile project manager to successfully lead these efforts through planning 
and execution against strategic roadmaps. 



Header: 3. WORK SCOPE
Text: 
● Under direction of Digital Platform Managers and the PMO Director, lead initiatives
and manage over scope, schedule, and budget from planning through execution.
● Ensure the timely delivery of high-quality technical solutions that drive key business
strategic outcomes.
● Lead cross 