# In [15]:
import fitz  # PyMuPDF
import os
import re

# Path to the PDF file (assuming it is in the current folder)
pdf_path = "Profitability_2.pdf"

# Directory to save extracted text files (in the current folder)
output_dir = "Extracted_Cases"
os.makedirs(output_dir, exist_ok=True)

# Phrase whose presence on a page marks it as part of a case transcript
CASE_MARKER = "Interview Transcript"

# A block lying entirely above HEADER_FRACTION or entirely below
# FOOTER_FRACTION of the page height is treated as a header / footer.
HEADER_FRACTION = 0.08
FOOTER_FRACTION = 0.97

# Compiled once instead of being looked up for every text block
PAGE_NUMBER_RE = re.compile(r"^\s*Page \d+")  # Page numbers
FOOTNOTE_RE = re.compile(r"^\s*\(\w\)")  # Footnote indicators


def _extract_body_text(page):
    """Return the text of *page* with header, footer and boilerplate blocks removed.

    Each item returned by ``page.get_text("blocks")`` is unpacked as
    ``(x0, y0, x1, y1, text, ...)``; only the vertical extent and the text
    are used. The surviving blocks are stripped and joined with newlines.
    """
    page_height = page.rect.height
    header_limit = HEADER_FRACTION * page_height
    footer_limit = FOOTER_FRACTION * page_height

    kept_blocks = []
    for block in page.get_text("blocks"):
        _, y0, _, y1, text, *_ = block

        # Skip blocks that sit wholly inside the header or footer band
        if y1 < header_limit or y0 > footer_limit:
            continue

        # Skip page-number lines and footnote indicators
        if PAGE_NUMBER_RE.match(text) or FOOTNOTE_RE.match(text):
            continue

        kept_blocks.append(text.strip())

    return "\n".join(kept_blocks)


def _write_case(text, number, directory):
    """Write *text* to ``Case_<number>.txt`` inside *directory* and report it."""
    output_file_path = os.path.join(directory, f"Case_{number}.txt")
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(text)
    print(f"Saved Case {number} to {output_file_path}")


# Extract and group pages containing "Interview Transcript"
case_number = 1
case_text = ""

# The context manager closes the document even if extraction fails midway.
with fitz.open(pdf_path) as document:
    for page_number in range(document.page_count):
        page = document.load_page(page_number)

        # The marker is tested on the page's raw text, and the same test both
        # extends the current case and ends it. Previously the "extend" test
        # used the filtered text while the "end" test used the raw text of the
        # next page, so a marker located in a filtered block made the two
        # disagree. Testing the current page only also avoids loading every
        # page a second time as a look-ahead.
        if CASE_MARKER in page.get_text():
            if case_text:
                case_text += "\n\n"
            case_text += _extract_body_text(page)
            print(case_text)
            continue

        # A page without the marker terminates the case collected so far
        if case_text:
            _write_case(case_text, case_number, output_dir)
            case_number += 1
            case_text = ""

    # Save a case that runs up to the last page of the document
    if case_text:
        _write_case(case_text, case_number, output_dir)
        case_number += 1
        case_text = ""
