In [1]:
import os
import fitz  # PyMuPDF
import pdfplumber
import re

class Document:
    """Plain holder that pairs a piece of text with its metadata."""

    def __init__(self, page_content, metadata):
        """Store both arguments on the instance exactly as they were passed."""
        self.page_content, self.metadata = page_content, metadata

# Step 1: Extract content and metadata from the main SOP document
def extract_chunks_with_metadata(pdf_path, toc_path):
    """Split the SOP PDF into chunks delimited by the headings found in the TOC PDF.

    The TOC is read first to learn which lines are chapter headings and which
    are subtopic headings. The main document is then scanned line by line;
    every time a known heading is met, the text gathered so far is closed off
    as one chunk and tagged with the chapter/subtopic that was active while it
    was being gathered. Heading lines themselves are not part of any chunk.

    Headings are recognised by exact string equality (after stripping
    surrounding whitespace) between a document line and a TOC line, so a TOC
    entry that carries extra text on the same line will never match.

    Args:
        pdf_path: Path to the main SOP PDF (opened with PyMuPDF).
        toc_path: Path to the table-of-contents PDF (opened with pdfplumber).

    Returns:
        A list of dicts, in document order, each shaped as
        ``{"text": str, "metadata": {"chapter": str | None,
        "subtopic": str | None, "pages": list[int]}}``. ``pages`` holds the
        1-based page numbers that contributed lines to the chunk.
    """
    chapters = _parse_toc_headings(_read_toc_lines(toc_path))

    chunks = []
    current_chapter = None
    current_subtopic = None
    current_chunk = []
    current_pages = []

    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for raw_line in page.get_text("text").splitlines():
                line = raw_line.strip()

                is_chapter = line in chapters
                is_subtopic = (
                    not is_chapter
                    and bool(current_chapter)
                    and line in chapters[current_chapter]
                )

                if not (is_chapter or is_subtopic):
                    # Ordinary content line: accumulate it and remember the page.
                    current_chunk.append(line)
                    # Pages are visited in increasing order, so comparing with
                    # the last recorded page is enough to avoid duplicates.
                    if not current_pages or current_pages[-1] != page_number:
                        current_pages.append(page_number)
                    continue

                # A heading closes the chunk collected under the previous
                # chapter/subtopic before the heading state is updated.
                if current_chunk:
                    chunks.append(
                        _make_chunk(
                            current_chunk,
                            current_chapter,
                            current_subtopic,
                            current_pages,
                        )
                    )
                    current_chunk = []
                    current_pages = []

                if is_chapter:
                    current_chapter = line
                    current_subtopic = None  # a new chapter starts without a subtopic
                else:
                    current_subtopic = line

    # Whatever is left after the last heading forms the final chunk.
    if current_chunk:
        chunks.append(
            _make_chunk(current_chunk, current_chapter, current_subtopic, current_pages)
        )

    return chunks


def _read_toc_lines(toc_path):
    """Return every text line of the TOC PDF, page after page."""
    lines = []
    with pdfplumber.open(toc_path) as toc_pdf:
        for page in toc_pdf.pages:
            # Guard against pages without extractable text: a None result
            # would otherwise crash on .splitlines().
            lines.extend((page.extract_text() or "").splitlines())
    return lines


def _parse_toc_headings(toc_lines):
    """Map each chapter heading in the TOC to the list of its subtopic headings.

    A chapter is a line made only of uppercase letters and whitespace; a
    subtopic is a line starting with a lowercase letter followed by ``)``.
    Subtopics seen before the first chapter are ignored.
    """
    chapter_pattern = re.compile(r"^[A-Z][A-Z\s]+$")
    subtopic_pattern = re.compile(r"^[a-z]\)")

    chapters = {}
    current_chapter = None
    for raw_line in toc_lines:
        line = raw_line.strip()
        if chapter_pattern.match(line):
            current_chapter = line
            # setdefault keeps subtopics already collected if the same chapter
            # heading shows up again in the TOC.
            chapters.setdefault(current_chapter, [])
        elif subtopic_pattern.match(line) and current_chapter:
            chapters[current_chapter].append(line)
    return chapters


def _make_chunk(lines, chapter, subtopic, pages):
    """Build one chunk record from the accumulated lines and heading state."""
    return {
        "text": "\n".join(lines),
        "metadata": {
            "chapter": chapter,
            "subtopic": subtopic,
            "pages": pages,
        },
    }

# Step 2: Index and print chunks
def index_and_print_chunks(chunks):
    """Number the chunks, echo each one to stdout, and return them keyed by position.

    Args:
        chunks: Sequence of dicts with a ``"text"`` entry and a ``"metadata"``
            entry; the metadata must provide ``"chapter"``, ``"subtopic"``
            and ``"pages"``.

    Returns:
        A dict mapping each chunk's 0-based position to
        ``{"text": ..., "metadata": ...}`` (the same objects, not copies).
    """
    separator = "-" * 80
    catalogue = {}
    for position, entry in enumerate(chunks):
        body = entry["text"]
        meta = entry["metadata"]
        catalogue[position] = {"text": body, "metadata": meta}
        # One report per chunk, terminated by an 80-dash rule.
        print(
            f"Index: {position}\n"
            f"Chapter: {meta['chapter']}\n"
            f"Subtopic: {meta['subtopic']}\n"
            f"Pages: {meta['pages']}\n"
            f"Content: {body}\n"
            f"{separator}"
        )
    return catalogue

# Example Usage
if __name__ == "__main__":
    # Input files, given relative to the current working directory:
    # the SOP document to be chunked and a separate PDF holding its TOC.
    pdf_file = "SOP SAMPLE.pdf"  # Update path to the main SOP PDF
    toc_file = "SOP TOC.pdf"  # Update path to the table-of-contents PDF

    # Split the SOP into chunks tagged with chapter, subtopic and page numbers
    chunks = extract_chunks_with_metadata(pdf_file, toc_file)

    # Print every chunk and keep them in a dict keyed by their 0-based position
    indexed_chunks = index_and_print_chunks(chunks)

Index: 0
Chapter: None
Subtopic: None
Pages: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
Content: 19







Drilling operations need to be carried out in a safe, efficient, and
environmental friendly manner for successfully completing the well to meet the
desired geological objectives. By following recommended practices, a well can
be
drilled
in
shortest
poss
ible time and with minimum of complications.
This chapter includes the following topics on drilling operations:
a) Preparation for spudding the well
b) Pre-spud meeting/conference
c) Common drilling practices for all phases
d) Drilling 26" hole section
e) Drilling 17 ½" hole section
f) Drilling 12 ¼”/8 ½”/6” hole sections
g) Casing integrity test
h) Shoe integrity test
i) Leak off test
j) Extended LOT
k) Recommended practices for tripping out
l) Recommended practices for tripping in
m) Recommended practices for reaming
n) Recommended practices for back-reaming
o) Recommended practices for wiper 