In [2]:
pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20240706


In [3]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTTextLineHorizontal, LTChar
from typing import List, Dict, Any
from collections import defaultdict


class SectionContainer:
    """Pairs a block of extracted text with the metadata describing it.

    Attributes:
        content: The extracted text.
        metadata: Arbitrary key/value details about ``content``.
    """

    def __init__(self, content: str, metadata: Dict[str, Any]):
        self.metadata = metadata
        self.content = content

    def __repr__(self):
        # Show only the text length so the repr stays short for long sections.
        size = len(self.content)
        return f"SectionContainer(metadata={self.metadata}, content_length={size})"


def analyze_pdf_layout(pdf_path: str) -> List[List[Dict[str, Any]]]:
    """Collect text, font size and font name for every horizontal text line.

    Args:
        pdf_path: Path of the PDF handed to ``extract_pages``.

    Returns:
        One list per page, in page order. Each item is a dict with keys
        ``"text"`` (stripped line text), ``"font_size"`` (largest character
        size on the line, 0 if the line has no characters) and
        ``"font_style"`` (most frequent font name on the line, "" if none).
    """
    layout_data = []
    for page_layout in extract_pages(pdf_path):
        page_elements = []
        for element in page_layout:
            if not isinstance(element, LTTextBoxHorizontal):
                continue
            for line in element:
                if not isinstance(line, LTTextLineHorizontal):
                    continue
                chars = [obj for obj in line if isinstance(obj, LTChar)]
                font_size = max((char.size for char in chars), default=0)

                # Use the dominant font of the line. Taking max() of the names
                # would pick the alphabetically last one, which is unrelated
                # to how much of the line is set in it.
                font_counts: Dict[str, int] = {}
                for char in chars:
                    font_counts[char.fontname] = font_counts.get(char.fontname, 0) + 1
                # Ties resolve to the font seen first (dict insertion order).
                font_name = max(font_counts, key=font_counts.get) if font_counts else ""

                text = line.get_text().strip()
                page_elements.append({"text": text, "font_size": font_size, "font_style": font_name})
        layout_data.append(page_elements)
    return layout_data


def parse_pdf_content(pdf_path: str) -> List[SectionContainer]:
    """Group each page's text lines by (rounded font size, font name).

    The page structure comes straight from ``analyze_pdf_layout``; the PDF is
    not opened a second time just to count pages.

    Args:
        pdf_path: Path of the PDF to analyse.

    Returns:
        One ``SectionContainer`` per (page, rounded font size, font name)
        group whose joined text is longer than 3 characters. Its metadata has
        the keys ``page_number`` (1-based), ``font_size``, ``font_style`` and
        ``content_length``.
    """
    containers = []

    for page_num, page_elements in enumerate(analyze_pdf_layout(pdf_path), start=1):
        # Group text by font size and style, keeping first-seen order.
        font_size_hierarchy = {}
        for element in page_elements:
            key = (round(element["font_size"]), element["font_style"])
            font_size_hierarchy.setdefault(key, []).append(element["text"])

        for (font_size, font_style), texts in font_size_hierarchy.items():
            combined_text = "\n".join(texts)

            # Exclude sections with content length <= 3
            if len(combined_text) <= 3:
                continue

            containers.append(
                SectionContainer(
                    content=combined_text,
                    metadata={
                        "page_number": page_num,
                        "font_size": font_size,
                        "font_style": font_style,
                        "content_length": len(combined_text),
                    },
                )
            )

    return containers


def group_by_top_font_sizes(parsed_sections: List[SectionContainer], top_n: int = 5) -> Dict[int, Dict[str, List[Dict[str, Any]]]]:
    """Bucket sections by font size and font style, keeping the largest sizes.

    Only sections whose ``metadata["font_size"]`` is among the ``top_n``
    largest distinct sizes are kept. The result maps font size -> font style
    -> list of ``{"content", "page_number", "content_length"}`` dicts.
    """
    # Distinct sizes, largest first, cut down to the requested number.
    distinct_sizes = {sec.metadata["font_size"] for sec in parsed_sections}
    kept_sizes = sorted(distinct_sizes, reverse=True)[:top_n]

    buckets = defaultdict(lambda: defaultdict(list))
    for sec in parsed_sections:
        meta = sec.metadata
        size = meta["font_size"]
        if size not in kept_sizes:
            continue
        record = {
            "content": sec.content,
            "page_number": meta["page_number"],
            "content_length": meta["content_length"],
        }
        buckets[size][meta["font_style"]].append(record)

    return buckets


def save_grouped_sections_to_file(grouped_sections: Dict[int, Dict[str, List[Dict[str, Any]]]], output_file: str):
    """Write the grouped sections to ``output_file`` as an indented text report.

    Font sizes are listed largest first. Sections of at most 120 characters
    are written out in full; longer ones are represented by their length.
    """
    heavy_rule = "=" * 50
    light_rule = "-" * 50
    style_rule = "  " + "-" * 45

    with open(output_file, "w", encoding="utf-8") as f:
        ordered = sorted(grouped_sections.items(), reverse=True)
        for font_size, styles in ordered:
            print(f"Font Size: {font_size}", file=f)
            print(heavy_rule, file=f)
            for font_style, sections in styles.items():
                print(f"  Font Style: {font_style}", file=f)
                print(style_rule, file=f)
                for section in sections:
                    print(f"    Page Number: {section['page_number']}", file=f)

                    # Short sections are shown verbatim, long ones by size only.
                    if section["content_length"] > 120:
                        print(f"    Content Length: {section['content_length']}", file=f)
                    else:
                        print(f"    Content: {section['content']}", file=f)

                    print(file=f)
                print(style_rule, file=f)
            # The rule is followed by one blank line between font sizes.
            print(light_rule + "\n", file=f)


if __name__ == "__main__":
    pdf_path = "/content/2309.07930v1.pdf"  # Replace with your PDF path
    output_file = "/content/top_font_sizes.txt"  # Replace with desired output path
    top_n = 5  # Number of largest font sizes to keep; also used in the message below

    # Parse -> keep the largest font sizes -> write the text report.
    parsed_sections = parse_pdf_content(pdf_path)
    grouped_sections = group_by_top_font_sizes(parsed_sections, top_n=top_n)
    save_grouped_sections_to_file(grouped_sections, output_file)

    print(f"Top {top_n} font sizes saved to {output_file}")


Top 5 font sizes saved to /content/top_font_sizes.txt


In [6]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTImage, LTFigure
from pdfplumber import open as pdfplumber_open

class PDFParser:
    """Split a PDF into section records using font-based heading rules.

    The JSON config maps a heading type ("title", "section" or "subsection")
    to ``{"font_size": ..., "font_style": ...}``. Records are appended to
    ``self.result``; each is a dict with the keys "Document Name", "Title",
    "Sections Heading", "Subsections Heading", "Raw Content", "Page Number",
    "Tables" and "Image".
    """

    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""
        # (pdf_path, page_number) pairs whose tables were already extracted.
        self._table_pages = set()

    def load_config(self, path):
        """Load the configuration file containing font size and style for headings."""
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        """Return ``(average rounded font size, dominant font name)`` for a line.

        Returns ``(0, "")`` when the line holds no ``LTChar``. The font name is
        the most frequent one on the line (first seen wins ties); picking an
        element out of a set would make the result vary between runs.
        """
        font_sizes = []
        font_counts = {}

        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))  # Round off font size
                font_counts[char.fontname] = font_counts.get(char.fontname, 0) + 1

        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = max(font_counts, key=font_counts.get) if font_counts else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        """Return the first config heading type the font matches, else ``None``.

        A rule matches when the size is at least the configured size and the
        configured style is a substring of ``font_style``.
        """
        for heading_type, attributes in self.config.items():
            if avg_font_size >= attributes['font_size'] and attributes['font_style'] in font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number):
        """Return every table pdfplumber finds on the 1-based ``page_number``."""
        with pdfplumber_open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            return tables if tables else []

    def _make_entry(self, section, subsection, content, page_number, tables=None):
        """Build one output record; missing headings become empty strings."""
        return {
            "Document Name": self.doc_name,
            "Title": self.title,
            "Sections Heading": section or "",
            "Subsections Heading": subsection or "",
            "Raw Content": " ".join(content),
            "Page Number": page_number,
            "Tables": tables if tables is not None else [],
            "Image": []
        }

    def process_element(self, element, current_section, current_subsection, content, page_number, pdf_path):
        """Fold one page-level layout element into the running parse state.

        Returns the updated ``(current_section, current_subsection, content)``.
        Records are appended to ``self.result`` as headings or tables close
        off the text gathered so far; "Page Number" is the page being
        processed at that moment.
        """
        if isinstance(element, LTTextBox):
            for line in element:
                if not isinstance(line, LTTextLine):
                    continue
                avg_font_size, font_style = self.identify_font(line)
                text = line.get_text().strip()
                heading_type = self.match_heading(avg_font_size, font_style)

                if heading_type == "title":
                    # Append to the title if it's a continuation
                    self.title += " " + text if self.title else text
                elif heading_type == "section":
                    # Save previous content before starting a new section
                    if current_section or content:
                        self.result.append(
                            self._make_entry(current_section, current_subsection, content, page_number)
                        )
                    current_section = text
                    current_subsection = None
                    content = []
                elif heading_type == "subsection":
                    # Save previous content before starting a new subsection
                    if current_subsection or content:
                        self.result.append(
                            self._make_entry(current_section, current_subsection, content, page_number)
                        )
                    current_subsection = text
                    content = []
                else:
                    # Regular content
                    content.append(text)

        elif isinstance(element, LTFigure):
            # parse_table returns ALL tables of the page, so run it only once
            # per page; otherwise each extra figure repeats the same tables.
            page_key = (pdf_path, page_number)
            if page_key not in self._table_pages:
                self._table_pages.add(page_key)
                tables = self.parse_table(pdf_path, page_number)
                if tables:
                    self.result.append(
                        self._make_entry(current_section, current_subsection, content, page_number, tables)
                    )
                    content = []  # Clear content after adding table info

        elif isinstance(element, LTImage):
            # NOTE(review): pdfminer appears to nest LTImage inside LTFigure, so
            # this branch may never fire for page-level elements; confirm.
            content.append("[Image detected]")

        return current_section, current_subsection, content

    def parse_pdf(self, pdf_path):
        """Parse the PDF using pdfminer and return ``self.result``.

        Text left over after the last heading is merged into the previous
        record only when that record belongs to the same section/subsection
        (e.g. a table record). Otherwise it gets its own record, so the final
        section keeps its heading and a PDF with no headings still yields
        its text.
        """
        current_section = None
        current_subsection = None
        content = []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content = self.process_element(
                    element, current_section, current_subsection, content, page_number, pdf_path
                )

        if content:
            last_entry = self.result[-1] if self.result else None
            same_heading = (
                last_entry is not None
                and last_entry["Sections Heading"] == (current_section or "")
                and last_entry["Subsections Heading"] == (current_subsection or "")
            )
            if same_heading:
                last_entry["Raw Content"] += " " + " ".join(content)
            else:
                self.result.append(
                    self._make_entry(current_section, current_subsection, content, page_number)
                )

        return self.result

# Example usage: locations of the heading rules and of the input PDF
config_path = "/content/config.json"
pdf_path = "/content/00050107-Generative-AI-pt-ai-Zant-Kouw-Schomaker.pdf"

# Label written to the "Document Name" field of every record
doc_name = "Example Document"

# Build the parser and run it over the PDF
parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

# Persist the records as pretty-printed JSON
with open("parsed_content.json", "w") as f:
    f.write(json.dumps(parsed_content, indent=4))


In [13]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTImage, LTFigure
from pdfplumber import open as pdfplumber_open

class PDFParser:
    """Split a PDF into section records using exact font-based heading rules.

    The JSON config maps a heading type ("title", "section" or "subsection")
    to ``{"font_size": ..., "font_style": ...}``. Records are appended to
    ``self.result``; each is a dict with the keys "Document Name", "Title",
    "Sections Heading", "Subsections Heading", "Raw Content", "Page Number",
    "Tables" and "Image".
    """

    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""
        # (pdf_path, page_number) pairs whose tables were already extracted.
        self._table_pages = set()

    def load_config(self, path):
        """Load the configuration file containing font size and style for headings."""
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        """Return ``(average rounded font size, dominant font name)`` for a line.

        Returns ``(0, "")`` when the line holds no ``LTChar``. The font name is
        the most frequent one on the line (first seen wins ties); picking an
        element out of a set would make the result vary between runs.
        """
        font_sizes = []
        font_counts = {}

        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))  # Round off font size
                font_counts[char.fontname] = font_counts.get(char.fontname, 0) + 1

        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = max(font_counts, key=font_counts.get) if font_counts else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        """Return the first config heading type whose size and style both equal the line's, else ``None``."""
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number):
        """Return every table pdfplumber finds on the 1-based ``page_number``."""
        with pdfplumber_open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            return tables if tables else []

    def _make_entry(self, section, subsection, content, page_number, tables=None):
        """Build one output record; missing headings become empty strings."""
        return {
            "Document Name": self.doc_name,
            "Title": self.title,
            "Sections Heading": section or "",
            "Subsections Heading": subsection or "",
            "Raw Content": " ".join(content),
            "Page Number": page_number,
            "Tables": tables if tables is not None else [],
            "Image": []
        }

    def process_element(self, element, current_section, current_subsection, content, page_number, pdf_path):
        """Fold one page-level layout element into the running parse state.

        Returns the updated ``(current_section, current_subsection, content)``.
        Records are appended to ``self.result`` as headings or tables close
        off the text gathered so far; "Page Number" is the page being
        processed at that moment.
        """
        if isinstance(element, LTTextBox):
            for line in element:
                if not isinstance(line, LTTextLine):
                    continue
                avg_font_size, font_style = self.identify_font(line)
                text = line.get_text().strip()
                heading_type = self.match_heading(avg_font_size, font_style)

                if heading_type == "title":
                    # Append to the title if it's a continuation
                    self.title += " " + text if self.title else text
                elif heading_type == "section":
                    # Save previous content before starting a new section
                    if current_section or content:
                        self.result.append(
                            self._make_entry(current_section, current_subsection, content, page_number)
                        )
                    current_section = text
                    current_subsection = None
                    content = []
                elif heading_type == "subsection":
                    # Save previous content before starting a new subsection
                    if current_subsection or content:
                        self.result.append(
                            self._make_entry(current_section, current_subsection, content, page_number)
                        )
                    current_subsection = text
                    content = []
                else:
                    # Regular content
                    content.append(text)

        elif isinstance(element, LTFigure):
            # parse_table returns ALL tables of the page, so run it only once
            # per page; otherwise each extra figure repeats the same tables.
            page_key = (pdf_path, page_number)
            if page_key not in self._table_pages:
                self._table_pages.add(page_key)
                tables = self.parse_table(pdf_path, page_number)
                if tables:
                    self.result.append(
                        self._make_entry(current_section, current_subsection, content, page_number, tables)
                    )
                    content = []  # Clear content after adding table info

        elif isinstance(element, LTImage):
            # NOTE(review): pdfminer appears to nest LTImage inside LTFigure, so
            # this branch may never fire for page-level elements; confirm.
            content.append("[Image detected]")

        return current_section, current_subsection, content

    def parse_pdf(self, pdf_path):
        """Parse the PDF using pdfminer and return ``self.result``.

        Text left over after the last heading is merged into the previous
        record only if that record carries a table or image marker AND
        belongs to the same section/subsection. Otherwise it gets its own
        record, including when no record exists yet, so a PDF without
        headings still yields its text.
        """
        current_section = None
        current_subsection = None
        content = []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content = self.process_element(
                    element, current_section, current_subsection, content, page_number, pdf_path
                )

        if content:
            last_entry = self.result[-1] if self.result else None
            mergeable = (
                last_entry is not None
                and (last_entry["Tables"] or "[Image detected]" in last_entry["Raw Content"])
                and last_entry["Sections Heading"] == (current_section or "")
                and last_entry["Subsections Heading"] == (current_subsection or "")
            )
            if mergeable:
                last_entry["Raw Content"] += " " + " ".join(content)
            else:
                # Create a new dictionary for remaining content
                self.result.append(
                    self._make_entry(current_section, current_subsection, content, page_number)
                )

        return self.result

# Example usage: locations of the heading rules and of the input PDF
config_path = "/content/config.json"
pdf_path = "/content/Quantum Computing_ Principles and Applications.pdf"

# Label written to the "Document Name" field of every record
doc_name = "Example Document"

# Build the parser and run it over the PDF
parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

# Persist the records as pretty-printed JSON
with open("parsed_content_fixed_11.json", "w") as f:
    f.write(json.dumps(parsed_content, indent=4))


In [4]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTImage, LTFigure
from pdfplumber import open as pdfplumber_open

class PDFParser:
    """Split a PDF into section records using exact font-based heading rules.

    The JSON config maps a heading type ("title", "section" or "subsection")
    to ``{"font_size": ..., "font_style": ...}``. Records are appended to
    ``self.result``; each is a dict with the keys "Document Name", "Title",
    "Sections Heading", "Subsections Heading", "Raw Content", "Page Number",
    "Tables" and "Image". Tables found while the latest record already
    belongs to the current section/subsection are added to that record.
    """

    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""
        # (pdf_path, page_number) pairs whose tables were already extracted.
        self._table_pages = set()

    def load_config(self, path):
        """Load the configuration file containing font size and style for headings."""
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        """Return ``(average rounded font size, dominant font name)`` for a line.

        Returns ``(0, "")`` when the line holds no ``LTChar``. The font name is
        the most frequent one on the line (first seen wins ties); picking an
        element out of a set would make the result vary between runs.
        """
        font_sizes = []
        font_counts = {}

        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))  # Round off font size
                font_counts[char.fontname] = font_counts.get(char.fontname, 0) + 1

        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = max(font_counts, key=font_counts.get) if font_counts else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        """Return the first config heading type whose size and style both equal the line's, else ``None``."""
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number):
        """Return every table pdfplumber finds on the 1-based ``page_number``."""
        with pdfplumber_open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            return tables if tables else []

    def _make_entry(self, section, subsection, content, page_number, tables=None):
        """Build one output record; missing headings become empty strings."""
        return {
            "Document Name": self.doc_name,
            "Title": self.title,
            "Sections Heading": section or "",
            "Subsections Heading": subsection or "",
            "Raw Content": " ".join(content),
            "Page Number": page_number,
            "Tables": tables if tables is not None else [],
            "Image": []
        }

    def _same_heading(self, entry, current_section, current_subsection):
        """Tell whether ``entry`` belongs to the given section/subsection.

        Records store missing headings as "", while the running state uses
        ``None``, so both sides are normalised before comparing.
        """
        return (
            entry["Sections Heading"] == (current_section or "")
            and entry["Subsections Heading"] == (current_subsection or "")
        )

    def process_element(self, element, current_section, current_subsection, content, page_number, pdf_path):
        """Fold one page-level layout element into the running parse state.

        Returns the updated ``(current_section, current_subsection, content)``.
        Records are appended to ``self.result`` as headings or tables close
        off the text gathered so far; "Page Number" is the page being
        processed at that moment.
        """
        if isinstance(element, LTTextBox):
            for line in element:
                if not isinstance(line, LTTextLine):
                    continue
                avg_font_size, font_style = self.identify_font(line)
                text = line.get_text().strip()
                heading_type = self.match_heading(avg_font_size, font_style)

                if heading_type == "title":
                    # Append to the title if it's a continuation
                    self.title += " " + text if self.title else text
                elif heading_type == "section":
                    # Save previous content before starting a new section
                    if current_section or content:
                        self.result.append(
                            self._make_entry(current_section, current_subsection, content, page_number)
                        )
                    current_section = text
                    current_subsection = None
                    content = []
                elif heading_type == "subsection":
                    # Save previous content before starting a new subsection
                    if current_subsection or content:
                        self.result.append(
                            self._make_entry(current_section, current_subsection, content, page_number)
                        )
                    current_subsection = text
                    content = []
                else:
                    # Regular content
                    content.append(text)

        elif isinstance(element, LTFigure):
            # parse_table returns ALL tables of the page, so run it only once
            # per page; otherwise each extra figure repeats the same tables.
            page_key = (pdf_path, page_number)
            if page_key in self._table_pages:
                return current_section, current_subsection, content
            self._table_pages.add(page_key)

            tables = self.parse_table(pdf_path, page_number)
            if tables:
                # Append table information to the current content instead of creating a new entry
                if self.result and (current_section or current_subsection):
                    last_entry = self.result[-1]
                    if self._same_heading(last_entry, current_section, current_subsection):
                        last_entry["Tables"].extend(tables)
                        last_entry["Page Number"] = page_number  # Update page number if necessary
                        return current_section, current_subsection, content

                # If no matching section/subsection exists, create a new entry
                self.result.append(
                    self._make_entry(current_section, current_subsection, content, page_number, tables)
                )
                content = []  # Clear content after adding table info

        elif isinstance(element, LTImage):
            # NOTE(review): pdfminer appears to nest LTImage inside LTFigure, so
            # this branch may never fire for page-level elements; confirm.
            content.append("[Image detected]")

        return current_section, current_subsection, content

    def parse_pdf(self, pdf_path):
        """Parse the PDF using pdfminer and return ``self.result``.

        Text left over after the last heading is merged into the previous
        record only if that record carries a table or image marker AND
        belongs to the same section/subsection. Otherwise it gets its own
        record, including when no record exists yet, so a PDF without
        headings still yields its text.
        """
        current_section = None
        current_subsection = None
        content = []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content = self.process_element(
                    element, current_section, current_subsection, content, page_number, pdf_path
                )

        if content:
            last_entry = self.result[-1] if self.result else None
            mergeable = (
                last_entry is not None
                and (last_entry["Tables"] or "[Image detected]" in last_entry["Raw Content"])
                and self._same_heading(last_entry, current_section, current_subsection)
            )
            if mergeable:
                last_entry["Raw Content"] += " " + " ".join(content)
            else:
                # Create a new dictionary for remaining content
                self.result.append(
                    self._make_entry(current_section, current_subsection, content, page_number)
                )

        return self.result

# Example usage: locations of the heading rules and of the input PDF
config_path = "/content/config.json"
pdf_path = "/content/Quantum Computing_ Principles and Applications.pdf"

# Label written to the "Document Name" field of every record
doc_name = "Example Document"

# Build the parser and run it over the PDF
parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

# Persist the records as pretty-printed JSON
with open("parsed_content_fixed_4.json", "w") as f:
    f.write(json.dumps(parsed_content, indent=4))


In [17]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTFigure
import pdfplumber

class PDFParser:
    """Split a PDF into section records using exact font-based heading rules.

    The JSON config maps a heading type ("title", "section" or "subsection")
    to ``{"font_size": ..., "font_style": ...}``. Records are appended to
    ``self.result``; each is a dict with the keys "Document Name", "Title",
    "Sections Heading", "Subsections Heading", "Raw Content", "Page Number",
    "Tables" and "Images". Tables are collected alongside the text and
    emitted with the record of the section/subsection they were found in.
    """

    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""
        # (pdf_path, page_number) pairs whose tables were already extracted.
        self._table_pages = set()

    def load_config(self, path):
        """Read the JSON heading configuration from ``path``."""
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        """Return ``(average rounded font size, dominant font name)`` for a line.

        Returns ``(0, "")`` when the line holds no ``LTChar``. The font name is
        the most frequent one on the line (first seen wins ties); picking an
        element out of a set would make the result vary between runs.
        """
        font_sizes = []
        font_counts = {}
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_counts[char.fontname] = font_counts.get(char.fontname, 0) + 1
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = max(font_counts, key=font_counts.get) if font_counts else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        """Return the first config heading type whose size and style both equal the line's, else ``None``."""
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number):
        """Return every table pdfplumber finds on the 1-based ``page_number``."""
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            return tables if tables else []

    def _make_entry(self, section, subsection, content, tables, page_number):
        """Build one output record; missing headings become empty strings."""
        return {
            "Document Name": self.doc_name,
            "Title": self.title,
            "Sections Heading": section or "",
            "Subsections Heading": subsection or "",
            "Raw Content": " ".join(content),
            "Page Number": page_number,
            "Tables": tables,
            "Images": []
        }

    def process_element(self, element, current_section, current_subsection, content, tables, page_number, pdf_path):
        """Fold one page-level layout element into the running parse state.

        Returns the updated ``(current_section, current_subsection, content,
        tables)``. A record is appended to ``self.result`` whenever a new
        section or subsection heading closes off the text and tables gathered
        so far; "Page Number" is the page being processed at that moment.
        """
        if isinstance(element, LTTextBox):
            for line in element:
                if not isinstance(line, LTTextLine):
                    continue
                avg_font_size, font_style = self.identify_font(line)
                text = line.get_text().strip()
                heading_type = self.match_heading(avg_font_size, font_style)

                if heading_type == "title":
                    # Consecutive title lines are joined with a space.
                    self.title += " " + text if self.title else text
                elif heading_type == "section":
                    if current_section or content or tables:
                        self.result.append(
                            self._make_entry(current_section, current_subsection, content, tables, page_number)
                        )
                    current_section = text
                    current_subsection = None
                    content, tables = [], []
                elif heading_type == "subsection":
                    if current_subsection or content or tables:
                        self.result.append(
                            self._make_entry(current_section, current_subsection, content, tables, page_number)
                        )
                    current_subsection = text
                    content, tables = [], []
                else:
                    content.append(text)

        elif isinstance(element, LTFigure):
            # parse_table returns ALL tables of the page, so run it only once
            # per page; otherwise each extra figure repeats the same tables.
            page_key = (pdf_path, page_number)
            if page_key not in self._table_pages:
                self._table_pages.add(page_key)
                tables.extend(self.parse_table(pdf_path, page_number))

        return current_section, current_subsection, content, tables

    def parse_pdf(self, pdf_path):
        """Parse the PDF with pdfminer and return ``self.result``.

        After the last page, whatever heading, text or tables are still
        pending are emitted as a final record.
        """
        current_section = None
        current_subsection = None
        content, tables = [], []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content, tables = self.process_element(
                    element, current_section, current_subsection, content, tables, page_number, pdf_path
                )

        if current_section or current_subsection or content or tables:
            self.result.append(
                self._make_entry(current_section, current_subsection, content, tables, page_number)
            )

        return self.result

# Example usage: heading rules, input PDF and the label stored in each record
config_path = "/content/config.json"
pdf_path = "/content/2309.07930v1.pdf"
doc_name = "Example Document"

# Build the parser and run it over the PDF
parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

# Persist the records as pretty-printed JSON
with open("/content/parsed_content_13.json", "w") as f:
    f.write(json.dumps(parsed_content, indent=4))
