In [1]:
pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/5.6 MB[0m [31m60.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.6/5.6 MB[0m [31m84.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20240706


In [2]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTTextLineHorizontal, LTChar
from typing import List, Dict, Any
from collections import defaultdict


class SectionContainer:
    """Container to store parsed content and metadata."""
    def __init__(self, content: str, metadata: Dict[str, Any]):
        self.content = content
        self.metadata = metadata

    def __repr__(self):
        return f"SectionContainer(metadata={self.metadata}, content_length={len(self.content)})"


def analyze_pdf_layout(pdf_path: str):
    """Analyzes the layout of a PDF to understand its structure."""
    layout_data = []
    for page_layout in extract_pages(pdf_path):
        page_elements = []
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                for line in element:
                    if isinstance(line, LTTextLineHorizontal):
                        font_size = max(
                            (char.size for char in line if isinstance(char, LTChar)),
                            default=0,
                        )
                        font_name = max(
                            (char.fontname for char in line if isinstance(char, LTChar)),
                            default="",
                        )
                        text = line.get_text().strip()
                        page_elements.append({"text": text, "font_size": font_size, "font_style": font_name})
        layout_data.append(page_elements)
    return layout_data


def parse_pdf_content(pdf_path: str) -> List[SectionContainer]:
    """Parses PDF content and organizes it into containers with metadata."""
    containers = []
    layout_analysis = analyze_pdf_layout(pdf_path)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            metadata = {"page_number": page_num}

            # Group text by font size and style
            font_size_hierarchy = {}
            for element in layout_analysis[page_num - 1]:
                if "text" in element:
                    rounded_font_size = round(element["font_size"])
                    key = (rounded_font_size, element["font_style"])
                    font_size_hierarchy.setdefault(key, []).append(element["text"])

            for (font_size, font_style), texts in font_size_hierarchy.items():
                combined_text = "\n".join(texts)
                metadata["font_size"] = font_size
                metadata["font_style"] = font_style
                metadata["content_length"] = len(combined_text)

                # Exclude sections with content length <= 3
                if len(combined_text) > 3:
                    containers.append(SectionContainer(content=combined_text, metadata=metadata.copy()))

    return containers


def group_by_top_font_sizes(parsed_sections: List[SectionContainer], top_n: int = 5) -> Dict[int, Dict[str, List[Dict[str, Any]]]]:
    """Groups sections by rounded font size and font style, keeping only the top N font sizes."""
    grouped_data = defaultdict(lambda: defaultdict(list))

    # Collect all unique font sizes
    font_sizes = sorted(
        {section.metadata["font_size"] for section in parsed_sections},
        reverse=True,
    )

    # Select the top N font sizes
    top_font_sizes = font_sizes[:top_n]

    for section in parsed_sections:
        font_size = section.metadata["font_size"]
        if font_size in top_font_sizes:
            font_style = section.metadata["font_style"]
            grouped_data[font_size][font_style].append({
                "content": section.content,
                "page_number": section.metadata["page_number"],
                "content_length": section.metadata["content_length"],
            })

    return grouped_data


def save_grouped_sections_to_file(grouped_sections: Dict[int, Dict[str, List[Dict[str, Any]]]], output_file: str):
    """Saves grouped sections to a text file."""
    with open(output_file, "w", encoding="utf-8") as f:
        for font_size, styles in sorted(grouped_sections.items(), reverse=True):
            f.write(f"Font Size: {font_size}\n")
            f.write("=" * 50 + "\n")
            for font_style, sections in styles.items():
                f.write(f"  Font Style: {font_style}\n")
                f.write("  " + "-" * 45 + "\n")
                for section in sections:
                    f.write(f"    Page Number: {section['page_number']}\n")

                    # Include content or content length conditionally
                    if section["content_length"] <= 120:
                        f.write(f"    Content: {section['content']}\n")
                    else:
                        f.write(f"    Content Length: {section['content_length']}\n")

                    f.write("\n")
                f.write("  " + "-" * 45 + "\n")
            f.write("-" * 50 + "\n\n")


if __name__ == "__main__":
    pdf_path = "/content/1807.01544v2-2.pdf"  # Replace with your PDF path
    output_file = "/content/top_font_sizes.txt"  # Replace with desired output path

    parsed_sections = parse_pdf_content(pdf_path)
    grouped_sections = group_by_top_font_sizes(parsed_sections, top_n=5)
    save_grouped_sections_to_file(grouped_sections, output_file)

    print(f"Top 5 font sizes saved to {output_file}")


Top 5 font sizes saved to /content/top_font_sizes.txt


In [6]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTImage, LTFigure
from pdfplumber import open as pdfplumber_open

class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        """Load the configuration file containing font size and style for headings."""
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        """Identify the average font size and style for a line."""
        font_sizes = []
        font_styles = set()

        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))  # Round off font size
                font_styles.add(char.fontname)

        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0  # Round average font size
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        """Match the font size and style with the config to classify the heading."""
        for heading_type, attributes in self.config.items():
            if avg_font_size >= attributes['font_size'] and attributes['font_style'] in font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number):
        """Parse table content using pdfplumber."""
        with pdfplumber_open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            return tables if tables else []

    def process_element(self, element, current_section, current_subsection, content, page_number, pdf_path):
        """Process each element (text, table, or image) and store it appropriately."""
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        # Append to the title if it's a continuation
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        # Save previous content before starting a new section
                        if current_section or content:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section or "",
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": [],
                                "Image": []
                            })
                        current_section = text
                        current_subsection = None
                        content = []
                    elif heading_type == "subsection":
                        # Save previous content before starting a new subsection
                        if current_subsection or content:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section,
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": [],
                                "Image": []
                            })
                        current_subsection = text
                        content = []
                    else:
                        # Regular content
                        content.append(text)

        elif isinstance(element, LTFigure):
            # Handle figures (e.g., tables or images)
            tables = self.parse_table(pdf_path, page_number)
            if tables:
                # Append table information to the current content without creating a separate entry
                self.result.append({
                    "Document Name": self.doc_name,
                    "Title": self.title,
                    "Sections Heading": current_section or "",
                    "Subsections Heading": current_subsection or "",
                    "Raw Content": " ".join(content),
                    "Page Number": page_number,
                    "Tables": tables,
                    "Image": []
                })
                content = []  # Clear content after adding table info

        elif isinstance(element, LTImage):
            # Handle images
            content.append("[Image detected]")

        return current_section, current_subsection, content

    def parse_pdf(self, pdf_path):
        """Parse the PDF using pdfminer and include page numbers."""
        current_section = None
        current_subsection = None
        content = []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content = self.process_element(
                    element, current_section, current_subsection, content, page_number, pdf_path
                )

        # Merge last content into the last dictionary
        if self.result and content:
            self.result[-1]["Raw Content"] += " " + " ".join(content)

        return self.result

# Example usage
config_path = "/content/config.json"  # Path to the config file
pdf_path = "/content/00050107-Generative-AI-pt-ai-Zant-Kouw-Schomaker.pdf"  # Path to the PDF

# Document name for identification
doc_name = "Example Document"

# Initialize and parse
parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

# Save the parsed content to a JSON file
with open("parsed_content.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


In [13]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTImage, LTFigure
from pdfplumber import open as pdfplumber_open

class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        """Load the configuration file containing font size and style for headings."""
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        """Identify the average font size and style for a line."""
        font_sizes = []
        font_styles = set()

        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))  # Round off font size
                font_styles.add(char.fontname)

        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0  # Round average font size
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        """Match the font size and style with the config to classify the heading."""
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number):
        """Parse table content using pdfplumber."""
        with pdfplumber_open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            return tables if tables else []

    def process_element(self, element, current_section, current_subsection, content, page_number, pdf_path):
        """Process each element (text, table, or image) and store it appropriately."""
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        # Append to the title if it's a continuation
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        # Save previous content before starting a new section
                        if current_section or content:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section or "",
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": [],
                                "Image": []
                            })
                        current_section = text
                        current_subsection = None
                        content = []
                    elif heading_type == "subsection":
                        # Save previous content before starting a new subsection
                        if current_subsection or content:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section,
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": [],
                                "Image": []
                            })
                        current_subsection = text
                        content = []
                    else:
                        # Regular content
                        content.append(text)

        elif isinstance(element, LTFigure):
            # Handle figures (e.g., tables or images)
            tables = self.parse_table(pdf_path, page_number)
            if tables:
                # Append table information to the current content without creating a separate entry
                self.result.append({
                    "Document Name": self.doc_name,
                    "Title": self.title,
                    "Sections Heading": current_section or "",
                    "Subsections Heading": current_subsection or "",
                    "Raw Content": " ".join(content),
                    "Page Number": page_number,
                    "Tables": tables,
                    "Image": []
                })
                content = []  # Clear content after adding table info

        elif isinstance(element, LTImage):
            # Handle images
            content.append("[Image detected]")

        return current_section, current_subsection, content

    def parse_pdf(self, pdf_path):
        """Parse the PDF using pdfminer and include page numbers."""
        current_section = None
        current_subsection = None
        content = []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content = self.process_element(
                    element, current_section, current_subsection, content, page_number, pdf_path
                )

        # Merge last content into the last dictionary only if it contains a table or image
        if self.result and content:
            last_entry = self.result[-1]
            if last_entry["Tables"] or "[Image detected]" in last_entry["Raw Content"]:
                last_entry["Raw Content"] += " " + " ".join(content)
            else:
                # Create a new dictionary for remaining content
                self.result.append({
                    "Document Name": self.doc_name,
                    "Title": self.title,
                    "Sections Heading": current_section or "",
                    "Subsections Heading": current_subsection or "",
                    "Raw Content": " ".join(content),
                    "Page Number": page_number,
                    "Tables": [],
                    "Image": []
                })

        return self.result

# Example usage
config_path = "/content/config.json"  # Path to the config file
pdf_path = "/content/Quantum Computing_ Principles and Applications.pdf"  # Path to the PDF

# Document name for identification
doc_name = "Example Document"

# Initialize and parse
parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

# Save the parsed content to a JSON file
with open("parsed_content_fixed_11.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


In [4]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTImage, LTFigure
from pdfplumber import open as pdfplumber_open

class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        """Load the configuration file containing font size and style for headings."""
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        """Identify the average font size and style for a line."""
        font_sizes = []
        font_styles = set()

        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))  # Round off font size
                font_styles.add(char.fontname)

        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0  # Round average font size
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        """Match the font size and style with the config to classify the heading."""
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number):
        """Parse table content using pdfplumber."""
        with pdfplumber_open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            return tables if tables else []

    def process_element(self, element, current_section, current_subsection, content, page_number, pdf_path):
        """Process each element (text, table, or image) and store it appropriately."""
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        # Append to the title if it's a continuation
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        # Save previous content before starting a new section
                        if current_section or content:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section or "",
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": [],
                                "Image": []
                            })
                        current_section = text
                        current_subsection = None
                        content = []
                    elif heading_type == "subsection":
                        # Save previous content before starting a new subsection
                        if current_subsection or content:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section,
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": [],
                                "Image": []
                            })
                        current_subsection = text
                        content = []
                    else:
                        # Regular content
                        content.append(text)

        elif isinstance(element, LTFigure):
            # Handle figures (e.g., tables or images)
            tables = self.parse_table(pdf_path, page_number)
            if tables:
                # Append table information to the current content instead of creating a new entry
                if self.result and (current_section or current_subsection):
                    last_entry = self.result[-1]
                    if last_entry["Sections Heading"] == current_section and last_entry["Subsections Heading"] == current_subsection:
                        last_entry["Tables"].extend(tables)
                        last_entry["Page Number"] = page_number  # Update page number if necessary
                        return current_section, current_subsection, content

                # If no matching section/subsection exists, create a new entry
                self.result.append({
                    "Document Name": self.doc_name,
                    "Title": self.title,
                    "Sections Heading": current_section or "",
                    "Subsections Heading": current_subsection or "",
                    "Raw Content": " ".join(content),
                    "Page Number": page_number,
                    "Tables": tables,
                    "Image": []
                })
                content = []  # Clear content after adding table info

        elif isinstance(element, LTImage):
            # Handle images
            content.append("[Image detected]")

        return current_section, current_subsection, content

    def parse_pdf(self, pdf_path):
        """Parse the PDF using pdfminer and include page numbers."""
        current_section = None
        current_subsection = None
        content = []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content = self.process_element(
                    element, current_section, current_subsection, content, page_number, pdf_path
                )

        # Merge last content into the last dictionary only if it contains a table or image
        if self.result and content:
            last_entry = self.result[-1]
            if last_entry["Tables"] or "[Image detected]" in last_entry["Raw Content"]:
                last_entry["Raw Content"] += " " + " ".join(content)
            else:
                # Create a new dictionary for remaining content
                self.result.append({
                    "Document Name": self.doc_name,
                    "Title": self.title,
                    "Sections Heading": current_section or "",
                    "Subsections Heading": current_subsection or "",
                    "Raw Content": " ".join(content),
                    "Page Number": page_number,
                    "Tables": [],
                    "Image": []
                })

        return self.result

# Example usage
config_path = "/content/config.json"  # Path to the config file
pdf_path = "/content/Quantum Computing_ Principles and Applications.pdf"  # Path to the PDF

# Document name for identification
doc_name = "Example Document"

# Initialize and parse
parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

# Save the parsed content to a JSON file
with open("parsed_content_fixed.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


In [6]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTFigure,LTRect
import pdfplumber

class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        font_sizes = []
        font_styles = set()
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_styles.add(char.fontname)
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number):
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            return tables if tables else []

    def process_element(self, element, current_section, current_subsection, content, tables, page_number, pdf_path):
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        if current_section or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section or "",
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_section = text
                        current_subsection = None
                        content, tables = [], []
                    elif heading_type == "subsection":
                        if current_subsection or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section,
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_subsection = text
                        content, tables = [], []
                    else:
                        content.append(text)

        elif isinstance(element, LTRect):
            new_tables = self.parse_table(pdf_path, page_number)
            tables.extend(new_tables)

        return current_section, current_subsection, content, tables

    def parse_pdf(self, pdf_path):
        current_section = None
        current_subsection = None
        content, tables = [], []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content, tables = self.process_element(
                    element, current_section, current_subsection, content, tables, page_number, pdf_path
                )

        if current_section or current_subsection or content or tables:
            self.result.append({
                "Document Name": self.doc_name,
                "Title": self.title,
                "Sections Heading": current_section or "",
                "Subsections Heading": current_subsection or "",
                "Raw Content": " ".join(content),
                "Page Number": page_number,
                "Tables": tables,
                "Images": []
            })

        return self.result

# Example usage
config_path = "/content/config.json"
pdf_path = "/content/Quantum Computing_ Principles and Applications.pdf"
doc_name = "Example Document"

parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

with open("/content/parsed_content_13.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


In [12]:
import pdfplumber
pdf_path="/content/Quantum Computing_ Principles and Applications.pdf"
all_tables = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        tables = page.extract_tables()  # Extract tables from the current page
        if tables:
            all_tables.extend(tables)
            print(page.page_number)

8
9
10
21


In [13]:
all_tables

[[['𝜓\n𝑖𝑛1', '𝜓\n𝑖𝑛2', '𝜓\n𝑜𝑢𝑡1', '𝜓\n𝑜𝑢𝑡2'],
  ['0', '0', '0', '0'],
  ['0', '1', '0', '1'],
  ['1', '0', '1', '1'],
  ['1', '1', '1', '0']],
 [['𝐼𝑁\n1', '𝐼𝑁\n2', '𝑂𝑢𝑡'],
  ['0', '0', '1'],
  ['0', '1', '1'],
  ['1', '0', '1'],
  ['1', '1', '0']],
 [['𝜓\n𝑖𝑛1', '𝜓\n𝑖𝑛2', '𝜓\n𝑖𝑛3', '𝜓\n𝑜𝑢𝑡1', '𝜓\n𝑜𝑢𝑡2', '𝜓\n𝑜𝑢𝑡3'],
  ['0', '0', '1', '0', '0', '1'],
  ['0', '1', '1', '0', '1', '1'],
  ['1', '0', '1', '1', '0', '1'],
  ['1', '1', '1', '1', '1', '0']],
 [['𝜓\n𝑖𝑛1',
   None,
   None,
   '𝜓\n𝑖𝑛2',
   None,
   None,
   '𝜓\n𝑖𝑛3',
   None,
   None,
   '𝜓\n𝑜𝑢𝑡1',
   None,
   None,
   '𝜓\n𝑜𝑢𝑡2',
   None,
   None,
   '𝜓\n𝑜𝑢𝑡3',
   None,
   None],
  ['0',
   None,
   None,
   '0',
   None,
   None,
   '0',
   None,
   None,
   '0',
   None,
   None,
   '0',
   None,
   None,
   '0',
   None,
   None],
  ['',
   '0',
   '',
   '',
   '0',
   '',
   '',
   '1',
   '',
   '',
   '0',
   '',
   '',
   '0',
   '',
   '',
   '1',
   ''],
  ['0',
   None,
   None,
   '1',
   None,
   None,
   '0',
   None

In [10]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTFigure
import pdfplumber
import camelot

class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        font_sizes = []
        font_styles = set()
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_styles.add(char.fontname)
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table_with_pdfplumber(self, pdf_path, page_number):
        """Extract tables using pdfplumber."""
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables(
                table_settings={
                    "vertical_strategy": "lines",
                    "horizontal_strategy": "lines",
                    "intersection_tolerance": 5,
                }
            )
            return tables if tables else []

    def parse_table_with_camelot(self, pdf_path, page_number):
        """Extract tables using Camelot."""
        tables = camelot.read_pdf(
            pdf_path, pages=str(page_number), flavor="stream"
        )  # Use "stream" for line-based tables
        return [table.df.values.tolist() for table in tables] if tables else []

    def parse_tables(self, pdf_path, page_number):
        """Combine both methods for table extraction."""
        tables = self.parse_table_with_pdfplumber(pdf_path, page_number)
        if not tables:
            tables = self.parse_table_with_camelot(pdf_path, page_number)
        return tables

    def process_element(self, element, current_section, current_subsection, content, tables, page_number, pdf_path):
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        if current_section or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section or "",
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_section = text
                        current_subsection = None
                        content, tables = [], []
                    elif heading_type == "subsection":
                        if current_subsection or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section,
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_subsection = text
                        content, tables = [], []
                    else:
                        content.append(text)

        elif isinstance(element, LTFigure):
            new_tables = self.parse_tables(pdf_path, page_number)
            tables.extend(new_tables)

        return current_section, current_subsection, content, tables

    def parse_pdf(self, pdf_path):
        current_section = None
        current_subsection = None
        content, tables = [], []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content, tables = self.process_element(
                    element, current_section, current_subsection, content, tables, page_number, pdf_path
                )

        if current_section or current_subsection or content or tables:
            self.result.append({
                "Document Name": self.doc_name,
                "Title": self.title,
                "Sections Heading": current_section or "",
                "Subsections Heading": current_subsection or "",
                "Raw Content": " ".join(content),
                "Page Number": page_number,
                "Tables": tables,
                "Images": []
            })

        return self.result

# Example usage
config_path = "/content/config.json"
pdf_path = "/content/2309.07930v1.pdf"
doc_name = "Example Document"

parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

with open("/content/parsed_content_combined.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


In [11]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTFigure
import pdfplumber
import camelot

class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        font_sizes = []
        font_styles = set()
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_styles.add(char.fontname)
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table_with_pdfplumber(self, pdf_path, page_number):
        """Extract tables using pdfplumber."""
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables(
                table_settings={
                    "vertical_strategy": "lines",
                    "horizontal_strategy": "lines",
                    "intersection_tolerance": 5,
                }
            )
            return tables if tables else []

    def parse_table_with_camelot(self, pdf_path, page_number):
        """Extract tables using Camelot."""
        tables = camelot.read_pdf(
            pdf_path, pages=str(page_number), flavor="stream"
        )  # Use "stream" for line-based tables
        return [table.df.values.tolist() for table in tables] if tables else []

    def parse_tables(self, pdf_path, page_number):
        """Combine both methods for table extraction."""
        tables = self.parse_table_with_pdfplumber(pdf_path, page_number)
        if not tables:
            tables = self.parse_table_with_camelot(pdf_path, page_number)
        return tables

    def process_element(self, element, current_section, current_subsection, content, tables, page_number, pdf_path):
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        if current_section or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section or "",
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_section = text
                        current_subsection = None
                        content, tables = [], []
                    elif heading_type == "subsection":
                        if current_subsection or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section,
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_subsection = text
                        content, tables = [], []
                    else:
                        content.append(text)

        elif isinstance(element, LTFigure):
            # Recursively analyze nested elements within the figure
            for sub_element in element:
                if isinstance(sub_element, LTTextBox):
                    for line in sub_element:
                        if isinstance(line, LTTextLine):
                            text = line.get_text().strip()
                            content.append(text)

            # Attempt table extraction for the current page using pdfplumber and camelot
            new_tables = self.parse_tables(pdf_path, page_number)
            if new_tables:
                tables.extend(new_tables)

        return current_section, current_subsection, content, tables

    def parse_pdf(self, pdf_path):
        current_section = None
        current_subsection = None
        content, tables = [], []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content, tables = self.process_element(
                    element, current_section, current_subsection, content, tables, page_number, pdf_path
                )

        if current_section or current_subsection or content or tables:
            self.result.append({
                "Document Name": self.doc_name,
                "Title": self.title,
                "Sections Heading": current_section or "",
                "Subsections Heading": current_subsection or "",
                "Raw Content": " ".join(content),
                "Page Number": page_number,
                "Tables": tables,
                "Images": []
            })

        return self.result

# Example usage
config_path = "/content/config.json"
pdf_path = "/content/2309.07930v1.pdf"
doc_name = "Example Document"

parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

with open("/content/parsed_content_combined_1.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


In [4]:
all_tables

[[['a\nTraining Discriminator\nsamples\nreal/fake?'],
  ['Generator\nz\nTraining\nReward model RLHF\nHuman preference\ndata\nfeedback\nSelf-supervised Supervised Conversational\nLanguage model\nlearning fine-tuning model\ning Fine-tuning\nmplesofdifferenttrainingproceduresforgenerativeAImodels. (a)Generativeadve\nN)wherezisrandominput. (b)Reinforcementlearningfromhumanfeedback(RL\nsationalgenerativeAImodels.']],
 [['', 'Reward model\nHuman preference\ndata']],
 [['', 'Self-supervised\nLanguage model\nlearning']]]

In [7]:
pip install camelot-py

Collecting camelot-py
  Downloading camelot_py-1.0.0-py3-none-any.whl.metadata (9.4 kB)
Collecting pdfminer-six>=20240706 (from camelot-py)
  Using cached pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdf<4.0,>=3.17 (from camelot-py)
  Downloading pypdf-3.17.4-py3-none-any.whl.metadata (7.5 kB)
Downloading camelot_py-1.0.0-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
Downloading pypdf-3.17.4-py3-none-any.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf, pdfminer-six, camelot-py
  Attempting uninstall: pdfminer-six
    Found existing installation: pdfminer.six 20231228
    Uninstalling pdfminer.six-20231228:
      Successfully uninstalled pdfminer.six-20231228
[31mERROR: pip'

In [8]:
import camelot
pdf_path="/content/2309.07930v1.pdf"
page_number = 6
tables = camelot.read_pdf(pdf_path, pages=str(page_number), flavor='stream')
for i, table in enumerate(tables):
    print(f"Table {i}:\n", table.df)
    # Optionally, save the table to CSV
    table.to_csv(f"/content/table_{i}.csv")

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


Table 0:
                                    0  \
0                            Concept   
1     Diffusion probabilistic models   
2                                      
3                                      
4                                      
5                                      
6     Generative adversarial network   
7                                      
8                                      
9                                      
10                                     
11                                     
12            (Large) language model   
13                                     
14                                     
15                                     
16                                     
17                                     
18                                     
19     Reinforcement\nlearning\nfrom   
20                    human feedback   
21                                     
22                                     
23                            

In [4]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar, LTFigure, LTRect
import pdfplumber

class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        font_sizes = []
        font_styles = set()
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_styles.add(char.fontname)
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def parse_table(self, pdf_path, page_number, table_num):
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            tables = page.extract_tables()
            if tables and table_num < len(tables):
                return tables[table_num]
        return []

    def table_converter(self, table):
        table_string = ''
        for row in table:
            cleaned_row = [
                item.replace('\n', ' ') if item and '\n' in item else 'None' if item is None else item
                for item in row
            ]
            table_string += '|' + '|'.join(cleaned_row) + '|\n'
        return table_string.strip()

    def process_element(self, element, current_section, current_subsection, content, tables, page_number, pdf_path):
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        if current_section or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section or "",
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_section = text
                        current_subsection = None
                        content, tables = [], []
                    elif heading_type == "subsection":
                        if current_subsection or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section,
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_subsection = text
                        content, tables = [], []
                    else:
                        content.append(text)

        # elif isinstance(element, LTFigure):
        #     # Fallback for tables in figures
        #     new_tables = self.parse_table(pdf_path, page_number, 0)
        #     tables.extend(new_tables)

        elif isinstance(element, LTRect):
            # Custom logic for table boundaries
            new_tables = self.parse_table(pdf_path, page_number, 0)  # Adjust table_num as needed
            # for table in new_tables:
            #     table_string = self.table_converter(table)
            tables.append(new_tables)

        return current_section, current_subsection, content, tables

    def parse_pdf(self, pdf_path):
        current_section = None
        current_subsection = None
        content, tables = [], []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content, tables = self.process_element(
                    element, current_section, current_subsection, content, tables, page_number, pdf_path
                )

        if current_section or current_subsection or content or tables:
            self.result.append({
                "Document Name": self.doc_name,
                "Title": self.title,
                "Sections Heading": current_section or "",
                "Subsections Heading": current_subsection or "",
                "Raw Content": " ".join(content),
                "Page Number": page_number,
                "Tables": tables,
                "Images": []
            })

        return self.result

# Example usage
config_path = "/content/config.json"
pdf_path = "/content/Quantum Computing_ Principles and Applications.pdf"
doc_name = "Example Document"

parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

with open("/content/parsed_content_17.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


KeyboardInterrupt: 

In [5]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar
import pdfplumber

class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        font_sizes = []
        font_styles = set()
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_styles.add(char.fontname)
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def extract_tables_from_pdf(self, pdf_path):
        """Extract tables and their locations using pdfplumber."""
        table_data = {}
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables()
                if tables:
                    table_data[page_number] = [
                        {"content": table, "bbox": page.bbox} for table in tables
                    ]
        return table_data

    def assign_tables_to_sections(self, page_number, table_data, current_section, current_subsection, content, page_layout):
        """Associate extracted tables with sections or subsections based on spatial proximity."""
        if page_number not in table_data:
            return content, []

        assigned_tables = []
        unprocessed_content = list(content)  # Copy of raw content

        for table in table_data[page_number]:
            # Optionally refine the association logic based on specific bounding box relationships
            assigned_tables.append(table["content"])

            # Optionally remove table-like content from raw content
            # (Further customization can be added here)

        return unprocessed_content, assigned_tables

    # def assign_tables_to_sections(self, page_number, table_data, current_section, current_subsection, content, page_layout):
    #     """Associate extracted tables with sections or subsections based on spatial proximity and remove duplicates."""
    #     if page_number not in table_data:
    #         return content, []

    #     assigned_tables = []
    #     updated_content = []

    #     for table in table_data[page_number]:
    #         table_bbox = table["bbox"]
    #         assigned_tables.append(table["content"])

    #         # Remove table-like content from raw_content by checking overlap
    #         for element in page_layout:
    #             if isinstance(element, LTTextBox):
    #                 for line in element:
    #                     if isinstance(line, LTTextLine):
    #                         line_bbox = line.bbox
    #                         if not (
    #                             line_bbox[0] >= table_bbox[0]
    #                             and line_bbox[1] >= table_bbox[1]
    #                             and line_bbox[2] <= table_bbox[2]
    #                             and line_bbox[3] <= table_bbox[3]
    #                         ):
    #                             updated_content.append(line.get_text().strip())

    #     return updated_content, assigned_tables



    def process_element(self, element, current_section, current_subsection, content, tables, page_number, table_data, page_layout):
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        if current_section or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section or "",
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_section = text
                        current_subsection = None
                        content, tables = [], []
                    elif heading_type == "subsection":
                        if current_subsection or content or tables:
                            self.result.append({
                                "Document Name": self.doc_name,
                                "Title": self.title,
                                "Sections Heading": current_section,
                                "Subsections Heading": current_subsection or "",
                                "Raw Content": " ".join(content),
                                "Page Number": page_number,
                                "Tables": tables,
                                "Images": []
                            })
                        current_subsection = text
                        content, tables = [], []
                    else:
                        content.append(text)

        return current_section, current_subsection, content, tables

    def parse_pdf(self, pdf_path):
        table_data = self.extract_tables_from_pdf(pdf_path)
        current_section = None
        current_subsection = None
        content, tables = [], []

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                current_section, current_subsection, content, tables = self.process_element(
                    element, current_section, current_subsection, content, tables, page_number, table_data, page_layout
                )

            # After processing all elements on the page, assign tables to sections/subsections
            content, page_tables = self.assign_tables_to_sections(
                page_number, table_data, current_section, current_subsection, content, page_layout
            )
            tables.extend(page_tables)

        if current_section or current_subsection or content or tables:
            self.result.append({
                "Document Name": self.doc_name,
                "Title": self.title,
                "Sections Heading": current_section or "",
                "Subsections Heading": current_subsection or "",
                "Raw Content": " ".join(content),
                "Page Number": page_number,
                "Tables": tables,
                "Images": []
            })

        return self.result

# Example usage
config_path = "/content/config.json"
pdf_path = "/content/1807.01544v2-2.pdf"
doc_name = "Example Document"

parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

with open("/content/parsed_content_19.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


In [5]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar
import pdfplumber


class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        font_sizes = []
        font_styles = set()
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_styles.add(char.fontname)
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def extract_tables_from_pdf(self, pdf_path):
        """Extract tables and their locations using pdfplumber."""
        table_data = {}
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables()
                if tables:
                    table_data[page_number] = [
                        {"content": table, "bbox": page.bbox} for table in tables
                    ]
        return table_data

    def process_element(self, element, current_section, current_subsection, content, tables, page_number):
        """Process individual PDF elements to determine structure and content."""
        if isinstance(element, LTTextBox):
            for line in element:
                if isinstance(line, LTTextLine):
                    avg_font_size, font_style = self.identify_font(line)
                    text = line.get_text().strip()
                    heading_type = self.match_heading(avg_font_size, font_style)

                    if heading_type == "title":
                        self.title += " " + text if self.title else text
                    elif heading_type == "section":
                        if current_section or content or tables:
                            self.add_section(current_section, current_subsection, content, tables, page_number)
                        current_section = text
                        current_subsection = None
                        content, tables = [], []
                    elif heading_type == "subsection":
                        if current_subsection or content or tables:
                            self.add_section(current_section, current_subsection, content, tables, page_number)
                        current_subsection = text
                        content, tables = [], []
                    else:
                        content.append(text)

        return current_section, current_subsection, content, tables

    def add_section(self, current_section, current_subsection, content, tables, page_number):
        """Add a completed section or subsection to the result."""
        self.result.append({
            "Document Name": self.doc_name,
            "Title": self.title,
            "Sections Heading": current_section or "",
            "Subsections Heading": current_subsection or "",
            "Raw Content": " ".join(content),
            "Page Number": page_number,
            "Tables": tables,
            "Images": []
        })

    def parse_pdf(self, pdf_path):
        table_data = self.extract_tables_from_pdf(pdf_path)
        current_section = None
        current_subsection = None
        content, tables = [], []
        section_headings = []  # Stores (bbox, text) for sections
        subsection_headings = []  # Stores (bbox, text) for subsections

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                if isinstance(element, LTTextBox):
                    for line in element:
                        if isinstance(line, LTTextLine):
                            avg_font_size, font_style = self.identify_font(line)
                            text = line.get_text().strip()
                            heading_type = self.match_heading(avg_font_size, font_style)

                            if heading_type == "section":
                                if current_section or content or tables:
                                    self.add_section(current_section, current_subsection, content, tables, page_number)
                                current_section = text
                                current_subsection = None
                                section_headings.append((line.bbox, text))
                                content, tables = [], []
                            elif heading_type == "subsection":
                                if current_subsection or content or tables:
                                    self.add_section(current_section, current_subsection, content, tables, page_number)
                                current_subsection = text
                                subsection_headings.append((line.bbox, text))
                                content, tables = [], []
                            else:
                                content.append(text)

            # Assign tables to sections/subsections for the current page
            table_assignments = self.assign_tables_to_sections(page_number, table_data, section_headings, subsection_headings)
            for heading, tables_in_heading in table_assignments.items():
                if heading == current_subsection:
                    tables.extend(tables_in_heading)
                elif heading == current_section and not current_subsection:
                    tables.extend(tables_in_heading)

        # Finalize the last section or subsection
        if current_section or current_subsection or content or tables:
            self.add_section(current_section, current_subsection, content, tables, page_number)

        return self.result



# Example usage
config_path = "/content/config.json"
pdf_path = "/content/1807.01544v2-2.pdf"
doc_name = "Example Document"

parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

with open("/content/parsed_content_20.json", "w") as f:
    json.dump(parsed_content, f, indent=4)




AttributeError: 'PDFParser' object has no attribute 'assign_tables_to_sections'

In [6]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar
import pdfplumber


class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        font_sizes = []
        font_styles = set()
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_styles.add(char.fontname)
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def extract_tables_from_pdf(self, pdf_path):
        """Extract tables and their locations using pdfplumber."""
        table_data = {}
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables()
                if tables:
                    table_data[page_number] = [
                        {"content": table, "bbox": page.bbox} for table in tables
                    ]
        return table_data

    def assign_tables_to_sections(self, page_number, table_data, section_headings, subsection_headings):
        """Assign tables to sections/subsections based on proximity and page number."""
        assigned_tables = {}
        if page_number not in table_data:
            return assigned_tables

        for table in table_data[page_number]:
            table_bbox = table["bbox"]

            # Find the closest section or subsection heading based on bounding boxes
            closest_heading = None
            closest_distance = float("inf")
            for heading_bbox, heading_text in section_headings + subsection_headings:
                # Compute vertical distance between table and heading
                distance = abs(heading_bbox[1] - table_bbox[1])
                if distance < closest_distance:
                    closest_heading = heading_text
                    closest_distance = distance

            # Assign table to the closest heading
            if closest_heading:
                assigned_tables.setdefault(closest_heading, []).append(table["content"])

        return assigned_tables

    def add_section(self, current_section, current_subsection, content, tables, page_number):
        """Add a completed section or subsection to the result."""
        self.result.append({
            "Document Name": self.doc_name,
            "Title": self.title,
            "Sections Heading": current_section or "",
            "Subsections Heading": current_subsection or "",
            "Raw Content": " ".join(content),
            "Page Number": page_number,
            "Tables": tables,
            "Images": []
        })

    def parse_pdf(self, pdf_path):
        table_data = self.extract_tables_from_pdf(pdf_path)
        current_section = None
        current_subsection = None
        content, tables = [], []
        section_headings = []  # Stores (bbox, text) for sections
        subsection_headings = []  # Stores (bbox, text) for subsections

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                if isinstance(element, LTTextBox):
                    for line in element:
                        if isinstance(line, LTTextLine):
                            avg_font_size, font_style = self.identify_font(line)
                            text = line.get_text().strip()
                            heading_type = self.match_heading(avg_font_size, font_style)

                            if heading_type == "section":
                                if current_section or content or tables:
                                    self.add_section(current_section, current_subsection, content, tables, page_number)
                                current_section = text
                                current_subsection = None
                                section_headings.append((line.bbox, text))
                                content, tables = [], []
                            elif heading_type == "subsection":
                                if current_subsection or content or tables:
                                    self.add_section(current_section, current_subsection, content, tables, page_number)
                                current_subsection = text
                                subsection_headings.append((line.bbox, text))
                                content, tables = [], []
                            else:
                                content.append(text)

            # Assign tables to sections/subsections for the current page
            table_assignments = self.assign_tables_to_sections(page_number, table_data, section_headings, subsection_headings)
            for heading, tables_in_heading in table_assignments.items():
                if heading == current_subsection:
                    tables.extend(tables_in_heading)
                elif heading == current_section and not current_subsection:
                    tables.extend(tables_in_heading)

        # Finalize the last section or subsection
        if current_section or current_subsection or content or tables:
            self.add_section(current_section, current_subsection, content, tables, page_number)

        return self.result

config_path = "/content/config.json"
pdf_path = "/content/1807.01544v2-2.pdf"
doc_name = "Example Document"

parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

with open("/content/parsed_content_20.json", "w") as f:
    json.dump(parsed_content, f, indent=4)


In [7]:
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine, LTChar
import pdfplumber


class PDFParser:
    def __init__(self, config_path, doc_name):
        self.config = self.load_config(config_path)
        self.doc_name = doc_name
        self.result = []
        self.title = ""

    def load_config(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def identify_font(self, line):
        font_sizes = []
        font_styles = set()
        for char in line:
            if isinstance(char, LTChar):
                font_sizes.append(round(char.size))
                font_styles.add(char.fontname)
        avg_font_size = round(sum(font_sizes) / len(font_sizes)) if font_sizes else 0
        font_style = next(iter(font_styles), None) if font_styles else ""
        return avg_font_size, font_style

    def match_heading(self, avg_font_size, font_style):
        for heading_type, attributes in self.config.items():
            if avg_font_size == attributes['font_size'] and attributes['font_style'] == font_style:
                return heading_type
        return None

    def extract_tables_from_pdf(self, pdf_path):
        """Extract tables and their locations using pdfplumber."""
        table_data = {}
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables()
                if tables:
                    table_data[page_number] = [
                        {"content": table, "bbox": page.bbox} for table in tables
                    ]
        return table_data

    def assign_tables_to_sections(self, page_number, table_data, section_headings, subsection_headings):
        """Assign tables to sections/subsections based on proximity and page number."""
        assigned_tables = {}
        if page_number not in table_data:
            return assigned_tables

        for table in table_data[page_number]:
            table_bbox = table["bbox"]

            # Find the closest section or subsection heading based on bounding boxes
            closest_heading = None
            closest_distance = float("inf")
            for heading_bbox, heading_text in section_headings + subsection_headings:
                # Compute vertical distance between table and heading
                heading_y_center = (heading_bbox[1] + heading_bbox[3]) / 2
                table_y_center = (table_bbox[1] + table_bbox[3]) / 2
                distance = abs(heading_y_center - table_y_center)
                if distance < closest_distance:
                    closest_heading = heading_text
                    closest_distance = distance

            # Assign table to the closest heading
            if closest_heading:
                assigned_tables.setdefault(closest_heading, []).append(table["content"])

        return assigned_tables

    def add_section(self, current_section, current_subsection, content, tables, page_number):
        """Add a completed section or subsection to the result."""
        self.result.append({
            "Document Name": self.doc_name,
            "Title": self.title,
            "Sections Heading": current_section or "",
            "Subsections Heading": current_subsection or "",
            "Raw Content": " ".join(content),
            "Page Number": page_number,
            "Tables": tables,
            "Images": []
        })

    def parse_pdf(self, pdf_path):
        table_data = self.extract_tables_from_pdf(pdf_path)
        current_section = None
        current_subsection = None
        content, tables = [], []
        section_headings = []  # Stores (bbox, text) for sections
        subsection_headings = []  # Stores (bbox, text) for subsections

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                if isinstance(element, LTTextBox):
                    for line in element:
                        if isinstance(line, LTTextLine):
                            avg_font_size, font_style = self.identify_font(line)
                            text = line.get_text().strip()
                            heading_type = self.match_heading(avg_font_size, font_style)

                            if heading_type == "section":
                                if current_section or content or tables:
                                    self.add_section(current_section, current_subsection, content, tables, page_number)
                                current_section = text
                                current_subsection = None
                                section_headings.append((line.bbox, text))
                                content, tables = [], []
                            elif heading_type == "subsection":
                                if current_subsection or content or tables:
                                    self.add_section(current_section, current_subsection, content, tables, page_number)
                                current_subsection = text
                                subsection_headings.append((line.bbox, text))
                                content, tables = [], []
                            else:
                                content.append(text)

            # Assign tables to sections/subsections for the current page
            table_assignments = self.assign_tables_to_sections(page_number, table_data, section_headings, subsection_headings)
            for heading, tables_in_heading in table_assignments.items():
                if heading == current_subsection:
                    tables.extend(tables_in_heading)
                elif heading == current_section and not current_subsection:
                    tables.extend(tables_in_heading)

        # Finalize the last section or subsection
        if current_section or current_subsection or content or tables:
            self.add_section(current_section, current_subsection, content, tables, page_number)

        return self.result


config_path = "/content/config.json"
pdf_path = "/content/1807.01544v2-2.pdf"
doc_name = "Example Document"

parser = PDFParser(config_path, doc_name)
parsed_content = parser.parse_pdf(pdf_path)

with open("/content/parsed_content_21.json", "w") as f:
    json.dump(parsed_content, f, indent=4)



In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine

class DebugPDFParser(PDFParser):
    def visualize_page(self, pdf_path, page_number, tables, headings):
        """Visualize tables and headings on a PDF page."""
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            im = page.to_image()

            # Create a Matplotlib figure
            fig, ax = plt.subplots(1, figsize=(10, 15))
            ax.imshow(im.original)

            # Draw table bounding boxes
            for table in tables:
                x0, y0, x1, y1 = table["bbox"]
                rect = Rectangle(
                    (x0, im.original.size[1] - y1),
                    x1 - x0,
                    y1 - y0,
                    edgecolor="blue",
                    facecolor="none",
                    linewidth=2,
                    label="Table" if "Table" not in [p.get_label() for p in ax.patches] else ""
                )
                ax.add_patch(rect)

            # Draw heading bounding boxes
            for bbox, text in headings:
                x0, y0, x1, y1 = bbox
                rect = Rectangle(
                    (x0, im.original.size[1] - y1),
                    x1 - x0,
                    y1 - y0,
                    edgecolor="red",
                    facecolor="none",
                    linewidth=2,
                    label="Heading" if "Heading" not in [p.get_label() for p in ax.patches] else ""
                )
                ax.add_patch(rect)

            ax.legend(loc="upper right")
            plt.title(f"Visualization for Page {page_number}")
            plt.show()

    def debug_parse_pdf(self, pdf_path):
        table_data = self.extract_tables_from_pdf(pdf_path)
        section_headings = []  # (bbox, text)
        subsection_headings = []  # (bbox, text)

        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1):
            for element in page_layout:
                if isinstance(element, LTTextBox):
                    for
