# In [None]:
import pdfplumber
from collections import defaultdict

class SectionContainer:
    """Pairs a piece of parsed text with the metadata dict describing it."""

    def __init__(self, content: str, metadata: dict):
        self.content, self.metadata = content, metadata

    def __repr__(self):
        # Report only the text length so the repr stays short for long content.
        size = len(self.content)
        return f"SectionContainer(metadata={self.metadata}, content_length={size})"

def analyze_pdf_layout(pdf_path: str, space_threshold: float = 1.0):
    """Analyzes the layout of a PDF to extract text, font size, and font style.

    Characters of each page are bucketed into lines by their rounded ``top``
    coordinate, ordered left to right by ``x0`` and joined into a string. A
    single space is inserted between two neighbouring characters whenever the
    next character starts more than ``space_threshold`` past the end (``x1``)
    of the current one.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.
        space_threshold: Horizontal gap, in the same units as the characters'
            ``x0``/``x1`` values, above which a space is inserted. Defaults to
            ``1.0``, the value that was previously hard-coded.

    Returns:
        A list with one entry per page. Each entry is a list of dicts, one per
        non-blank line in top-to-bottom order, with the keys ``"text"``,
        ``"font_size"`` (rounded size of the line's left-most character, ``0``
        if it has none), ``"font_style"`` (that character's ``fontname``, or
        ``"Unknown"``) and ``"page_number"`` (1-based).
    """
    layout_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Bucket characters into visual lines by rounded vertical position.
            lines = defaultdict(list)
            for char in page.chars:
                lines[round(char['top'])].append(char)

            page_elements = []
            for top in sorted(lines):
                # Left-to-right reading order within the line.
                chars = sorted(lines[top], key=lambda c: c['x0'])

                pieces = []
                for current, following in zip(chars, chars[1:]):
                    pieces.append(current['text'])
                    # Add a space if the next character is not immediately adjacent.
                    if following['x0'] > current['x1'] + space_threshold:
                        pieces.append(' ')
                # Every bucket holds at least one character, so [-1] is safe.
                pieces.append(chars[-1]['text'])

                line_content = ''.join(pieces).strip()
                if not line_content:
                    continue

                # Lines are collected in a list, not a dict keyed by their
                # text: keying by text silently merged repeated lines on a
                # page and overwrote the earlier line's font information.
                page_elements.append({
                    "text": line_content,
                    "font_size": round(chars[0].get("size", 0)),
                    "font_style": chars[0].get("fontname", "Unknown"),
                    "page_number": page_num,
                })

            layout_data.append(page_elements)
    return layout_data

def parse_pdf_content(pdf_path: str):
    """Builds one SectionContainer per (font size, font style) pair on each page.

    The line elements returned by ``analyze_pdf_layout`` are bucketed page by
    page on their ``(font_size, font_style)`` pair. The texts in each bucket
    are joined with single spaces; buckets whose joined text is three
    characters or shorter are dropped. Each container's metadata holds
    ``font_size``, ``font_style``, ``page_number`` and ``content_length``.
    """
    sections = []

    for page_elements in analyze_pdf_layout(pdf_path):
        # Bucket this page's lines by their font signature.
        by_font = {}
        for item in page_elements:
            signature = (item["font_size"], item["font_style"])
            by_font.setdefault(signature, []).append(item)

        for (size, style), members in by_font.items():
            text = " ".join(member["text"] for member in members)
            if len(text) <= 3:
                # Skip very short sections.
                continue
            info = {
                "font_size": size,
                "font_style": style,
                "page_number": members[0]["page_number"],
                "content_length": len(text),
            }
            sections.append(SectionContainer(content=text, metadata=info))

    return sections

def group_by_top_font_sizes(parsed_sections, top_n=5):
    """Keeps sections in the ``top_n`` largest font sizes, nested by size then style.

    Returns a ``defaultdict`` mapping font size to a ``defaultdict`` mapping
    font style to a list of dicts with the keys ``content``, ``page_number``
    and ``content_length``, in the order the sections were given.
    """
    distinct_sizes = {sec.metadata["font_size"] for sec in parsed_sections}
    largest = sorted(distinct_sizes, reverse=True)[:top_n]

    result = defaultdict(lambda: defaultdict(list))
    for sec in parsed_sections:
        meta = sec.metadata
        size = meta["font_size"]
        if size not in largest:
            continue
        entry = {
            "content": sec.content,
            "page_number": meta["page_number"],
            "content_length": meta["content_length"],
        }
        result[size][meta["font_style"]].append(entry)

    return result

def save_grouped_sections_to_file(grouped_sections, output_file):
    """Writes the grouped sections as a plain-text report to ``output_file``.

    Font sizes are written in descending order. Under each size every font
    style lists its sections: the page number, followed by the content itself
    when its recorded length is at most 150, otherwise just that length.
    """
    size_rule = "=" * 50 + "\n"
    style_rule = "  " + "-" * 45 + "\n"
    closing_rule = "-" * 50 + "\n\n"

    with open(output_file, "w", encoding="utf-8") as out:
        for size in sorted(grouped_sections, reverse=True):
            out.write(f"Font Size: {size}\n")
            out.write(size_rule)
            for style, entries in grouped_sections[size].items():
                out.write(f"  Font Style: {style}\n")
                out.write(style_rule)
                for entry in entries:
                    out.write(f"    Page Number: {entry['page_number']}\n")
                    # Long content is summarised by its length only.
                    if entry["content_length"] > 150:
                        out.write(f"    Content Length: {entry['content_length']}\n")
                    else:
                        out.write(f"    Content: {entry['content']}\n")
                    out.write("\n")
                out.write(style_rule)
            out.write(closing_rule)

# Script entry point: parse the PDF, keep the five largest font sizes and
# write the grouped report to a text file.
if __name__ == "__main__":
    pdf_path = "/content/2212.14052v3.pdf"  # Input PDF; replace with your PDF path
    output_file = "/content/output_file.txt"     # Report destination; replace with your desired output path

    # One container per (font size, font style) pair on each page.
    parsed_sections = parse_pdf_content(pdf_path)
    # NOTE: top_n=5 here must stay in sync with the "Top 5" wording printed below.
    grouped_sections = group_by_top_font_sizes(parsed_sections, top_n=5)
    save_grouped_sections_to_file(grouped_sections, output_file)

    print(f"Top 5 font sizes saved to {output_file}")
