# Marker Demo: Research Paper Block Extraction & Exploration

Self-contained, Google Colab-ready.

- Upload a PDF, run Marker to extract JSON structure
- Flatten and filter the block structure (logic ported from a TypeScript implementation)
- Explore block types, metadata, tables, and figures
- No LLM-based summarization or captioning included


In [None]:
# 1. Install marker-pdf and dependencies
# The leading `!` runs the command in a shell rather than as Python;
# `--quiet` reduces pip's console output.
!pip install --quiet marker-pdf

In [None]:
# 2. Upload a PDF
from google.colab import files

# Opens Colab's upload widget; the result is iterated below as a mapping whose
# keys are the uploaded file names.
uploaded = files.upload()

# A cancelled upload yields an empty result; fail with a readable message
# instead of a bare StopIteration from next().
if not uploaded:
    raise RuntimeError('No file was uploaded. Re-run this cell and choose a PDF.')

# Only the first uploaded file is processed by the rest of the notebook.
pdf_path = next(iter(uploaded))

In [None]:
# 3. Run Marker to extract JSON structure
import os
output_dir = 'marker_output'
os.makedirs(output_dir, exist_ok=True)
json_out = os.path.join(output_dir, os.path.splitext(os.path.basename(pdf_path))[0] + '_structure.json')

!marker_single "{pdf_path}" --output_format json --output_dir "{output_dir}"

In [None]:
# 4. Load the Marker JSON
import json

# Decode as UTF-8 explicitly so the result does not depend on the platform's
# locale default encoding.
with open(json_out, 'r', encoding='utf-8') as f:
    marker_json = json.load(f)

In [None]:
# 5. Data models and flattening utilities (Python port of the TypeScript implementation)
from typing import List, Dict, Any

class SimplifiedBlock:
    """Flat record describing one extracted block.

    Holds four plain attributes: the block's type name, its extracted
    content, the page number it belongs to, and its bounding box.
    """

    def __init__(self, type: str, content: str, page: int, bbox: list):
        # `type` shadows the builtin, but it is part of the keyword interface
        # callers use, so the name is kept.
        self.type, self.content = type, content
        self.page, self.bbox = page, bbox

    def as_dict(self):
        """Return the four attributes as a dict, in declaration order."""
        return {
            field: getattr(self, field)
            for field in ('type', 'content', 'page', 'bbox')
        }

import html

def decode_html_entities(text: str) -> str:
    """Return *text* with HTML character references (e.g. ``&amp;``) decoded."""
    decoded = html.unescape(text)
    return decoded

import re

# Matches any HTML/XML tag; compiled once instead of on every block.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def _page_index_from_id(block: Dict[str, Any], default: int) -> int:
    """Return the zero-based page index stored in ``block['id']``.

    The index is read from the third '/'-separated field of the id. If the id
    is missing, not a string, too short, or non-numeric, *default* is returned.
    """
    block_id = block.get('id')
    if not isinstance(block_id, str):
        return default
    try:
        return int(block_id.split('/')[2])
    except (IndexError, ValueError):
        return default


def _extract_block_content(block: Dict[str, Any]) -> str:
    """Return the display content for one block, with HTML entities decoded.

    Priority: first value of a non-empty ``images`` dict, then raw HTML for
    ``Table`` blocks, then tag-stripped HTML for everything else.
    """
    images = block.get('images')
    if images and isinstance(images, dict):
        raw = next(iter(images.values())) or ''
    elif block.get('block_type') == 'Table':
        # Tables keep their markup so they can be rendered or parsed later.
        raw = (block.get('html') or '').strip()
    elif block.get('html'):
        # Tags become spaces so adjacent text runs do not fuse together.
        raw = _HTML_TAG_RE.sub(' ', block['html']).strip()
    else:
        raw = ''
    return decode_html_entities(raw)


def flatten_marker_json(blocks: List[Dict[str, Any]], page_number: int = 0) -> List[SimplifiedBlock]:
    """Flatten a nested Marker block tree into a list of ``SimplifiedBlock``.

    Blocks are emitted depth-first, parent before children. ``Page`` blocks
    are not emitted themselves; only their descendants are.

    Args:
        blocks: Block dicts as found under a ``children`` key of Marker's
            JSON output. ``None`` is treated as an empty list.
        page_number: Zero-based page index used for blocks whose ``id`` does
            not yield a page index.

    Returns:
        The flattened blocks; each ``page`` attribute is 1-based.
    """
    flat_blocks = []
    for block in blocks or []:
        page_index = _page_index_from_id(block, page_number)
        children = block.get('children') or []

        # Skip Page blocks but process their children
        if block.get('block_type') == 'Page':
            flat_blocks.extend(flatten_marker_json(children, page_index))
            continue

        flat_blocks.append(SimplifiedBlock(
            type=block.get('block_type', ''),
            content=_extract_block_content(block),
            page=page_index + 1,
            bbox=block.get('bbox', [0, 0, 0, 0]),
        ))

        # Children inherit the zero-based index. Handing down the 1-based page
        # would add one more page for every nesting level without an id.
        if children:
            flat_blocks.extend(flatten_marker_json(children, page_index))
    return flat_blocks

def filter_and_flatten_marker_json(blocks: List[Dict[str, Any]], page_number: int = 0) -> List[SimplifiedBlock]:
    """Flatten *blocks* and drop structural or empty entries.

    A flattened block is kept only if its type is not one of the excluded
    grouping/boilerplate types and its content is non-empty.
    """
    excluded_types = {
        'TableCell', 'TableGroup', 'FigureGroup', 'ListGroup', 'Reference',
        'PageFooter', 'PageHeader', 'Footnote'
    }
    kept = []
    for candidate in flatten_marker_json(blocks, page_number):
        if candidate.type in excluded_types:
            continue
        if not candidate.content:
            continue
        kept.append(candidate)
    return kept

In [None]:
# 6. Flatten and filter the Marker output
# The document's top-level 'children' list is the input; if the key is missing,
# an empty list is passed and the result is an empty list.
flat_blocks = filter_and_flatten_marker_json(marker_json.get('children', []))

In [None]:
# 7. Explore block types and content
import pandas as pd

# Name the columns explicitly so the frame still has a 'type' column when no
# blocks survived filtering (otherwise df['type'] raises KeyError).
df = pd.DataFrame(
    [b.as_dict() for b in flat_blocks],
    columns=['type', 'content', 'page', 'bbox'],
)
print('Block types found:', df['type'].unique())
df.head(20)  # Show first 20 blocks

In [None]:
# 8. Simple metadata extraction (title, authors, abstract)
def extract_metadata(blocks: List[SimplifiedBlock]):
    """Pick the first title, authors and abstract content from *blocks*.

    Matching is done on the lower-cased block type only: title must equal
    'title' or 'main_title'; authors/abstract match when the type contains
    'author' / 'abstract'. A field with no matching block is ''.
    """
    def first_content(type_matches):
        # Content of the first block whose lower-cased type satisfies the test.
        for blk in blocks:
            if type_matches(blk.type.lower()):
                return blk.content
        return ''

    return {
        'title': first_content(lambda kind: kind in {'title', 'main_title'}),
        'authors': first_content(lambda kind: 'author' in kind),
        'abstract': first_content(lambda kind: 'abstract' in kind),
    }

# Run the extraction on the flattened blocks and print the resulting dict.
metadata = extract_metadata(flat_blocks)
print('Extracted Metadata:', metadata)

In [None]:
# 9. Find and display all tables and figures (with extensibility for custom processing)
# Tables are blocks typed 'Table'; figures are blocks typed 'Figure' or 'Picture'.
tables = [blk for blk in flat_blocks if blk.type == 'Table']
figures = [blk for blk in flat_blocks if blk.type in ('Figure', 'Picture')]

print(f'Found {len(tables)} tables and {len(figures)} figures.')

# Example: Show first table's HTML (for further processing)
if tables:
    # Imported inside the branch, so these names are only bound by this cell
    # when at least one table was found.
    from IPython.display import display, HTML
    print('First table HTML:')
    # The flattening step keeps a Table block's HTML markup as its content,
    # so it can be rendered directly.
    display(HTML(tables[0].content))

# Example: Show first figure as image (if base64-encoded)
import base64
from IPython.display import Image

def show_base64_image(b64str):
    """Decode a base64 string and display it inline as an image.

    Best-effort helper: any failure while importing, decoding or rendering is
    reported with a printed message instead of being raised.

    Args:
        b64str: Base64-encoded image data.
    """
    try:
        # Imported locally because the notebook otherwise binds `display` only
        # inside the `if tables:` branch, which may never run.
        from IPython.display import Image, display

        display(Image(data=base64.b64decode(b64str)))
    except Exception as e:  # deliberately broad: this is a display-only helper
        print('Could not display image:', e)

if figures:
    # Presumably the content is base64 image data taken from the block's
    # 'images' mapping; if it is not, show_base64_image prints a message
    # rather than raising.
    print('First figure (if image):')
    show_base64_image(figures[0].content)

In [None]:
# 10. (Optional) Extensible: Add your own logic to process tables/figures, e.g., send table HTML to a model, extract captions, etc.
# (No LLM-based summarization or captioning included)

In [None]:
# 11. Save flattened blocks for further analysis
# orient='records' writes a JSON array with one object per DataFrame row.
df.to_json('flattened_blocks.json', orient='records', indent=2)
# Already imported in step 2; repeating the import here is harmless.
from google.colab import files
# Hand the saved file to the browser as a download.
files.download('flattened_blocks.json')