In [0]:
%pip list

In [0]:
%pip install marker-pdf==1.9.0

In [0]:
# Restart the Python process after the %pip install in the previous cell --
# presumably so the imports in the next cell pick up the newly installed
# marker-pdf package (NOTE(review): confirm against Databricks docs).
dbutils.library.restartPython()

In [0]:
# Imports for the conversion
import torch
from marker.models import create_model_dict
from marker.converters.pdf import PdfConverter
from marker.output import text_from_rendered
import os
import transformers

# Step 1 - report the library versions in use (handy when debugging the environment).
print(f"--- Using Transformers Version: {transformers.__version__} ---")
print(f"--- Using Torch Version: {torch.__version__} ---")

# Step 2 - location of the input PDF inside Databricks Volumes.
pdf_path = "/Volumes/renjiharold_demo/sandpit/marker-pdf-test/docs/thinkpython.pdf"
print(f"Target PDF path: {pdf_path}")

# The conversion only runs when the input file is present; otherwise an error
# message is printed and nothing else in this cell executes.
if os.path.exists(pdf_path):
    print(f"✅ File found at {pdf_path}")

    # Step 3 - choose the compute device: CUDA when torch reports it, else CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device == 'cuda':
        print(f"✅ GPU (CUDA) is available! Using device: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️ WARNING: GPU not found. Falling back to CPU.")

    # Step 4 - build the model dictionary on the chosen device.
    print("\nLoading models... (This may take a few minutes on first run)")
    model_dict = create_model_dict(device=device)

    # Step 5 - create the converter from the loaded models.
    converter = PdfConverter(artifact_dict=model_dict)

    # Step 6 - convert the PDF.
    rendered_doc = converter(pdf_path)

    # Step 7 - unpack the rendered document; the first element is the text
    # (markdown), the middle element is discarded, the last is kept as `images`.
    text_output, _, images = text_from_rendered(rendered_doc)

    print("\n--- ✅ CONVERSION SUCCESSFUL ---")

    # Step 8 - preview only the first 2000 characters of the extracted text.
    print("\n--- Start of Extracted Text (Markdown) ---")
    print(text_output[:2000])
    print("\n--- End of Extracted Text ---")
else:
    print(f"❌ ERROR: File not found at {pdf_path}")