<a href="https://colab.research.google.com/github/srinath-96/Pdf2Markdown/blob/main/CrewAI_PDF_to_Markdown_(Colab_V2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
################################################################################
# CrewAI PDF to Markdown Converter - Colab Notebook (Initial Version)
################################################################################

# @title 1. Install Dependencies
# Run this cell first to install necessary libraries.
!pip install crewai crewai[tools] google-generativeai python-dotenv pypdfium2 markdownify Pillow pytesseract pdf2image

# For OCR capabilities with Tesseract (run this if you plan to use OCR)
# You might need to restart the runtime after running this.
!sudo apt-get update
!sudo apt-get install tesseract-ocr
!sudo apt-get install libtesseract-dev



Collecting crewai
  Downloading crewai-0.120.1-py3-none-any.whl.metadata (33 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting pypdfium2
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting markdownify
  Downloading markdownify-1.1.0-py3-none-any.whl.metadata (9.1 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting appdirs>=1.4.4 (from crewai)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting auth0-python>=4.7.1 (from crewai)
  Downloading auth0_python-4.9.0-py3-none-any.whl.metadata (9.0 kB)
Collecting chromadb>=0.5.23 (from crewai)
  Downloading chromadb-1.0.9-cp39-abi3-manylin

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,930 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,675 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,721 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 http://archive.ubuntu.com/ubuntu jamm

In [2]:
# @title 2. Import Libraries and Setup API Key
import os
import re
from dotenv import load_dotenv
import google.generativeai as genai
from crewai import Agent, Task, Crew, Process,LLM
from crewai.tools import BaseTool
from google.colab import userdata
# For PDF processing
import pypdfium2 as pdfium

# For OCR (if used)
from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# For potential HTML to Markdown conversion (useful if PDF elements are HTML-like)
from markdownify import markdownify as md


api_key = userdata.get('GEMINI_API_KEY') # <--- IMPORTANT: REPLACE WITH YOUR ACTUAL KEY or use Colab Secrets


llm = LLM(
    model="gemini/gemini-2.0-flash", # Or other compatible models
    api_key= api_key# generation_config=genai.types.GenerationConfig(...) # Optional
)




In [4]:
from typing import ClassVar

In [5]:
# @title 3. Define Custom Tools

class PDFProcessingTool(BaseTool):
    name: str = "PDF Content Extractor"
    description: str = (
        "Extracts text content from a given PDF file. "
        "Input should be the path to the PDF file. "
        "Automatically attempts OCR on pages with minimal text. "
        "OCR can also be forced for specific pages or all pages."
    )
    # Define a threshold for considering a page as potentially image-based
    # CORRECTED: Added ClassVar type annotation
    MIN_TEXT_LENGTH_FOR_NO_OCR: ClassVar[int] = 50 # If text length is less than this, consider OCR

    def _run(self, pdf_file_path: str, force_ocr_all_pages: bool = False, force_ocr_pages: list = None) -> str:
        """
        Extracts text from a PDF, attempting OCR on image-like pages.
        Args:
            pdf_file_path: Path to the PDF file.
            force_ocr_all_pages: Boolean, if True, forces OCR on all pages.
            force_ocr_pages: Optional list of page numbers (0-indexed) to force OCR on.
        Returns:
            Extracted text content as a single string.
        """
        if not os.path.exists(pdf_file_path):
            return "Error: PDF file not found at the specified path."

        full_text = []
        try:
            pdf = pdfium.PdfDocument(pdf_file_path)
            n_pages = len(pdf)

            for i in range(n_pages):
                page_text_content = ""
                perform_ocr_on_this_page = False

                # 1. Try direct text extraction
                textpage = pdf.get_page(i)
                extracted_text_direct = str(textpage.get_textpage().get_text_range())
                page_text_content = extracted_text_direct

                # 2. Decide if OCR is needed for this page
                if force_ocr_all_pages:
                    perform_ocr_on_this_page = True
                    print(f"Forcing OCR on page {i+1} as per force_ocr_all_pages flag.")
                elif force_ocr_pages and i in force_ocr_pages:
                    perform_ocr_on_this_page = True
                    print(f"Forcing OCR on page {i+1} as per force_ocr_pages list.")
                elif len(extracted_text_direct.strip()) < self.MIN_TEXT_LENGTH_FOR_NO_OCR:
                    # If direct text is too short, it might be an image page or mostly image
                    page_objects = textpage.get_objects()
                    if len(page_objects) > 0: # A simple check, could be more sophisticated
                        print(f"Page {i+1} has minimal direct text (length: {len(extracted_text_direct.strip())}). Attempting OCR.")
                        perform_ocr_on_this_page = True
                    else:
                        print(f"Page {i+1} has minimal direct text but also appears to have no objects. Skipping OCR.")


                # 3. Perform OCR if decided
                if perform_ocr_on_this_page:
                    try:
                        print(f"Attempting OCR on page {i+1}...")
                        images = convert_from_path(pdf_file_path, first_page=i+1, last_page=i+1, dpi=300) # Higher DPI for better OCR
                        if images:
                            pil_image = images[0].convert('L') # Convert to grayscale
                            ocr_text = pytesseract.image_to_string(pil_image, lang='eng')
                            print(f"OCR successful for page {i+1}. Length: {len(ocr_text)}")
                            # Decide whether to replace or append OCR text.
                            if len(extracted_text_direct.strip()) < self.MIN_TEXT_LENGTH_FOR_NO_OCR:
                                page_text_content = ocr_text # Replace if direct text was minimal
                            else:
                                page_text_content += "\n\n--- OCR Text ---\n" + ocr_text
                        else:
                            page_text_content += f"\n[OCR attempted but no image returned for page {i+1}]"
                    except Exception as e:
                        ocr_error_msg = f"\n[OCR error on page {i+1}: {str(e)}]"
                        page_text_content += ocr_error_msg
                        print(ocr_error_msg)

                full_text.append(f"\n--- Page {i+1} ---\n{page_text_content}")

            return "\n".join(full_text)

        except Exception as e:
            return f"Error processing PDF: {str(e)}"
pdf_tool = PDFProcessingTool()



In [6]:
# @title 4. Define Agents
pdf_analyzer_agent = Agent(
    role='PDF Content Analyst',
    goal='Accurately extract all text content from a given PDF file, utilizing OCR for image-based or scanned pages.',
    backstory="Expert in digital document processing, adept at extracting textual content from diverse PDF types, including scanned documents via OCR.",
    tools=[pdf_tool],
    llm=llm,
    verbose=True,
    allow_delegation=False,
    max_iter=5
)

structure_identifier_agent = Agent(
    role='Document Structure Semantic Analyzer',
    goal='Identify the logical structure (headings, paragraphs, lists, tables, code blocks) of extracted PDF text, which may include OCR output.',
    backstory="AI with deep understanding of document layouts and semantic meaning, able to infer structure from text, including potentially noisy OCR results.",
    llm=llm,
    verbose=True,
    allow_delegation=False,
    max_iter=10
)

markdown_converter_agent = Agent(
    role='Markdown Conversion Specialist',
    goal='Convert structurally annotated text (potentially including OCR data) into clean, well-formatted Markdown.',
    backstory="Meticulous AI excelling at generating perfect, standard-compliant Markdown from structured text, even if it contains OCR artifacts.",
    llm=llm,
    verbose=True,
    allow_delegation=False,
    max_iter=10
)
print("CrewAI Agents defined.")




CrewAI Agents defined.


In [7]:
# @title 5. Define Tasks
pdf_extraction_task = Task(
    description=(
        "Extract text content from the PDF located at '{pdf_path}'. "
        "The tool will attempt to use OCR for pages that appear to be image-based or have minimal text. "
        "Ensure all readable text, whether embedded or via OCR, is captured."
    ),
    expected_output="A single string containing all extracted text from the PDF, with page breaks noted. OCR text should be integrated where applicable.",
    agent=pdf_analyzer_agent,
)

structure_identification_task = Task(
    description=(
        "Analyze the provided text (output of PDF extraction, possibly containing OCR'd content) and identify its logical structure. "
        "Determine headings (H1, H2, H3), paragraphs, lists, and any apparent code blocks or tables. "
        "Output should be the original text with clear annotations or a structured format "
        "(e.g., <H1>Title</H1><P>Paragraph.</P><LIST_ITEM>Item</LIST_ITEM>)."
    ),
    expected_output=(
        "The original text content, annotated or structured to clearly define elements. "
        "For example: '## Main Title\\n\\nThis is a paragraph.\\n\\n* Item 1'"
    ),
    agent=structure_identifier_agent,
    context=[pdf_extraction_task]
)

markdown_conversion_task = Task(
    description=(
        "Take the structurally annotated text (which may include OCR results) and convert it into well-formatted Markdown. "
        "Use appropriate syntax for headings, paragraphs, lists, code blocks. "
        "Represent tables/equations as placeholders (e.g., '[TABLE_DATA_HERE]')."
    ),
    expected_output="A single string containing the final, clean Markdown representation of the document.",
    agent=markdown_converter_agent,
    context=[structure_identification_task]
)
print("CrewAI Tasks defined.")

CrewAI Tasks defined.


In [8]:
# @title 6. Create and Run the Crew

# --- Define the Crew ---
pdf_to_markdown_crew = Crew(
    agents=[pdf_analyzer_agent, structure_identifier_agent, markdown_converter_agent],
    tasks=[pdf_extraction_task, structure_identification_task, markdown_conversion_task],
    process=Process.sequential,
    verbose=True,
    memory=False
)

def process_pdf(input_pdf_path: str) -> str: # Returns path to the markdown file or an error string
    """
    Processes a PDF file using the CrewAI setup, saves the output to a Markdown file.
    Args:
        input_pdf_path (str): The path to the input PDF file.
    Returns:
        str: The path to the generated Markdown file, or an error message string.
    """
    if not os.path.exists(input_pdf_path):
        return "Error: Input PDF file not found."

    print(f"Starting PDF to Markdown conversion for: {input_pdf_path}")
    inputs = {'pdf_path': input_pdf_path}
    markdown_content_str = "" # Initialize
    try:
        # Kick off the crew's process
        crew_output = pdf_to_markdown_crew.kickoff(inputs=inputs)
        print("\n--- Crew Execution Summary ---")
        print(f"Input PDF: {input_pdf_path}")

        # Extract the raw string output from the crew's result
        if hasattr(crew_output, 'raw_output') and isinstance(crew_output.raw_output, str):
            markdown_content_str = crew_output.raw_output
        elif isinstance(crew_output, str):
            markdown_content_str = crew_output
        else:
            print(f"Warning: Unexpected output type from crew.kickoff(): {type(crew_output)}")
            markdown_content_str = str(crew_output) # Fallback

        if not markdown_content_str.strip() or "Error:" in markdown_content_str : # Check if content is empty or an error message
             print(f"No valid markdown content generated or error in content: {markdown_content_str[:200]}") # Log snippet
             return f"Error: No valid markdown content from crew. Output: {markdown_content_str[:200]}"


        # Generate output Markdown filename
        base_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
        output_md_filename = f"{base_name}_output.md"
        # In Colab, save to /content/ directory
        output_md_filepath = f"/content/{output_md_filename}"

        # Save the markdown content to a file
        with open(output_md_filepath, "w", encoding="utf-8") as f:
            f.write(markdown_content_str)
        print(f"Markdown content saved to: {output_md_filepath}")

        return output_md_filepath # Return the path to the saved file

    except Exception as e:
        print(f"An error occurred during crew execution or file saving: {e}")
        import traceback
        traceback.print_exc()
        return f"Error during conversion process: {str(e)}"

        return markdown_output
    except Exception as e:
        print(f"An error occurred during crew execution: {e}")

        return f"Error during conversion: {str(e)}"



In [11]:
# @title 7. Upload PDF and Run
from google.colab import files
import time

uploaded_pdf_path = None

print("Please upload a PDF file.")
uploaded = files.upload()

if uploaded:
    for fn in uploaded.keys():
        print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')
        uploaded_pdf_path = f"/content/{fn}"
        with open(uploaded_pdf_path, 'wb') as f:
            f.write(uploaded[fn])
        print(f"File saved to: {uploaded_pdf_path}")
        break # Process the first uploaded file
else:
    print("No file uploaded.")

if uploaded_pdf_path:
    print("\nStarting PDF processing...")

    time.sleep(1)


    final_markdown = process_pdf(uploaded_pdf_path)

    print("\n\n--- FINAL MARKDOWN OUTPUT ---")
    print(final_markdown)


else:
    print("Skipping processing as no PDF was uploaded or path is not set.")



Please upload a PDF file.


Saving CamScanner 02-24-2024 21.10.pdf to CamScanner 02-24-2024 21.10.pdf
User uploaded file "CamScanner 02-24-2024 21.10.pdf" with length 551684 bytes
File saved to: /content/CamScanner 02-24-2024 21.10.pdf

Starting PDF processing...
Starting PDF to Markdown conversion for: /content/CamScanner 02-24-2024 21.10.pdf


[1m[95m# Agent:[00m [1m[92mPDF Content Analyst[00m
[95m## Task:[00m [92mExtract text content from the PDF located at '/content/CamScanner 02-24-2024 21.10.pdf'. The tool will attempt to use OCR for pages that appear to be image-based or have minimal text. Ensure all readable text, whether embedded or via OCR, is captured.[00m




[1m[95m# Agent:[00m [1m[92mPDF Content Analyst[00m
[95m## Thought:[00m [92mI need to extract the text content from the given PDF file. The tool will automatically handle OCR where necessary.[00m
[95m## Using tool:[00m [92mPDF Content Extractor[00m
[95m## Tool Input:[00m [92m
"{\"pdf_file_path\": \"/content/CamScanner 02-24-2024 21.10.pdf\", \"force_ocr_all_pages\": false, \"force_ocr_pages\": []}"[00m
[95m## Tool Output:[00m [92m
Error processing PDF: object of type 'generator' has no len()


You ONLY have access to the following tools, and should NEVER make up tools that are not listed here:

Tool Name: PDF Content Extractor
Tool Arguments: {'pdf_file_path': {'description': None, 'type': 'str'}, 'force_ocr_all_pages': {'description': None, 'type': 'bool'}, 'force_ocr_pages': {'description': None, 'type': 'list'}}
Tool Description: Extracts text content from a given PDF file. Input should be the path to the PDF file. Automatically attempts OCR on pages with minim



Forcing OCR on page 1 as per force_ocr_all_pages flag.
Attempting OCR on page 1...

[OCR error on page 1: Unable to get page count. Is poppler installed and in PATH?]


[1m[95m# Agent:[00m [1m[92mPDF Content Analyst[00m
[95m## Thought:[00m [92mThe tool returned an error. I will try forcing OCR on all pages to see if that resolves the issue.[00m
[95m## Using tool:[00m [92mPDF Content Extractor[00m
[95m## Tool Input:[00m [92m
"{\"pdf_file_path\": \"/content/CamScanner 02-24-2024 21.10.pdf\", \"force_ocr_all_pages\": true, \"force_ocr_pages\": []}"[00m
[95m## Tool Output:[00m [92m

--- Page 1 ---

[OCR error on page 1: Unable to get page count. Is poppler installed and in PATH?][00m






[1m[95m# Agent:[00m [1m[92mPDF Content Analyst[00m
[95m## Final Answer:[00m [92m
The tool is unable to process the PDF file due to an error with the OCR engine or the file itself.[00m




[1m[95m# Agent:[00m [1m[92mDocument Structure Semantic Analyzer[00m
[95m## Task:[00m [92mAnalyze the provided text (output of PDF extraction, possibly containing OCR'd content) and identify its logical structure. Determine headings (H1, H2, H3), paragraphs, lists, and any apparent code blocks or tables. Output should be the original text with clear annotations or a structured format (e.g., <H1>Title</H1><P>Paragraph.</P><LIST_ITEM>Item</LIST_ITEM>).[00m


[1m[95m# Agent:[00m [1m[92mDocument Structure Semantic Analyzer[00m
[95m## Final Answer:[00m [92m
I am unable to process the PDF file.[00m




[1m[95m# Agent:[00m [1m[92mMarkdown Conversion Specialist[00m
[95m## Task:[00m [92mTake the structurally annotated text (which may include OCR results) and convert it into well-formatted Markdown. Use appropriate syntax for headings, paragraphs, lists, code blocks. Represent tables/equations as placeholders (e.g., '[TABLE_DATA_HERE]').[00m


[1m[95m# Agent:[00m [1m[92mMarkdown Conversion Specialist[00m
[95m## Final Answer:[00m [92m
Okay, I'm ready. Please provide the structured text you want me to convert to Markdown. I will do my best to produce clean, standard-compliant Markdown. Just paste the text here, and I'll get started![00m





--- Crew Execution Summary ---
Input PDF: /content/CamScanner 02-24-2024 21.10.pdf
Markdown content saved to: /content/CamScanner 02-24-2024 21.10_output.md


--- FINAL MARKDOWN OUTPUT ---
/content/CamScanner 02-24-2024 21.10_output.md
