In [43]:
from google.colab import drive

# Expose Google Drive inside the Colab filesystem; later cells read from it.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
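# One-time setup, intentionally left commented out: installs the dependencies
# into a Drive folder so that later sessions can add that folder to sys.path
# instead of reinstalling. Uncomment and run once when the folder is missing.
#lib_path = "/content/drive/MyDrive/pdf2json_libs"

#!pip install --target=$lib_path marker-pdf torch
#!pip install --target=$lib_path --force-reinstall sympy==1.12
#!pip install --target=$lib_path PyMuPDF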

In [46]:
# Drive folder holding the packages installed with `pip --target`.
# Spelled "MyDrive" to match the install cell and the PDF path used below.
lib_path = "/content/drive/MyDrive/pdf2json_libs"

import sys

# Re-running this cell must not stack duplicate entries on sys.path.
if lib_path not in sys.path:
    sys.path.append(lib_path)

In [47]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

# PDF to convert, read from the mounted Drive.
file_path = "/content/drive/MyDrive/finaltest.pdf"

# Build the model artifacts first, then hand them to the converter.
model_artifacts = create_model_dict()
converter = PdfConverter(artifact_dict=model_artifacts)

# Run the conversion and unpack the rendered result; only the text and the
# images are kept under meaningful names.
rendered = converter(file_path)
text, _, images = text_from_rendered(rendered)

Loaded layout model datalab-to/surya_layout on device cuda with dtype torch.float16
Loaded texify model datalab-to/texify on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Loaded table recognition model datalab-to/surya_tablerec on device cuda with dtype torch.float16
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16


Recognizing layout: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s]
Running OCR Error Detection: 100%|██████████| 2/2 [00:00<00:00, 69.86it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 0it [00:00, ?it/s]


In [48]:
# Save the converted Markdown into the current working directory.
output_md = "output(md).md"
with open(output_md, mode="w", encoding="utf-8") as markdown_out:
    markdown_out.write(text)

print(f"Markdown output saved as {output_md}")

Markdown output saved as output(md).md


In [49]:
def extract_images_from_pdf(pdf_path, output_dir="images"):
    """Write every embedded image of a PDF to disk and return the file paths.

    Pages are visited in order and, within a page, images are taken in the
    order ``page.get_images()`` reports them. Each file is named
    ``page_<page index>_img_<image index>.<ext>`` with zero-based indices,
    where ``<ext>`` is the image type reported by PyMuPDF (falling back to
    ``png`` when none is reported), so the extension matches the bytes written.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory that receives the image files; created if missing.

    Returns:
        List of the written file paths, in extraction order.
    """
    import os

    # Imported locally so the function does not depend on a later cell
    # having imported PyMuPDF before it is called.
    import fitz

    os.makedirs(output_dir, exist_ok=True)
    image_paths = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            for img_index, img in enumerate(page.get_images()):
                xref = img[0]  # first tuple item is the image's xref number
                base_image = doc.extract_image(xref)
                # The raw bytes are stored in their native format (often JPEG),
                # so use the reported type rather than a fixed ".png".
                ext = base_image.get("ext") or "png"
                image_path = f"{output_dir}/page_{page_num}_img_{img_index}.{ext}"
                with open(image_path, "wb") as f:
                    f.write(base_image["image"])
                image_paths.append(image_path)

    return image_paths

In [51]:
import fitz
import re
import json
from datetime import datetime

def format_pdf_date(pdf_date):
    """Convert a PDF date string to ``DD-MM-YYYY``.

    PDF dates look like ``D:YYYYMMDDHHmmSS`` optionally followed by a time-zone
    suffix. Only the year is mandatory; missing trailing fields default to
    month 01, day 01 and time 00:00:00, so truncated values such as
    ``D:20240315`` or ``D:2024`` are accepted.

    Args:
        pdf_date: Raw date string from the PDF metadata (may be empty or None).

    Returns:
        The date as ``DD-MM-YYYY``; ``"Unknown"`` when the value is empty or
        lacks the ``D:`` prefix; ``"Invalid Date Format"`` when the prefix is
        present but no valid date follows it.
    """
    if not (pdf_date and pdf_date.startswith("D:")):
        return "Unknown"

    # Take the year plus as many complete two-digit fields as are present;
    # anything after that (time zone, stray characters) is ignored.
    leading = re.match(r"\d{4}(?:\d{2}){0,5}", pdf_date[2:])
    if leading is None:
        return "Invalid Date Format"

    stamp = leading.group(0)
    # Pad the missing fields with their defaults so strptime always sees all
    # 14 digits and cannot mis-split a short string.
    stamp += "00000101000000"[len(stamp):]
    try:
        return datetime.strptime(stamp, "%Y%m%d%H%M%S").strftime("%d-%m-%Y")
    except ValueError:
        return "Invalid Date Format"

def markdown_to_json(md_text,pdf_path):
    """Combine a PDF's Markdown rendering and its metadata into a JSON string.

    The result has these top-level keys:

    * ``title``: first text line of the PDF's first page.
    * ``Authors``: name-like word runs found on the second text line of the
      first page, after removing the word "and" (empty when there is no
      second line).
    * ``metadata``: page count plus formatted creation/modification dates.
    * ``contents``: heading texts (Markdown levels 1-4) in document order.
    * ``text``: maps each heading to ``{"description": ..., "diagram": ...}``
      where the description is the concatenated non-heading lines below the
      heading and the diagram is the path of an extracted image.

    Args:
        md_text: Markdown text of the document.
        pdf_path: Path of the source PDF; used for metadata, title/authors and
            image extraction.

    Returns:
        The structure above serialised with ``json.dumps(..., indent=4)``.
    """
    # Read everything needed from the PDF inside the ``with`` so the document
    # handle is released even if something below fails.
    with fitz.open(pdf_path) as doc:
        md = doc.metadata
        metadatad = {
            "No. of Pages": len(doc),
            "Creation Date": format_pdf_date(md.get("creationDate", "")),
            # NOTE(review): the trailing colon in this key looks accidental, but
            # it is part of the emitted JSON, so it is kept for compatibility.
            "Last Modification Date:": format_pdf_date(md.get("modDate", ""))
        }
        # Only the first page contributes title and authors, so no other page
        # needs its text extracted.
        first_page_lines = (
            doc.load_page(0).get_text("text").split('\n') if len(doc) else []
        )

    data = {
        "title": "",
        "Authors": [],
        "metadata": metadatad,
        "contents": [],
        "text": {}
    }

    if first_page_lines:
        data["title"] = first_page_lines[0]
        # A first page with a single (or no) text line has no author line;
        # leave the list empty instead of failing on a missing index.
        if len(first_page_lines) > 1:
            auth = re.sub(r"\band\b", "", first_page_lines[1])
            data["Authors"] = re.findall(r"[A-Za-z]+(?:\s[A-Za-z]+)*", auth)

    current_section = None  # Heading that subsequent lines are attached to
    img_num = 0  # Index of the next extracted image to hand out
    img_paths = extract_images_from_pdf(pdf_path)

    lines = md_text.split("\n")
    for i, line in enumerate(lines):
        # The first Markdown line is always skipped
        # (presumably the document title; confirm against the converter output).
        if i == 0:
            continue

        line = line.strip()

        # One to four '#' followed by a space marks a heading.
        heading = re.match(r"#{1,4} ", line)
        if heading:
            current_section = line[heading.end():].strip()
            data["contents"].append(current_section)
            data["text"][current_section] = {"description": "", "diagram": ""}

        # Image reference: attach the next extracted image to the section.
        elif line.startswith("![](") and ")" in line:
            img_url = re.findall(r'!\[\]\((.*?)\)', line)
            # The Markdown may reference more images than were extracted from
            # the PDF; once the extracted ones run out, leave "diagram" as is.
            if img_url and current_section and img_num < len(img_paths):
                data["text"][current_section]["diagram"] = img_paths[img_num]
                img_num += 1

        # Any other non-empty line is description text for the current section.
        elif line and current_section:
            data["text"][current_section]["description"] += line + " "

    return json.dumps(data, indent=4)

# Build the JSON document from the Markdown produced by the converter and the
# same PDF it was generated from.
markdown_text = text

json_output = markdown_to_json(md_text=markdown_text, pdf_path=file_path)


In [52]:
# Save the JSON document into the current working directory.
output_filename = "output(json).json"
with open(output_filename, mode="w", encoding="utf-8") as json_out:
    json_out.write(json_output)

print(f"JSON output saved as {output_filename}")

JSON output saved as output(json).json


In [53]:
import shutil

# Zip the extracted-images folder; the archive path is the cell's output.
folder_to_zip = "/content/images"
zip_filename = "/content/images.zip"
# make_archive appends the extension itself, so pass the name without ".zip".
archive_base = zip_filename.replace(".zip", "")
shutil.make_archive(base_name=archive_base, format='zip', root_dir=folder_to_zip)

'/content/images.zip'

In [55]:
from google.colab import files
import shutil

# Bundle the extracted images so they travel as a single download.
folder_to_zip = "/content/images"
zip_filename = "/content/images.zip"
# make_archive appends ".zip" itself and returns the path of the archive it
# wrote; download that path rather than assuming it.
archive_path = shutil.make_archive(zip_filename.replace(".zip", ""), 'zip', folder_to_zip)

# Download exactly the files the earlier cells wrote (output_md and
# output_filename) instead of repeating their names under a hard-coded
# "/content/" prefix that only matches while that is the working directory.
files.download(archive_path)
files.download(output_md)
files.download(output_filename)

print("Files Downloaded Successfully!!!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Files Downloaded Successfully!!!
