In [2]:
# Mount Google Drive under /content/drive; later cells read the PDF and the
# Drive-hosted library folder from paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [50]:
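# One-time setup, kept commented out: uncomment to pip-install the dependencies
# into a folder on the mounted Drive (presumably so they survive across Colab
# sessions), then comment the lines out again.
#lib_path = "/content/drive/MyDrive/pdf2json_libs"

#!pip install --target=$lib_path marker-pdf torch
#!pip install --target=$lib_path --force-reinstall sympy==1.12
#!pip install --target=$lib_path PyMuPDF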

In [51]:
# Folder on the mounted Drive that holds the pip-installed dependencies.
# Spelled "MyDrive" (no space) so it is literally the same directory string as
# the install target and the PDF path used elsewhere in this notebook.
lib_path = "/content/drive/MyDrive/pdf2json_libs"

import sys
# Guard the append so re-running this cell does not pile up duplicate entries.
if lib_path not in sys.path:
    sys.path.append(lib_path)

In [52]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

# Build the marker PDF converter. create_model_dict() supplies the model
# artifacts; the cell output shows layout, texify, recognition, table
# recognition and detection models being loaded on CPU.
converter = PdfConverter(artifact_dict=create_model_dict())

# PDF to convert, read from the mounted Drive
file_path = "/content/drive/MyDrive/Attention_all_you_need.pdf"

# Run the conversion. This is the slow step: the logged run spent about two
# minutes in layout recognition alone.
rendered = converter(file_path)

# Unpack the result: `text` is the Markdown string written out by a later cell
# and `images` is a dict of file name -> image object (later cells call
# .items() on it and .save() on its values). The middle value is not used.
text, _, images = text_from_rendered(rendered)


Loaded layout model datalab-to/surya_layout on device cpu with dtype torch.float32
Loaded texify model datalab-to/texify on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded table recognition model datalab-to/surya_tablerec on device cpu with dtype torch.float32
Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32


Recognizing layout: 100%|██████████| 3/3 [02:06<00:00, 42.04s/it]
Running OCR Error Detection: 100%|██████████| 3/3 [00:17<00:00,  5.87s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Texify inference: 100%|██████████| 3/3 [00:26<00:00,  8.90s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:40<00:00, 40.97s/it]


In [53]:
import os

# Make sure the output directory for the extracted figures is present
images_folder = "images"
os.makedirs(images_folder, exist_ok=True)

# Write every extracted image to disk and remember where each one went;
# img_paths is read again when the JSON document is assembled.
img_paths = []
for image_name, image_obj in images.items():
    saved_to = os.path.join(images_folder, image_name)
    img_paths.append(saved_to)
    image_obj.save(saved_to)
    print(f"Saved: {saved_to}")


Saved: images/_page_2_Figure_0.jpeg
Saved: images/_page_3_Figure_0.jpeg
Saved: images/_page_12_Figure_1.jpeg
Saved: images/_page_13_Figure_0.jpeg
Saved: images/_page_14_Figure_0.jpeg


In [54]:
import re

def extract_tables_and_text(text):
    """Pull Markdown pipe tables out of ``text``.

    A table is a header row, one or more delimiter rows (``|---|---|``,
    optionally with spaces and alignment colons) and one or more data rows.
    The last data row may end at the end of ``text`` without a newline.

    Args:
        text: Markdown source to scan.

    Returns:
        A ``(tables, remaining_text)`` tuple. ``tables`` holds one entry per
        table; each entry is a list of dicts mapping header cell text to the
        row's cell text. Empty cells are kept (as ``""``) so that values stay
        under the header of their own column. ``remaining_text`` is ``text``
        with each extracted table cut out, stripped of surrounding whitespace.
    """
    table_pattern = (
        r'(\|.*\|[\r\n]+'                        # header row
        r'(\|[ \t]*[-:]+[-:| \t]*\|[\r\n]+)+'    # delimiter row(s)
        r'(\|.*\|(?:[\r\n]+|\Z))+)'              # data rows, last may end the text
    )

    def split_cells(row):
        # Remove only the outer pipes; splitting what is left keeps empty
        # cells in place instead of shifting later values to the left.
        row = row.strip()
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        return [cell.strip() for cell in row.split('|')]

    tables = []
    remaining = []
    cursor = 0

    for match in re.finditer(table_pattern, text):
        # The match starts on a pipe, so strip() only trims trailing newlines.
        table_text = match.group(1).strip()
        rows = re.split(r'[\r\n]+', table_text)
        header = split_cells(rows[0])

        structured_table = [
            dict(zip(header, split_cells(row)))
            for row in rows[2:]  # rows[1] is the delimiter row
        ]
        tables.append(structured_table)

        # Cut out exactly the matched span (not every textual duplicate),
        # leaving the newlines that followed the table in place.
        remaining.append(text[cursor:match.start()])
        cursor = match.start() + len(table_text)

    remaining.append(text[cursor:])
    return tables, "".join(remaining).strip()



In [60]:
output_md = "output(md).md"
# Drop the page-anchor <span> tags from the converted text before saving it
z = re.sub(r'<span id="page-\d+-\d+"></span>', '', text)
with open(output_md, "w", encoding="utf-8") as markdown_out:
    markdown_out.write(z)

print(f"Markdown output saved as {output_md}")

Markdown output saved as output(md).md


In [56]:
import fitz
import re
import json
from datetime import datetime

def extract_title(text):
    """Return the text of the first Markdown heading (levels 1-4) in ``text``.

    Returns ``None`` when no such heading exists.
    """
    first_heading = re.search(r'^(#{1,4})\s+(.*)', text, re.MULTILINE)
    if not first_heading:
        return None
    return first_heading.group(2)

def extract_authors(text):
    """Guess author names from the span between the first two headings.

    The span starts right after the first heading's ``#`` marker, so it begins
    with that heading's own text. Candidates are runs of capitalised words
    (optionally with a middle initial such as ``N.``); any candidate that
    contains a blacklisted word is discarded, and the first surviving
    candidate is dropped (presumably the heading/title text itself).

    Returns an empty list when ``text`` has fewer than two headings.
    """
    heading_marks = list(re.finditer(r'(?m)^#{1,4} ', text))
    if len(heading_marks) < 2:
        return []

    first_mark, second_mark = heading_marks[0], heading_marks[1]
    content = text[first_mark.end():second_mark.start()]

    # Words that indicate an affiliation or field rather than a person
    blacklist = {
    "Google", "Brain", "Research", "University", "Department", "Institute", "Laboratory", "Center",
    "School", "College", "Academy", "Faculty", "Division", "Group", "Project", "Consortium",
    "Organization", "Corporation", "Society", "Institute", "Foundation", "Company", "Inc", "Ltd",
    "Shanghai", "China", "Toronto", "MIT", "Harvard", "Stanford", "Oxford", "Cambridge", "Berkeley",
    "Technology", "Science", "Engineering", "Automation", "Computing", "Data", "Intelligence", "AI",
    "Physics", "Mathematics", "Statistics", "Biology", "Neuroscience", "Cognitive", "Medical", "Health",
    "Economics", "Management", "Humanities", "Social", "Psychology", "Behavioral", "Information",
    "Communication", "Robotics", "Electronics", "Systems", "Networking", "Security", "Cryptography",
    "Applied", "Artificial", "Deep", "Vision", "Natural", "Language", "Processing", "Cybernetics",
    "Computational", "Quantum", "Nanotechnology", "Bionics", "Genomics", "Bioinformatics", "Medicine"
    }

    names = []
    for candidate in re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z]\.)?(?:\s[A-Z][a-z]+)*\b', content):
        # Keep the candidate only if none of its words is blacklisted
        if blacklist.isdisjoint(candidate.split()):
            names.append(candidate)

    return names[1:]


def format_pdf_date(pdf_date):
    """Convert a PDF date string (``D:YYYYMMDD...``) to ``DD-MM-YYYY``.

    Only the eight date digits after the ``D:`` prefix are parsed, because
    only the date is reported. Strings that omit the time part (for example
    ``D:20170612``) are therefore accepted, as are strings with a trailing
    time-zone suffix.

    Args:
        pdf_date: Date string from the PDF metadata; may be empty or ``None``.

    Returns:
        The formatted date, ``"Unknown"`` when the value is missing or lacks
        the ``D:`` prefix, or ``"Invalid Date Format"`` when the date digits
        are absent or do not form a real calendar date.
    """
    if not pdf_date or not pdf_date.startswith("D:"):
        return "Unknown"

    date_digits = pdf_date[2:10]
    # Require exactly YYYYMMDD so a truncated value is never half-parsed.
    if len(date_digits) != 8 or not date_digits.isdigit():
        return "Invalid Date Format"

    try:
        return datetime.strptime(date_digits, "%Y%m%d").strftime("%d-%m-%Y")
    except ValueError:
        return "Invalid Date Format"

def markdown_to_json(tables, text, pdf_path, image_paths=None):
    """Assemble the JSON description of a document from its Markdown text.

    The first heading found is treated as the title and skipped; every later
    heading (levels 1-4) opens a section. Plain lines are appended to the
    current section's description and ``![](...)`` lines attach the next
    saved image path to it.

    Args:
        tables: Table data stored unchanged under the ``"tables"`` key.
        text: Markdown text to walk line by line.
        pdf_path: Path of the source PDF; opened with PyMuPDF only to read
            the page count and the creation/modification dates.
        image_paths: Saved image paths, consumed one per image line in
            order (assumes they are listed in the order the images are
            referenced in ``text`` -- TODO confirm). Defaults to the
            notebook-level ``img_paths`` list, or an empty list if that
            name does not exist.

    Returns:
        A JSON string (4-space indent) with the keys ``"title"``,
        ``"Authors"``, ``"metadata"``, ``"contents"``, ``"text"`` and
        ``"tables"``.
    """
    if image_paths is None:
        # Fall back to the list built by the image-saving cell.
        image_paths = globals().get("img_paths", [])

    doc = fitz.open(pdf_path)
    try:
        # metadata can be None (e.g. for a locked document), so default to {}.
        md = doc.metadata or {}
        metadatad = {
            "No. of Pages": len(doc),
            "Creation Date": format_pdf_date(md.get("creationDate", "")),
            # NOTE(review): this key keeps its trailing colon so that existing
            # consumers of the JSON still find it.
            "Last Modification Date:": format_pdf_date(md.get("modDate", ""))
        }
    finally:
        # Release the file handle even if reading the metadata fails.
        doc.close()

    data = {
        "title": extract_title(text),
        "Authors": extract_authors(text),
        "metadata": metadatad,
        "contents": [],
        "text": {},
        "tables": tables
    }

    current_section = None  # Heading text of the section being filled
    img_num = 0  # Index of the next unused entry in image_paths
    first_heading_skipped = False  # The first heading is the title, not a section

    for line in text.split("\n"):
        # Remove the page-anchor spans first and trim afterwards, so an
        # anchor placed before a heading cannot hide the heading marker.
        line = re.sub(r'<span id="page-\d+-\d+"></span>', '', line).strip()

        headings = re.match(r'^(#{1,4})\s+(.*)', line)
        if headings:
            if not first_heading_skipped:
                first_heading_skipped = True
                continue

            current_section = headings.group(2).strip()
            data["contents"].append(current_section)
            data["text"][current_section] = {"description": "", "diagrams": []}

        elif line.startswith("![](") and ")" in line:
            img_url = re.findall(r'!\[\]\((.*?)\)', line)
            if img_url:
                if current_section and img_num < len(image_paths):
                    data["text"][current_section]["diagrams"].append(image_paths[img_num])
                # Count every image line, even one that precedes the first
                # section, so later images keep lining up with their paths.
                img_num += 1

        elif line and current_section:
            data["text"][current_section]["description"] += line + " "

    return json.dumps(data, indent=4)

# Split the converted Markdown into structured tables and the table-free
# remainder, then build the JSON document from both plus the source PDF.
a = text
tables, uptext = extract_tables_and_text(a)
json_output = markdown_to_json(tables,uptext,file_path)

In [57]:
output_filename = "output(json).json"
# Persist the JSON string produced above as UTF-8 text
with open(output_filename, "w", encoding="utf-8") as json_out:
    json_out.write(json_output)

print(f"JSON output saved as {output_filename}")

JSON output saved as output(json).json


In [58]:
import shutil

# Bundle the saved figures into a single zip archive
folder_to_zip = "/content/images"
zip_filename = "/content/images.zip"
# make_archive appends the ".zip" suffix itself, so pass the suffix-free name
archive_base = zip_filename.replace(".zip", "")
shutil.make_archive(archive_base, 'zip', folder_to_zip)

'/content/images.zip'

In [59]:
from google.colab import files
import shutil

# Rebuild the image archive (same call as the zip cell before this one)
folder_to_zip = "/content/images"
zip_filename = "/content/images.zip"
archive_base = zip_filename.replace(".zip", "")
shutil.make_archive(archive_base, 'zip', folder_to_zip)

# Send each output file to the browser, archive first
for download_target in (
    zip_filename,
    "/content/output(md).md",
    "/content/output(json).json",
):
    files.download(download_target)

print("Files Downloaded Successfully!!!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Files Downloaded Successfully!!!
