In [23]:
#importing all the necessary libraries and packages, 
import os
import fitz  #From my PyMuPDF
import json
from pathlib import Path
from PIL import Image
from scripts.match_captions_to_images import map_captions_to_images


In [24]:
def extract_text_and_images(pdf_path, output_json_dir, output_image_dir):
    doc=fitz.open(pdf_path)
    paper_id =Path(pdf_path).stem

    title, abstract, body = "", "", ""

    #parsing first few pages
    for i, page in enumerate(doc):
        text=page.get_text("text")

        if i==0:
            lines=text.strip().split("\n")
            if len(lines)>0:
                title=lines[0].strip()
            if len(lines) > 1:
                abstract = lines[1].strip()

        body+=text+"\n"

        #Extracting images
        for img_index, img in enumerate(page.get_images(full=True)):
            xref=img[0]
            base_image=doc.extract_image(xref)
            image_bytes=base_image["image"]

            image_filename=f"{paper_id}_fig{img_index}.png"
            image_output_path=os.path.join(output_image_dir, image_filename)

            with open(image_output_path, "wb") as img_file:
                img_file.write(image_bytes)

    #Saving all the extracted texts into JSON file
    output = {
        "paper_id": paper_id,
        "title": title.strip(),
        "abstract": abstract.strip(),
        "body_text": body.strip()
    }

    json_output_path = os.path.join(output_json_dir, f"{paper_id}.json")
    with open(json_output_path, "w") as f:
        json.dump(output, f, indent=2)
    doc.close()


In [20]:
import re

def extract_captions_from_text(text):
    pattern=r"(Figure|Fig\.?|Table)\s+(\d+)[\.:]?\s+(.*?)(?=\n[A-Z]|\Z)"
    matches=re.findall(pattern, text, re.IGNORECASE | re.DOTALL)

    captions=[]
    for label, number, caption_text in matches:
        clean= caption_text.strip().replace("\n", " ")
        full_caption =f"{label} {number}: {clean}"
        captions.append(full_caption)

    return captions

captions=extract_captions_from_text(text)

print(f"Extracted {len(captions)} captions")
for i, c in enumerate(captions):
    print(f"{i+1}: {c[:150]}...")


Extracted 26 captions
1: Figure 1: Overview of the VidThinker annotation pipeline for VideoITG. The pipeline consists...
2: Fig. 1: Our annotation pipeline is inspired by the human reasoning...
3: Figure 2: Illustration of four instruction types and their corresponding frame selection strategies in...
4: Table 1: Comparison of dataset statistics for temporal grounding and highlight detection datasets....
5: Fig. 3: (b)....
6: Fig. 3: (b), we start by...
7: Figure 3: VideoITG model design: (a) Text generation aligns video and language tokens for...
8: Fig. 3: (b)....
9: Fig. 3: (b). The main drawback of this paradigm is that in...
10: Table 2: Performance comparison of VideoITG integrated with different Video-LLMs, varying in...
11: Table 3: The performance (accuracy) of SOTA methods on video benchmarks. For InternVL2.5-8B...
12: Table 4: presents a comprehensive analysis on the design of our VideoITG framework, directly...
13: Table 4: Empirical studies on the VideoITG-40k dataset and 

In [27]:
pdf_dir="../pdfs"
json_dir="../json"
image_dir="../data/images"

os.makedirs(json_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

for file in os.listdir(pdf_dir):
    if not file.endswith(".pdf"):
        continue

    pdf_path=os.path.join(pdf_dir, file)

    try:
        extract_text_and_images(
            pdf_path=pdf_path,
            output_json_dir=json_dir,
            output_image_dir=image_dir
        )
    except Exception as e:
        print(f"[!] Skipping {file} due to error: {e}")
        os.remove(pdf_path)
        print(f"[🗑️] Removed corrupted: {file}")


[!] Skipping 2507.12384v1.pdf due to error: Failed to open file '../pdfs/2507.12384v1.pdf'.
[🗑️] Removed corrupted: 2507.12384v1.pdf
MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: syntax error: syntax error in content stream

MuPDF error: syntax error: unknown keyword: '@pgfcolorspaces'

MuPDF error: syntax error: unknown keyword: 'put'

MuPDF error: syntax error: unknown keyword: '@resources'

MuPDF error: syntax error: unknown keyword: 'put'

MuPDF error: syntax error: unknown keyword: '@pgfcolorspaces'

MuPDF error: syntax error: unknown keyword: 'put'

MuPDF error: syntax error: unknown keyword: '@resources'

MuPDF error: syntax error: unknown keyword: 'put'

MuPDF erro

In [None]:
map_captions_to_images(
    json_dir=json_dir,
    image_dir=image_dir,
    output_file="../data/captions.json"
)
