In [4]:
import os
from pathlib import Path

import fitz  # PyMuPDF
import pandas as pd
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

cwd = Path().resolve()
print("CWD:", cwd)


def locate_data_subdir(name: str, fallback_root: Path) -> Path:
    """Return the ``data/<name>`` directory to use for this notebook.

    An already-existing ``data/<name>`` under the current working directory
    is preferred, then one under its parent directory. When neither exists,
    ``fallback_root / name`` is returned; nothing is created here.

    Parameters
    ----------
    name : str
        Sub-directory name inside ``data`` (e.g. ``"page_images"``).
    fallback_root : Path
        Data root used when no existing directory is found.
    """
    for base in (cwd, cwd.parent):
        candidate = base / "data" / name
        if candidate.exists():
            return candidate
    return fallback_root / name


# Locate PDF (works whether CWD is project root or /notebooks)
pdf_path = cwd / "data" / "raw" / "tables-charts.pdf"
if not pdf_path.exists():
    pdf_path = cwd.parent / "data" / "raw" / "tables-charts.pdf"

print("PDF path:", pdf_path)
print("Exists:", pdf_path.exists())

# pdf_path is <data root>/raw/<file>, so two levels up is the data root.
# New directories are anchored here rather than unconditionally at
# cwd.parent, so a first run from the project root does not create
# data/ folders outside the project.
data_root = pdf_path.parent.parent

# Directories for images and processed data
page_img_dir = locate_data_subdir("page_images", data_root)
page_img_dir.mkdir(parents=True, exist_ok=True)

processed_dir = locate_data_subdir("processed", data_root)

print("Page image dir:", page_img_dir)
print("Processed dir:", processed_dir)


CWD: C:\Users\SEC\OneDrive\Desktop\docinsight\notebooks
PDF path: C:\Users\SEC\OneDrive\Desktop\docinsight\data\raw\tables-charts.pdf
Exists: True
Page image dir: C:\Users\SEC\OneDrive\Desktop\docinsight\data\page_images
Processed dir: C:\Users\SEC\OneDrive\Desktop\docinsight\notebooks\data\processed


In [5]:
# Render every page of the PDF to page_img_dir as page<index>.png
# (zero-based index). The context manager closes the document even if
# rendering or saving a page raises.
with fitz.open(pdf_path) as doc:
    print("Pages in PDF:", len(doc))

    for page_index, page in enumerate(doc):
        pix = page.get_pixmap(dpi=150)  # 150 dpi is usually enough
        out_path = page_img_dir / f"page{page_index}.png"
        pix.save(out_path)
        print("Saved:", out_path.name)


Pages in PDF: 14
Saved: page0.png
Saved: page1.png
Saved: page2.png
Saved: page3.png
Saved: page4.png
Saved: page5.png
Saved: page6.png
Saved: page7.png
Saved: page8.png
Saved: page9.png
Saved: page10.png
Saved: page11.png
Saved: page12.png
Saved: page13.png


In [6]:
# Checkpoint identifier passed to both the processor and the captioning model.
blip_model_name = "Salesforce/blip-image-captioning-base"
print("Loading BLIP model:", blip_model_name)

# Processor first, then the model (same load order as before).
processor, blip_model = (
    BlipProcessor.from_pretrained(blip_model_name),
    BlipForConditionalGeneration.from_pretrained(blip_model_name),
)

print("BLIP model loaded.")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading BLIP model: Salesforce/blip-image-captioning-base


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

BLIP model loaded.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [7]:
def page_number_from_path(img_path):
    """Return the page number encoded in a ``page<N>`` file stem.

    Returns ``None`` when the stem is not ``page`` followed only by decimal
    digits, so unrelated images in the directory can be skipped instead of
    raising ``ValueError`` during sorting.
    """
    stem = img_path.stem
    digits = stem[len("page"):]
    if stem.startswith("page") and digits.isdecimal():
        return int(digits)
    return None


image_suffixes = {".png", ".jpg", ".jpeg"}
chunk_columns = ["chunk_id", "chunk_type", "page_number", "text", "char_len"]

# Pair each usable image with its page number once; the number is reused for
# both sorting and the chunk metadata below.
numbered_images = []
for candidate in page_img_dir.iterdir():
    if candidate.suffix.lower() not in image_suffixes:
        continue
    number = page_number_from_path(candidate)
    if number is None:
        print("Skipping (not a page image):", candidate.name)
        continue
    numbered_images.append((number, candidate))

numbered_images.sort(key=lambda pair: pair[0])  # sort by page number
image_files = [path for _, path in numbered_images]

print("Found page images:", len(image_files))

page_captions = []

for page_num, img_path in numbered_images:
    # convert() returns a new in-memory image, so the file handle can be
    # released as soon as the conversion is done.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")

    inputs = processor(image, return_tensors="pt")
    out = blip_model.generate(**inputs, max_new_tokens=40)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{img_path.name} → {caption}")

    page_captions.append({
        "chunk_id": f"pageimg_{page_num}",
        "chunk_type": "chart",          # treat page-level visual as a chart/figure chunk
        "page_number": page_num,
        "text": caption,
        "char_len": len(caption)
    })

# Explicit columns keep the schema stable even when no images were found.
df_chart_chunks = pd.DataFrame(page_captions, columns=chunk_columns)
print("Chart chunks shape:", df_chart_chunks.shape)
df_chart_chunks.head()


Found page images: 14
page0.png → a table of contents for a table of contents
page1.png → a table of contents for the text and the text
page2.png → a table of the number and type of the elements in the periodics
page3.png → a table with the number of the numbers in each column
page4.png → a graph graphing graphing graphing graphing graphing graphing graphing graphing graphing graph
page5.png → nci class 12 math question paper
page6.png → a graph plot with a line graph
page7.png → a diagram of the effect of the effect of the effect of the effect of the effect of the effect of
page8.png → a diagram of the number of different types of the genome
page9.png → a diagram of a block diagram
page10.png → a diagram of a flow flow diagram
page11.png → a flow diagram for a flow flow
page12.png → a sample of a research paper
page13.png → a document with the title title and title title
Chart chunks shape: (14, 5)


Unnamed: 0,chunk_id,chunk_type,page_number,text,char_len
0,pageimg_0,chart,0,a table of contents for a table of contents,43
1,pageimg_1,chart,1,a table of contents for the text and the text,45
2,pageimg_2,chart,2,a table of the number and type of the elements...,63
3,pageimg_3,chart,3,a table with the number of the numbers in each...,53
4,pageimg_4,chart,4,a graph graphing graphing graphing graphing gr...,94


In [8]:
# Load current master chunks (text + tables)
master_path = processed_dir / "tables-charts_master_chunks.csv"

if master_path.exists():
    df_master = pd.read_csv(master_path)
    print("Loaded existing master chunks:", df_master.shape)
else:
    # Fallback: if master doesn't exist yet, try loading plain text chunks
    chunks_path = processed_dir / "tables-charts_chunks.csv"
    df_master = pd.read_csv(chunks_path)
    df_master["chunk_type"] = "text"
    df_master["chunk_id"] = df_master["chunk_id"].astype(str)
    print("Loaded text-only chunks:", df_master.shape)

# The master file is read and then overwritten below, so re-running this cell
# would otherwise append the same caption rows again. Drop any rows whose
# chunk_id is about to be (re-)appended to keep the cell idempotent.
if "chunk_id" in df_master.columns and "chunk_id" in df_chart_chunks.columns:
    already_present = df_master["chunk_id"].astype(str).isin(
        df_chart_chunks["chunk_id"].astype(str)
    )
    if already_present.any():
        print("Replacing previously saved chart chunks:", int(already_present.sum()))
        df_master = df_master.loc[~already_present]

# Append chart/page-image caption chunks (ignore_index renumbers rows 0..n-1)
df_master = pd.concat([df_master, df_chart_chunks], ignore_index=True)

print("New master shape:", df_master.shape)
print(df_master["chunk_type"].value_counts())

# Save updated master file
df_master.to_csv(master_path, index=False)
print("Saved updated master chunks to:", master_path)


Loaded existing master chunks: (73, 7)
New master shape: (87, 7)
chunk_type
text     65
chart    14
table     8
Name: count, dtype: int64
Saved updated master chunks to: C:\Users\SEC\OneDrive\Desktop\docinsight\notebooks\data\processed\tables-charts_master_chunks.csv
