In [11]:
import os
import fitz  
import pandas as pd
from pymupdf4llm import to_markdown

In [12]:
def is_scanned_pdf(pdf_path, max_pages=3):
    """
    Heuristically decide whether a PDF is a scan (no embedded text layer).

    Only the leading pages are sampled, so a document whose first
    ``max_pages`` pages are images but which has text later on is still
    reported as scanned.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF, passed straight to ``fitz.open``.
    max_pages : int or None, default 3
        How many leading pages to inspect. ``None`` inspects every page.

    Returns
    -------
    bool or None
        ``False`` as soon as a sampled page yields non-whitespace text,
        ``True`` if none of the sampled pages do, and ``None`` if the
        document cannot be opened or read.
    """
    try:
        # The context manager guarantees the document is closed on every
        # exit path (early return or exception), so no handle is leaked
        # when this runs over a whole folder of files.
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)
            limit = total_pages if max_pages is None else min(max_pages, total_pages)
            # any() short-circuits on the first page that has real text.
            has_text = any(doc[i].get_text().strip() for i in range(limit))
            return not has_text
    except Exception:
        # Deliberate best-effort: an unreadable file is reported as "unknown"
        # (None) instead of aborting a batch analysis.
        return None

def analyze_pdf(pdf_path):
    """
    Analyze a single PDF file and return a dict with its properties.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF on disk.

    Returns
    -------
    dict
        Keys, in order: ``filename``, ``file_size_mb`` (rounded to 3
        decimals), ``page_count``, ``has_toc``, ``toc_entries``,
        ``is_scanned`` and ``markdown_status`` (``'success'``, ``'empty'``
        or ``'error'``). If the document cannot be opened or inspected,
        every field after ``file_size_mb`` is ``None`` and
        ``markdown_status`` is ``'error'``.

    Raises
    ------
    OSError
        If ``pdf_path`` does not exist; the size lookup happens before any
        error handling.
    """
    record = {
        'filename': os.path.basename(pdf_path),
        'file_size_mb': round(os.path.getsize(pdf_path) / (1024*1024), 3)
    }
    try:
        # The context manager closes the document on both the success and
        # the failure path, so batch runs do not accumulate open handles.
        with fitz.open(pdf_path) as doc:
            record['page_count'] = len(doc)
            toc = doc.get_toc()
            record['has_toc'] = bool(toc)
            record['toc_entries'] = len(toc) if toc else 0
            record['is_scanned'] = is_scanned_pdf(pdf_path)
            try:
                # Convert only the first page: enough to tell whether
                # markdown extraction works without paying for the whole file.
                md = to_markdown(doc, pages=[0])
                record['markdown_status'] = 'success' if md.strip() else 'empty'
            except Exception:
                # A conversion failure must not discard the metadata
                # that was already collected above.
                record['markdown_status'] = 'error'
    except Exception:
        # The document could not be opened or inspected: blank out every
        # derived field so the row is still present in the result table.
        record.update(
            page_count=None,
            has_toc=None,
            toc_entries=None,
            is_scanned=None,
            markdown_status='error',
        )
    return record

# Folder holding the PDFs to analyze, given as a relative path
# (the original note says it contains 383 PDFs).
pdf_folder = "383-pdfs"  

In [13]:
# Collect every *.pdf entry (case-insensitive extension) in the folder, in
# sorted order, then run the per-file analysis and keep one record per file.
file_list = sorted(
    entry
    for entry in os.listdir(pdf_folder)
    if entry.lower().endswith('.pdf')
)

records = []
for fname in file_list:
    fpath = os.path.join(pdf_folder, fname)
    records.append(analyze_pdf(fpath))

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



In [14]:
# Turn the list of per-file record dicts into a table, one row per PDF.
df = pd.DataFrame(records)
# Persist the table to the current working directory; index=False leaves the
# row index out of the CSV.
df.to_csv("pdfs_analysis_table.csv", index=False)
print("✅ Saved detail table to pdfs_analysis_table.csv")
# Bare last expression so the notebook renders a preview of the table.
df.head()  # show first rows for confirmation

✅ Saved detail table to pdfs_analysis_table.csv


Unnamed: 0,filename,file_size_mb,page_count,has_toc,toc_entries,is_scanned,markdown_status
0,12936_2015_Article_885_pdf.pdf,0.641,15,True,30,False,success
1,1_s20_S2667114X21000248_main.pdf,0.235,6,True,25,False,success
2,1_s20_S2667114X21000418_main.pdf,1.32,13,True,26,False,success
3,2010-Incidence_of_malaria_and_efficacy_of_comb...,0.53,8,False,0,False,success
4,2011-Intermittent_preventive_treatment_of_mala...,0.39,14,False,0,False,success


In [15]:
# Store a second copy of the table next to the PDFs themselves; build the
# destination path once and reuse it for both the write and the message.
csv_in_folder = os.path.join(pdf_folder, 'pdfs_analysis_table.csv')
df.to_csv(csv_in_folder, index=False)
print(f"✅ Saved detail table to {csv_in_folder}")

✅ Saved detail table to 383-pdfs\pdfs_analysis_table.csv
