In [1]:
import pdfplumber
import pandas as pd
import pytesseract
from pdf2image import convert_from_path


def extract(pdf_path, page_number):
    """Extract every table on one page of a PDF as DataFrames.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.
        page_number: 1-based number of the page to read.

    Returns:
        A list holding one ``pandas.DataFrame`` per table returned by
        ``page.extract_tables()``, in the same order. The list is empty
        when the page yields no tables.

    Raises:
        IndexError: If ``page_number`` is outside ``1..number of pages``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        # Without this check a page_number of 0 or below becomes a negative
        # list index and silently reads a page counted from the end.
        if not 1 <= page_number <= page_count:
            raise IndexError(
                f"page_number {page_number} is out of range for "
                f"{pdf_path} ({page_count} pages)"
            )
        page = pdf.pages[page_number - 1]
        return [pd.DataFrame(rows) for rows in page.extract_tables()]


def performpageop(pdf_path, page_number):
    """Run OCR over a single PDF page and return the recognised text.

    The page is rendered through ``convert_from_path`` with both
    ``first_page`` and ``last_page`` set to ``page_number``; every image
    that comes back is passed to ``pytesseract.image_to_string`` and the
    resulting strings are joined with newline characters.
    """
    rendered = convert_from_path(
        pdf_path,
        first_page=page_number,
        last_page=page_number,
    )
    return "\n".join(
        pytesseract.image_to_string(picture) for picture in rendered
    )
# Maps each PDF file name (resolved against the current directory) to the
# page number whose tables should be exported.
pdf_files = {
    "cardio_structured.pdf": 6,
    "prot_sap_102.pdf": 50,
    "prot_sap_1.pdf": 14,
}

output_file = "new_tables.xlsx"

# Excel does not accept worksheet names longer than this many characters.
MAX_SHEET_NAME_LEN = 31


def _sheet_name(pdf_name, suffix):
    """Build a worksheet name of at most ``MAX_SHEET_NAME_LEN`` characters.

    ``pdf_name + suffix`` is returned unchanged when it already fits.
    Otherwise the file extension is dropped and the remaining stem is
    truncated so that ``suffix`` (which carries the page and table
    identifiers) is preserved. Two different PDFs whose shortened stems
    coincide would still produce the same name.
    """
    full_name = f"{pdf_name}{suffix}"
    if len(full_name) <= MAX_SHEET_NAME_LEN:
        return full_name
    stem = pdf_name.rsplit(".", 1)[0]
    room = max(MAX_SHEET_NAME_LEN - len(suffix), 0)
    return f"{stem[:room]}{suffix}"[:MAX_SHEET_NAME_LEN]


# The context manager closes the writer even when processing a PDF raises,
# instead of leaving the workbook handle open.
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for pdf_name, page_number in pdf_files.items():
        print(f"Processing {pdf_name} - Page {page_number}...")
        pdf_path = f"./{pdf_name}"

        tables = extract(pdf_path, page_number)

        if tables:
            # One worksheet per table found on the page.
            for i, table in enumerate(tables):
                table.to_excel(
                    writer,
                    sheet_name=_sheet_name(
                        pdf_name, f"_Page{page_number}_Table{i+1}"
                    ),
                    index=False,
                )
        else:
            # No tables were extracted: fall back to OCR text for the page.
            text = performpageop(pdf_path, page_number)
            # NOTE(review): this lays the OCR lines out as a single row with
            # one column per line; confirm that is intended rather than one
            # line per row.
            df_text = pd.DataFrame([text.split("\n")])
            df_text.to_excel(
                writer,
                sheet_name=_sheet_name(pdf_name, f"_Page{page_number}_OCR"),
                index=False,
            )

print(f"Extraction completed! Saved as {output_file}")

Processing cardio_structured.pdf - Page 6...




Processing prot_sap_102.pdf - Page 50...
Processing prot_sap_1.pdf - Page 14...
Extraction completed! Saved as new_tables.xlsx
