In [None]:
import os
import camelot
import json
from PyPDF2 import PdfReader

In [None]:

def extract_table_details(pdf_path):
    """Summarise every table camelot detects in a PDF.

    Args:
        pdf_path: Path of the PDF handed to ``camelot.read_pdf`` (all pages
            are scanned).

    Returns:
        A dict ``{"tables": [...]}`` with one entry per detected table. Each
        entry holds the table's bounding box under ``"coordinates"``
        (``x1``, ``y1``, ``x2``, ``y2``) and its ``"rows"`` / ``"columns"``
        counts.

    Any exception raised during extraction is printed rather than
    propagated; entries gathered before the failure are still returned.
    """
    summary = {"tables": []}
    entries = summary["tables"]

    try:
        detected = camelot.read_pdf(pdf_path, pages='all')
        print(f"Found {len(detected)} table(s) in {os.path.basename(pdf_path)}.")

        for tbl in detected:
            # camelot stores the bounding box on the private ``_bbox`` attribute.
            box = tbl._bbox
            n_rows, n_cols = tbl.shape

            corners = {"x1": box[0], "y1": box[1], "x2": box[2], "y2": box[3]}
            # Appended one at a time so earlier tables survive a later failure.
            entries.append(
                {"coordinates": corners, "rows": n_rows, "columns": n_cols}
            )
    except Exception as e:
        print(f"An error occurred while processing {os.path.basename(pdf_path)}: {e}")

    return summary



In [None]:
def process_pdfs(input_folder, json_folder, details_folder):
    """Extract table details from every PDF in a folder and save them as JSON.

    For each file directly inside ``input_folder`` whose name ends in
    ``.pdf`` (case-insensitive), the summary returned by
    ``extract_table_details`` is written to
    ``<details_folder>/<name without the .pdf suffix>_table_details.json``.

    Args:
        input_folder: Directory scanned (non-recursively) for PDF files.
        json_folder: Directory created if missing. This function does not
            write anything into it.
        details_folder: Directory that receives the JSON files; created if
            missing.

    Returns:
        None.

    Raises:
        OSError: If ``input_folder`` cannot be listed or an output file
            cannot be written.
    """
    pdf_suffix = '.pdf'

    # Create folders if they don't exist
    os.makedirs(json_folder, exist_ok=True)
    os.makedirs(details_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for pdf_filename in sorted(os.listdir(input_folder)):
        # Case-insensitive so files such as "SCAN.PDF" are not silently skipped.
        if not pdf_filename.lower().endswith(pdf_suffix):
            continue

        pdf_path = os.path.join(input_folder, pdf_filename)
        table_details = extract_table_details(pdf_path)

        # Strip only the trailing extension. str.replace would also rewrite
        # any ".pdf" occurring earlier in the name (e.g. "a.pdf.v2.pdf").
        stem = pdf_filename[:-len(pdf_suffix)]
        json_filename = f"{stem}_table_details.json"
        json_path = os.path.join(details_folder, json_filename)

        # Save the table details to a JSON file
        with open(json_path, "w") as json_file:
            json.dump(table_details, json_file, indent=4)

        print(f"Saved table details for {pdf_filename} to {json_path}.")


In [None]:
if __name__ == "__main__":
    # Relative folder names, resolved against the current working directory.
    input_folder, json_folder, details_folder = (
        "input_folder",
        "json_folder",
        "output_folder",
    )

    # Run the batch extraction over every PDF found in the input folder.
    process_pdfs(
        input_folder=input_folder,
        json_folder=json_folder,
        details_folder=details_folder,
    )
