In [6]:
import pdfplumber
import pandas as pd


def extract_dimensions(pdf_path):
    """Collect every table row found in a PDF into one flat list.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``pdfplumber.open``.

    Returns:
        list: The rows of every table returned by ``page.extract_tables()``,
        concatenated in page order, then table order within each page.
        No row is filtered out or modified.
    """
    with pdfplumber.open(pdf_path) as document:
        # Flatten pages -> tables -> rows in a single pass.
        return [
            row
            for page in document.pages
            for table in page.extract_tables()
            for row in table
        ]


def format_dimensions(dimensions):
    """Align raw table rows to the fixed 19-column measurement layout.

    The layout is four static columns ("Dim", "Description", "Tol (-)",
    "Tol (+)") followed by three sub-columns ("Increment", "Sample",
    "Deviation") for each of the sizes XS, S, M, L and XL.

    Args:
        dimensions: Iterable of row sequences (lists or tuples). Rows shorter
            than the layout are right-padded with ``None``; longer rows are
            truncated. The input rows are never modified.

    Returns:
        tuple: ``(combined_headers, df)`` where ``combined_headers`` is a
        two-row DataFrame (size names on the first row, sub-column names on
        the second; the static column names appear on both rows) and ``df``
        holds the aligned data rows under the top-level header names.
    """
    # Define static columns and dynamic size categories
    static_columns = ["Dim", "Description", "Tol (-)", "Tol (+)"]
    size_categories = ["XS", "S", "M", "L", "XL"]
    sub_headers = ["Increment", "Sample", "Deviation"]

    # Create multi-level headers: each size name is repeated once per sub-column
    top_header = static_columns + [size for size in size_categories for _ in sub_headers]
    sub_header = static_columns + sub_headers * len(size_categories)

    # Validate and align rows with the required columns
    expected_cols = len(top_header)
    formatted_data = []
    for row in dimensions:
        # Work on a copy: padding the caller's row in place would silently
        # change the list passed in, and would fail outright for tuple rows.
        aligned = list(row)[:expected_cols]
        aligned.extend([None] * (expected_cols - len(aligned)))
        formatted_data.append(aligned)

    # Convert to DataFrame (column names intentionally repeat per size)
    df = pd.DataFrame(formatted_data, columns=top_header)

    # Combine headers for Excel output
    combined_headers = pd.DataFrame([top_header, sub_header])
    return combined_headers, df


def save_to_excel(headers, df, output_file):
    """Write the header rows followed by the data rows to an Excel file.

    Args:
        headers: DataFrame holding the header rows; written starting at the
            first worksheet row.
        df: DataFrame holding the data rows; written immediately below the
            header rows.
        output_file: Destination path given to ``pd.ExcelWriter``.

    Neither frame's own index or column labels are written. A confirmation
    message is printed once the writer has been closed.
    """
    # Both frames are written as bare cell values: no index, no column labels.
    write_options = {"index": False, "header": False}
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as workbook:
        headers.to_excel(workbook, startrow=0, **write_options)
        # Data starts on the first row after the header block.
        data_start = headers.shape[0]
        df.to_excel(workbook, startrow=data_start, **write_options)
    print(f"Formatted data successfully saved to {output_file}")


def main(
    pdf_path=r"E:\git\Automated-Email-Parsing-and-Document-Generation\input_doc.pdf",
    output_file="NEWBook1_final_output_corrected.xlsx",
):
    """Run the PDF-to-Excel pipeline: extract rows, align them, save them.

    Args:
        pdf_path: PDF to read. The default is the author's local file; pass
            your own path instead of editing the function body.
        output_file: Path of the Excel workbook to create.

    Calling ``main()`` with no arguments behaves exactly as before.
    """
    dimensions = extract_dimensions(pdf_path)
    headers, formatted_df = format_dimensions(dimensions)
    save_to_excel(headers, formatted_df, output_file)


# Run the pipeline when this code is executed directly (as a script or as a
# notebook cell); the guard keeps an import of this code from triggering it.
if __name__ == "__main__":
    main()


Formatted data successfully saved to NEWBook1_final_output_corrected.xlsx


In [3]:
pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
   ---------------------------------------- 0.0/159.9 kB ? eta -:--:--
   -- ------------------------------------- 10.2/159.9 kB ? eta -:--:--
   ------- ------------------------------- 30.7/159.9 kB 435.7 kB/s eta 0:00:01
   ------- ------------------------------- 30.7/159.9 kB 435.7 kB/s eta 0:00:01
   --------- ----------------------------- 41.0/159.9 kB 217.9 kB/s eta 0:00:01
   ---------------------- ---------------- 92.2/159.9 kB 438.1 kB/s eta 0:00:01
   ---------------------- ---------------- 92.2/159.9 kB 438.1 kB/s eta 0:00:01
   ---------------------- ---------------- 92.2/159.9 kB 438.1 kB/s eta 0:00:01
   ---------------------- ---------------- 92.2/159.9 kB 438.1 kB/s eta 0:00:01
   ---------------------- ---------------- 92.2/159.9 kB 438.1 kB/s eta 0:00:01
   ----------------------------- -------- 122.9/159.9 kB 266.9 kB/s eta 0:


[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: C:\Users\Asus\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
