# In [1]:
import pdfplumber
import pandas as pd

def extract_dimensions_with_sizes(pdf_path):
    """Collect marker-tagged table rows from every page of a PDF.

    The first row of the first non-empty table seen is used as the header
    row for the whole document. Every later data row whose cell count
    matches that header row is checked for one of two marker texts in its
    string form and, if one is present, stored as a header -> cell mapping.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A tuple ``(dimensions_1_9, dimensions_10_15, headers)``: the rows
        containing "Displaying 1-9 results", the rows containing
        "Displaying 10-15 results", and the captured header row (an empty
        list when no header row was captured).
    """
    first_marker = "Displaying 1-9 results"
    second_marker = "Displaying 10-15 results"

    dimensions_1_9, dimensions_10_15 = [], []
    headers = []  # filled from the first table that provides a header row

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    continue
                if not headers:
                    headers = table[0]
                expected_width = len(headers)

                for row in table[1:]:
                    # Rows with a different cell count cannot be zipped
                    # against the header row, so they are skipped.
                    if len(row) != expected_width:
                        continue
                    row_text = str(row)
                    if first_marker in row_text:
                        dimensions_1_9.append(dict(zip(headers, row)))
                    elif second_marker in row_text:
                        dimensions_10_15.append(dict(zip(headers, row)))

    return dimensions_1_9, dimensions_10_15, headers

def process_combined_dimensions(dimensions_1_9, dimensions_10_15, headers):
    """Merge both result sections into one DataFrame with cleaned column names.

    Each record is a mapping of raw header text to cell value. Entries whose
    header is ``None``, empty, or whitespace-only are dropped, and the
    remaining headers are stripped of surrounding whitespace. Records from
    the first section come before records from the second.

    Args:
        dimensions_1_9: Records of the first section.
        dimensions_10_15: Records of the second section.
        headers: Raw header row. It is only consulted when both sections
            are empty, so that the returned frame still carries the cleaned
            column names instead of having no columns at all.

    Returns:
        A ``pandas.DataFrame`` with one row per record.
    """

    def _clean(record):
        # Keep only cells that belong to a usable (non-blank) header.
        return {
            header.strip(): value
            for header, value in record.items()
            if header and header.strip()
        }

    combined_data = [
        _clean(record) for record in (*dimensions_1_9, *dimensions_10_15)
    ]

    if not combined_data:
        # No rows were extracted: derive the schema from the header row so
        # callers can still look columns up by name. dict.fromkeys removes
        # duplicates while keeping first-occurrence order.
        fallback_columns = list(
            dict.fromkeys(
                header.strip()
                for header in (headers or [])
                if isinstance(header, str) and header.strip()
            )
        )
        if fallback_columns:
            return pd.DataFrame(columns=fallback_columns)

    return pd.DataFrame(combined_data)

def flatten_and_save(df, output_file):
    """Flatten multi-level column labels and write the frame to Excel.

    Tuple labels (as produced by a ``MultiIndex``) are collapsed into a
    single string by joining their non-empty levels with ``_``. Labels that
    are not tuples are left untouched; previously a plain string label was
    iterated character by character, turning ``"Dim"`` into ``"D_i_m"``.

    Note that ``df`` is modified in place: its columns are replaced by the
    flattened labels before the file is written.

    Args:
        df: DataFrame to save.
        output_file: Destination passed to ``DataFrame.to_excel``.
    """

    def _flatten(label):
        if isinstance(label, tuple):
            # Skip empty levels such as the "" used for single-level
            # columns; str() lets non-string levels be joined too.
            return "_".join(str(level) for level in label if level).strip()
        return label

    df.columns = [_flatten(label) for label in df.columns]

    # Save the data to Excel
    df.to_excel(output_file, index=False)
    print(f"Final combined and flattened output saved to {output_file}")

def main(
    pdf_path=r"E:\git\Automated-Email-Parsing-and-Document-Generation\input_doc.pdf",
    output_file="final_combined_output_flat.xlsx",
):
    """Extract the dimension rows from a PDF and save them as a flat Excel sheet.

    The extracted rows are arranged under a two-level column layout: four
    descriptive columns followed by Increment/Sample/Deviation columns for
    each size. The layout is then flattened and written to ``output_file``.

    Args:
        pdf_path: PDF to read. Defaults to the path used by the original
            script; pass your own path to process another document.
        output_file: Excel file to write.
    """
    dimensions_1_9, dimensions_10_15, headers = extract_dimensions_with_sizes(pdf_path)

    # Combine and process dimensions dynamically
    combined_df = process_combined_dimensions(dimensions_1_9, dimensions_10_15, headers)

    base_fields = ("Dim", "Description", "Tol (-)", "Tol (+)")
    sizes = ("XS", "S", "M", "L", "XL")
    sub_fields = ("Increment", "Sample", "Deviation")

    # Two-level layout: base fields have an empty second level, each size
    # expands into its three sub-columns.
    final_columns = [(field, "") for field in base_fields]
    final_columns += [(size, sub) for size in sizes for sub in sub_fields]
    final_data = pd.DataFrame(columns=pd.MultiIndex.from_tuples(final_columns))

    # DataFrame.get returns None for a column that was not extracted, which
    # leaves the corresponding output column empty.
    for field in base_fields:
        final_data[(field, "")] = combined_df.get(field)

    for size in sizes:
        # Every size is populated from its extracted column; M, L and XL
        # were previously always left empty even when data was available.
        final_data[(size, "Increment")] = combined_df.get(size)
        # Placeholders: nothing in the extracted data feeds these yet.
        final_data[(size, "Sample")] = None
        final_data[(size, "Deviation")] = None

    # Flatten columns and save to Excel
    flatten_and_save(final_data, output_file)

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


# Output:
# Final combined and flattened output saved to final_combined_output_flat.xlsx
