In [1]:
import pdfplumber
import pandas as pd

def extract_dimensions_with_sizes(pdf_path):
    dimensions = []  # List to store all extracted rows
    headers = []  # To store headers dynamically
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                if len(table) > 0:
                    # Extract the headers (first row) and data rows
                    current_headers = table[0]
                    if not headers:  # Capture headers only once
                        headers = current_headers
                    for row in table[1:]:
                        if len(row) == len(headers):  # Ensure valid row
                            row_data = dict(zip(headers, row))
                            dimensions.append(row_data)
    return dimensions, headers

def process_combined_dimensions(dimensions, headers):
    processed_data = []

    # Process all extracted dimensions
    for record in dimensions:
        processed_record = {}
        for header, value in record.items():
            if header and header.strip():
                processed_record[header.strip()] = value
        processed_data.append(processed_record)

    # Convert to DataFrame
    df = pd.DataFrame(processed_data)

    # Filter only rows with size columns (e.g., "M," "L," "XL")
    size_columns = ["M", "L", "XL"]
    relevant_columns = [col for col in df.columns if col in size_columns or col not in size_columns]

    # Reorganize DataFrame to match expected output structure
    df = df[relevant_columns]

    return df

def save_to_excel_formatted(df, output_file):
    # Save the DataFrame to an Excel file with formatting
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
        worksheet = writer.sheets["Sheet1"]

        # Set column widths for better readability
        for i, col in enumerate(df.columns):
            column_width = max(df[col].astype(str).map(len).max(), len(col))
            worksheet.set_column(i, i, column_width)

    print(f"Formatted data successfully saved to {output_file}")

def main():
    pdf_path = r"E:\git\Automated-Email-Parsing-and-Document-Generation\input_doc.pdf"  # Replace with your PDF path
    dimensions, headers = extract_dimensions_with_sizes(pdf_path)

    # Combine and process dimensions dynamically
    combined_df = process_combined_dimensions(dimensions, headers)

    # Debug: Print intermediate results
    print("Extracted Headers:", headers)
    print("Extracted Dimensions (Sample):", dimensions[:5])

    # Save to Excel
    output_file = "demo3_final_combined_output.xlsx"
    save_to_excel_formatted(combined_df, output_file)

if __name__ == "__main__":
    main()


Extracted Headers: ['Dim', 'Description', 'Comment', 'Tol\n(-)', 'Tol\n(+)', 'XS', None, None, 'S', None, None]
Extracted Dimensions (Sample): [{'Dim': '1B1', 'Description': 'ChestWidth', 'Comment': '', 'Tol\n(-)': '-1.00', 'Tol\n(+)': '1.00', 'XS': 'Increm\nent', None: 'Deviation', 'S': 'Incremen\nt'}, {'Dim': '1X1', 'Description': 'BottomHem Width', 'Comment': '', 'Tol\n(-)': '-1.00', 'Tol\n(+)': '1.00', 'XS': '46.00', None: '', 'S': '50.00'}, {'Dim': '1F3', 'Description': 'AcrossShoulder', 'Comment': '', 'Tol\n(-)': '-1.00', 'Tol\n(+)': '1.00', 'XS': '36.00', None: '', 'S': '38.00'}, {'Dim': '1F6', 'Description': 'ShoulderAngle', 'Comment': '', 'Tol\n(-)': '-0.50', 'Tol\n(+)': '0.50', 'XS': '2.50', None: '', 'S': '2.50'}, {'Dim': '1G5', 'Description': 'SleeveLength (Short)', 'Comment': '', 'Tol\n(-)': '-1.00', 'Tol\n(+)': '1.00', 'XS': '37.00', None: '', 'S': '39.00'}]
Formatted data successfully saved to demo3_final_combined_output.xlsx
