In [1]:
import pdfplumber
import pandas as pd

def _detect_results_section(table, page_text):
    """Return "1-9", "10-15" or None depending on which marker labels *table*.

    The marker is looked up first in the table's own cells and then in the
    text of the page the table came from. Whitespace is normalised before
    matching so that a marker wrapped across lines is still found.
    """
    cell_text = " ".join(
        " ".join(str(cell).split())
        for row in table
        for cell in (row or [])
        if cell
    )
    normalized_page_text = " ".join(page_text.split())

    # Cells take priority over page text: they are specific to this table,
    # whereas the page text is shared by every table on the page.
    for haystack in (cell_text, normalized_page_text):
        if "Displaying 1-9 results" in haystack:
            return "1-9"
        if "Displaying 10-15 results" in haystack:
            return "10-15"
    return None


def extract_dimensions_with_sizes(pdf_path):
    """Extract table rows from a PDF, split by "Displaying ... results" marker.

    Every table on every page is read with pdfplumber. The first row of the
    first non-empty table is used as the header row for the whole document.
    Each later row of a table is turned into a ``{header: cell}`` dict,
    provided it has as many cells as there are headers.

    A table is assigned to a section when the text "Displaying 1-9 results"
    or "Displaying 10-15 results" occurs in one of its cells or, failing
    that, in the text of its page. If a page's text contains both markers,
    the page-text fallback picks "1-9". Tables with no marker are skipped.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A tuple ``(dimensions_1_9, dimensions_10_15, headers)`` where the
        first two items are lists of row dicts and ``headers`` is the header
        row that was captured (an empty list if no table was found).
    """
    dimensions_1_9 = []
    dimensions_10_15 = []
    headers = []  # Header row, captured once from the first non-empty table
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may yield None for pages without text.
            page_text = page.extract_text() or ""
            for table in page.extract_tables():
                if not table:
                    continue
                if not headers:
                    headers = table[0]

                # The previous check `marker in table` compared the marker
                # string against whole rows (lists) and therefore never
                # matched; search the actual text instead.
                section = _detect_results_section(table, page_text)
                if section is None:
                    continue
                target = dimensions_1_9 if section == "1-9" else dimensions_10_15

                for row in table[1:]:
                    # Only rows that line up with the header row are kept.
                    if len(row) == len(headers):
                        target.append(dict(zip(headers, row)))
    return dimensions_1_9, dimensions_10_15, headers

def process_combined_dimensions(dimensions_1_9, dimensions_10_15, headers):
    """Clean the row dicts of both sections and merge them into one DataFrame.

    For every record, entries whose header is ``None``, empty or only
    whitespace are dropped, and the remaining headers are stripped of
    surrounding whitespace. Records of the 1-9 section come first, followed
    by those of the 10-15 section.

    Args:
        dimensions_1_9: List of ``{header: value}`` dicts for the first section.
        dimensions_10_15: List of ``{header: value}`` dicts for the second
            section.
        headers: Accepted for interface compatibility; not used here.

    Returns:
        A ``pandas.DataFrame`` with one row per input record.
    """

    def _clean(record):
        # Keep only entries with a non-blank header, keyed by the trimmed name.
        return {
            key.strip(): cell
            for key, cell in record.items()
            if key and key.strip()
        }

    cleaned_rows = [_clean(entry) for entry in dimensions_1_9]
    cleaned_rows.extend(_clean(entry) for entry in dimensions_10_15)

    return pd.DataFrame(cleaned_rows)

def save_to_excel_formatted(df, output_file):
    """Write *df* to an Excel file and size each column to fit its content.

    The data goes to a sheet named "Sheet1" without the index column. Each
    column's width is set to the length of its longest value (as text) or of
    its header, whichever is larger. A confirmation line is printed once the
    file has been written.

    Args:
        df: The ``pandas.DataFrame`` to save.
        output_file: Path of the Excel file to create.
    """
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
        worksheet = writer.sheets["Sheet1"]

        # Set column widths for better readability
        for i, col in enumerate(df.columns):
            # str() so that non-string column labels (e.g. ints) also work.
            header_width = len(str(col))
            cell_lengths = df[col].astype(str).map(len)
            # max() of an empty Series is NaN, which is not a valid width;
            # fall back to 0 so the header length decides.
            longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
            worksheet.set_column(i, i, max(longest_cell, header_width))

    print(f"Formatted data successfully saved to {output_file}")

def main():
    """Run the pipeline: read the PDF tables, combine them, write the workbook."""
    source_pdf = r"E:\git\Automated-Email-Parsing-and-Document-Generation\input_doc.pdf"  # Replace with your PDF path
    target_workbook = "final_combined_output.xlsx"

    # Pull the two result sections and the header row out of the PDF.
    first_section, second_section, table_headers = extract_dimensions_with_sizes(source_pdf)

    # Merge both sections into a single DataFrame.
    merged_frame = process_combined_dimensions(first_section, second_section, table_headers)

    # Persist the merged data as a formatted Excel file.
    save_to_excel_formatted(merged_frame, target_workbook)

# Entry-point guard: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Formatted data successfully saved to final_combined_output.xlsx
