In [2]:
import pandas as pd
from PyPDF2 import PdfReader
from pymongo import MongoClient
import pdfplumber
import re

In [3]:
# Function to extract dimensions and sizes dynamically from the PDF
def extract_dimensions_with_sizes(pdf_path):
    dimensions = []
    headers = []  # To store the main and sub-headers dynamically
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                if len(table) > 0:
                    # Extract the headers (first row) and data rows
                    current_headers = table[0]
                    if not headers:  # Capture headers only once
                        headers = current_headers
                    for row in table[1:]:
                        if len(row) == len(headers):  # Ensure valid row
                            row_data = dict(zip(headers, row))
                            dimensions.append(row_data)
    return dimensions, headers

# Function to process and structure extracted data
def process_dimensions(dimensions, headers):
    processed_data = []

    for record in dimensions:
        processed_record = {}
        for header, value in record.items():
            if header and header.strip():
                processed_record[header.strip()] = value
        processed_data.append(processed_record)

    # Convert to DataFrame
    df = pd.DataFrame(processed_data)
    return df

# Function to save DataFrame to MongoDB
def save_to_mongodb(df, db_name, collection_name):
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    collection = db[collection_name]

    # Convert DataFrame to dictionary records and insert into MongoDB
    records = df.to_dict(orient='records')
    collection.insert_many(records)
    print(f"Data saved to MongoDB database '{db_name}', collection '{collection_name}'")

# Function to save DataFrame to Excel with formatting
def save_to_excel_formatted(df, output_file):
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
        worksheet = writer.sheets["Sheet1"]

        # Set column widths for better readability
        for i, col in enumerate(df.columns):
            column_width = max(df[col].astype(str).map(len).max(), len(col))
            worksheet.set_column(i, i, column_width)

    print(f"Formatted data successfully saved to {output_file}")

# Main execution
def main():
    pdf_path = r"E:\git\Automated-Email-Parsing-and-Document-Generation\input_doc.pdf"  # Replace with your PDF file path
    output_excel_path = "output_formatted.xlsx"  # Replace with your desired output Excel file path
    db_name = "PDFData_sample"
    collection_name = "DynamicDimensionsNew"

    # Extract dimensions dynamically
    dimensions, headers = extract_dimensions_with_sizes(pdf_path)

    # Process and structure dimensions
    dataframe = process_dimensions(dimensions, headers)

    # Save to MongoDB
    save_to_mongodb(dataframe, db_name, collection_name)

    # Save to Excel with formatting
    save_to_excel_formatted(dataframe, output_excel_path)

if __name__ == "__main__":
    main()


Data saved to MongoDB database 'PDFData_sample', collection 'DynamicDimensionsNew'
Formatted data successfully saved to output_formatted.xlsx


In [12]:
import pdfplumber
import pandas as pd
from pymongo import MongoClient
import random

client = MongoClient("mongodb://localhost:27017/")  # Update with your MongoDB URI

db = client["PDFData_sample"]
collection = db["spec_sheet"]

def extract_spec_sheet_data(pdf_path):
    spec_sheet_data = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            tables = page.extract_tables()
            for table in tables:
                # Identify the Spec Sheet table
                if "Spec Sheet:" in page.extract_text():
                    headers = table[0]  # First row is assumed to be headers
                    cleaned_headers = [str(header).strip() if header else f"Column_{i}" for i, header in enumerate(headers)]

                    for row in table[1:]:
                        if len(row) == len(cleaned_headers):
                            # Create a clean row dictionary with valid string keys
                            row_data = dict(zip(cleaned_headers, row))
                            spec_sheet_data.append(row_data)
    return spec_sheet_data

def save_spec_sheet_to_mongo(data):
    if data:
        collection.insert_many(data)
        print(f"{len(data)} records saved to MongoDB collection 'spec_sheet'.")
    else:
        print("No data to save to MongoDB.")

def save_to_excel(df, output_file):
    # Remove the '_id' column if it exists
    if '_id' in df.columns:
        df = df.drop(columns=['_id'])

    # Move the 'Qty' column to the end if it exists
    if 'Qty' in df.columns:
        qty_column = df.pop('Qty')
        df['Qty'] = qty_column

        # Convert 'Qty' to numeric, handling errors
        df['Qty'] = pd.to_numeric(df['Qty'], errors='coerce')

        # Add 'perrate' and 'total' columns
        df['perrate'] = [round(random.uniform(0.2, 0.8), 2) for _ in range(len(df))]  # Assign random values in range 0.2 to 0.8 with 2 decimals
        df['total'] = df['Qty'] * df['perrate']

        # Add a final row for the grand total
        grand_total = pd.DataFrame({
            'Qty': [None],
            'perrate': [None],
            'total': [df['total'].sum()]
        })
        grand_total.index = ['Grand Total']
        df = pd.concat([df, grand_total])

    df.to_excel(output_file, index=False)
    print(f"Spec Sheet data successfully saved to {output_file}")

def main():
    pdf_path = "input_doc.pdf"  # Replace with your PDF path

    # Extract Spec Sheet data
    spec_sheet_data = extract_spec_sheet_data(pdf_path)

    # Save to MongoDB
    save_spec_sheet_to_mongo(spec_sheet_data)

    # Save to Excel
    if spec_sheet_data:
        df = pd.DataFrame(spec_sheet_data)
        output_file = "spec_sheet_output.xlsx"
        save_to_excel(df, output_file)

if __name__ == "__main__":
    main()

8 records saved to MongoDB collection 'spec_sheet'.
Spec Sheet data successfully saved to spec_sheet_output.xlsx


  df = pd.concat([df, grand_total])
