In [1]:
import pandas as pd
from PyPDF2 import PdfReader
from pymongo import MongoClient
import pdfplumber
import re

In [2]:
# Function to extract dimensions and sizes dynamically from the PDF
def extract_dimensions_with_sizes(pdf_path):
    """Collect table rows from every page of a PDF as header-keyed dicts.

    The header row is taken from the first table whose first row is
    non-empty and is then reused as the key set for every later table.
    The first row of each table is always skipped, and a remaining row is
    kept only when it has exactly as many cells as the header row.

    Args:
        pdf_path: Path passed to ``pdfplumber.open``.

    Returns:
        A ``(dimensions, headers)`` tuple: the list of row dicts and the
        header row used as their keys (``[]`` when no table was found).
    """
    headers = []
    dimensions = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            for table in page.extract_tables():
                if len(table) == 0:
                    continue
                # Only the first header row seen is kept; later tables reuse it.
                if not headers:
                    headers = table[0]
                dimensions.extend(
                    dict(zip(headers, cells))
                    for cells in table[1:]
                    if len(cells) == len(headers)
                )
    return dimensions, headers

# Function to process and structure extracted data
def process_dimensions(dimensions, headers):
    """Build a DataFrame from extracted row dicts, tidying the column names.

    Keys that are falsy or whitespace-only are discarded together with
    their values; every remaining key is stripped of surrounding whitespace.

    Args:
        dimensions: Iterable of dicts mapping header text to cell value.
        headers: Accepted for interface compatibility; not used here.

    Returns:
        pandas.DataFrame with one row per input dict.
    """
    cleaned_rows = [
        {name.strip(): cell for name, cell in entry.items() if name and name.strip()}
        for entry in dimensions
    ]
    return pd.DataFrame(cleaned_rows)

# Function to save DataFrame to MongoDB
def save_to_mongodb(df, db_name, collection_name):
    """Insert every row of a DataFrame into a MongoDB collection.

    Connects to the MongoDB server at ``mongodb://localhost:27017/``.

    Args:
        df: pandas.DataFrame; each row becomes one document.
        db_name: Name of the target database.
        collection_name: Name of the target collection.

    Returns:
        None. A status line is printed in both the empty and non-empty case.
    """
    records = df.to_dict(orient='records')
    if not records:
        # pymongo's insert_many rejects an empty document list, so an empty
        # frame (e.g. no table rows were extracted) is reported, not inserted.
        print(f"No records to save to MongoDB database '{db_name}', collection '{collection_name}'")
        return

    client = MongoClient("mongodb://localhost:27017/")
    try:
        client[db_name][collection_name].insert_many(records)
    finally:
        # Release the client's connections even when the insert fails.
        client.close()
    print(f"Data saved to MongoDB database '{db_name}', collection '{collection_name}'")

# Function to save DataFrame to Excel with formatting
def save_to_excel_formatted(df, output_file):
    """Write a DataFrame to an Excel sheet and size each column to its content.

    The frame is written without its index to sheet ``"Sheet1"`` using the
    ``xlsxwriter`` engine. Each column's width is set to the longer of its
    header text and its longest cell rendered as a string.

    Args:
        df: pandas.DataFrame to export.
        output_file: Destination passed to ``pd.ExcelWriter``.

    Returns:
        None. A confirmation line is printed after the file is written.
    """
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
        worksheet = writer.sheets["Sheet1"]

        # Set column widths for better readability
        for i, col in enumerate(df.columns):
            # str() so non-string column labels can be measured as well.
            header_width = len(str(col))
            cell_lengths = df[col].astype(str).map(len)
            # A column with no rows has no maximum (pandas yields NaN, which
            # is not a valid width); fall back to the header width alone.
            content_width = int(cell_lengths.max()) if len(cell_lengths) else 0
            worksheet.set_column(i, i, max(content_width, header_width))

    print(f"Formatted data successfully saved to {output_file}")

# Main execution
def main(
    pdf_path="input_doc.pdf",
    output_excel_path="output_formatted.xlsx",
    db_name="PDFData_sample",
    collection_name="DynamicDimensionsNew",
):
    """Run the pipeline: PDF tables -> DataFrame -> MongoDB and Excel.

    Args:
        pdf_path: PDF to read. Defaults to a path relative to the working
            directory rather than a machine-specific absolute path, so the
            cell runs on any checkout.
        output_excel_path: Excel file to write.
        db_name: MongoDB database that receives the rows.
        collection_name: MongoDB collection that receives the rows.
    """
    # Extract dimensions dynamically
    dimensions, headers = extract_dimensions_with_sizes(pdf_path)

    # Process and structure dimensions
    dataframe = process_dimensions(dimensions, headers)

    # Save to MongoDB
    save_to_mongodb(dataframe, db_name, collection_name)

    # Save to Excel with formatting
    save_to_excel_formatted(dataframe, output_excel_path)

if __name__ == "__main__":
    main()


Data saved to MongoDB database 'PDFData_sample', collection 'DynamicDimensionsNew'
Formatted data successfully saved to output_formatted.xlsx


In [None]:
#__________________________VERSION 2_______________________

In [25]:
import pdfplumber
import pandas as pd
from pymongo import MongoClient

# Function to extract dimensions and sizes dynamically from the PDF
def extract_dimensions_with_sizes(pdf_path):
    """Read every table in a PDF and return its rows keyed by header text.

    The header row comes from the first table whose first row is non-empty;
    all later tables are keyed by that same row. Each table's first row is
    skipped, and any other row is dropped unless its cell count equals the
    header count.

    Args:
        pdf_path: Path passed to ``pdfplumber.open``.

    Returns:
        ``(dimensions, headers)``: the list of row dicts and the header row
        used to key them (``[]`` when the PDF yields no table).
    """
    headers = []
    dimensions = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            for table in page.extract_tables():
                if len(table) == 0:
                    continue
                # Headers are captured a single time and shared afterwards.
                if not headers:
                    headers = table[0]
                dimensions.extend(
                    dict(zip(headers, cells))
                    for cells in table[1:]
                    if len(cells) == len(headers)
                )
    return dimensions, headers

# Function to process and structure extracted data
def process_dimensions(dimensions, headers):
    """Turn extracted row dicts into a DataFrame with cleaned column names.

    Entries whose key is falsy or whitespace-only are dropped; the keys that
    remain are stripped of leading and trailing whitespace.

    Args:
        dimensions: Iterable of dicts mapping header text to cell value.
        headers: Unused; kept so existing callers can keep passing it.

    Returns:
        pandas.DataFrame with one row per input dict.
    """
    def _clean(entry):
        # Keep only entries that have a usable (non-blank) header.
        return {name.strip(): cell for name, cell in entry.items() if name and name.strip()}

    return pd.DataFrame([_clean(entry) for entry in dimensions])

# Function to save DataFrame to MongoDB
def save_to_mongodb(df, db_name, collection_name):
    """Insert every row of a DataFrame into a MongoDB collection.

    Connects to the MongoDB server at ``mongodb://localhost:27017/``.

    Args:
        df: pandas.DataFrame; each row becomes one document.
        db_name: Name of the target database.
        collection_name: Name of the target collection.

    Returns:
        None. A status line is printed in both the empty and non-empty case.
    """
    records = df.to_dict(orient='records')
    if not records:
        # pymongo's insert_many rejects an empty document list, so an empty
        # frame (e.g. no table rows were extracted) is reported, not inserted.
        print(f"No records to save to MongoDB database '{db_name}', collection '{collection_name}'")
        return

    client = MongoClient("mongodb://localhost:27017/")
    try:
        client[db_name][collection_name].insert_many(records)
    finally:
        # Release the client's connections even when the insert fails.
        client.close()
    print(f"Data saved to MongoDB database '{db_name}', collection '{collection_name}'")

# Function to save DataFrame to Excel with formatting
def save_to_excel_formatted(df, output_file):
    """Write a DataFrame to an Excel sheet and size each column to its content.

    The frame is written without its index to sheet ``"Sheet1"`` using the
    ``xlsxwriter`` engine. Each column's width is set to the longer of its
    header text and its longest cell rendered as a string.

    Args:
        df: pandas.DataFrame to export.
        output_file: Destination passed to ``pd.ExcelWriter``.

    Returns:
        None. A confirmation line is printed after the file is written.
    """
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
        worksheet = writer.sheets["Sheet1"]

        # Set column widths for better readability
        for i, col in enumerate(df.columns):
            # str() so non-string column labels can be measured as well.
            header_width = len(str(col))
            cell_lengths = df[col].astype(str).map(len)
            # A column with no rows has no maximum (pandas yields NaN, which
            # is not a valid width); fall back to the header width alone.
            content_width = int(cell_lengths.max()) if len(cell_lengths) else 0
            worksheet.set_column(i, i, max(content_width, header_width))

    print(f"Formatted data successfully saved to {output_file}")

# Main execution
def main():
    """Run the pipeline: PDF tables -> DataFrame -> MongoDB and Excel."""
    source_pdf = "input_doc.pdf"  # Replace with your PDF file path
    excel_target = "output_formatted.xlsx"  # Replace with your desired output Excel file path
    mongo_db, mongo_collection = "newPDFData", "DynamicDimensions"

    # Extraction, then clean-up into a DataFrame.
    rows, header_row = extract_dimensions_with_sizes(source_pdf)
    table_df = process_dimensions(rows, header_row)

    # Persist to MongoDB first, then to the formatted workbook.
    save_to_mongodb(table_df, mongo_db, mongo_collection)
    save_to_excel_formatted(table_df, excel_target)

if __name__ == "__main__":
    main()


Data saved to MongoDB database 'PDFData', collection 'FinalDimensions'
Formatted data successfully saved to outputFinal.xlsx


In [27]:
#--------------------------------------------VERSION 3------------------------

In [29]:
import pdfplumber
import pandas as pd
from pymongo import MongoClient

# Function to extract dimensions and sizes dynamically from the PDF
def extract_dimensions_with_sizes(pdf_path):
    """Collect PDF table rows as header-keyed dicts, logging what is skipped.

    The header row is taken from the first table whose first row is
    non-empty, printed once, and reused for every later table. Each table's
    first row is skipped. Any other row whose cell count differs from the
    header count is printed as a mismatch and left out of the result.

    Args:
        pdf_path: Path passed to ``pdfplumber.open``.

    Returns:
        ``(dimensions, headers)``: the list of row dicts and the header row
        used to key them (``[]`` when the PDF yields no table).
    """
    headers = []
    dimensions = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            for table in page.extract_tables():
                if len(table) == 0:
                    continue
                if not headers:
                    headers = table[0]
                    print(f"Extracted Headers: {headers}")  # Debugging headers
                for cells in table[1:]:
                    if len(cells) != len(headers):
                        print(f"Row mismatch with headers: {cells}")  # Debugging row mismatches
                        continue
                    dimensions.append(dict(zip(headers, cells)))
    return dimensions, headers

# Function to process and structure extracted data
def process_dimensions(dimensions, headers):
    """Build a DataFrame restricted to the fixed set of required columns.

    Header text is whitespace-normalized before use: every run of
    whitespace, including a line break inside a wrapped header cell such as
    ``"Tol\\n(-)"``, collapses to one space. Without this the wrapped
    headers never match ``"Tol (-)"`` / ``"Tol (+)"`` and those columns
    would always be filled with ``"N/A"``.

    Args:
        dimensions: Iterable of dicts mapping header text to cell value.
        headers: Accepted for interface compatibility; not used here.

    Returns:
        pandas.DataFrame whose columns are exactly the required columns, in
        order. A required column that is absent from a record holds
        ``"N/A"``; columns outside the required set are not included.
    """
    required_columns = ["Dim", "Description", "Tol (-)", "Tol (+)", "XS", "S", "M", "L", "XL"]

    processed_data = []
    for record in dimensions:
        processed_record = {}
        for header, value in record.items():
            # Falsy and whitespace-only headers normalize to "" and are dropped.
            normalized = " ".join(header.split()) if header else ""
            if normalized:
                processed_record[normalized] = value

        # Add missing columns with default values
        for col in required_columns:
            processed_record.setdefault(col, "N/A")

        processed_data.append(processed_record)

    # Convert to DataFrame
    return pd.DataFrame(processed_data, columns=required_columns)

# Function to save DataFrame to MongoDB
def save_to_mongodb(df, db_name, collection_name):
    """Insert every row of a DataFrame into a MongoDB collection.

    Connects to the MongoDB server at ``mongodb://localhost:27017/``.

    Args:
        df: pandas.DataFrame; each row becomes one document.
        db_name: Name of the target database.
        collection_name: Name of the target collection.

    Returns:
        None. A status line is printed in both the empty and non-empty case.
    """
    records = df.to_dict(orient='records')
    if not records:
        # pymongo's insert_many rejects an empty document list, so an empty
        # frame (e.g. every extracted row mismatched the headers) is
        # reported rather than inserted.
        print(f"No records to save to MongoDB database '{db_name}', collection '{collection_name}'")
        return

    client = MongoClient("mongodb://localhost:27017/")
    try:
        client[db_name][collection_name].insert_many(records)
    finally:
        # Release the client's connections even when the insert fails.
        client.close()
    print(f"Data saved to MongoDB database '{db_name}', collection '{collection_name}'")

# Function to save DataFrame to Excel with formatting
def save_to_excel_formatted(df, output_file):
    """Write a DataFrame to an Excel sheet and size each column to its content.

    The frame is written without its index to sheet ``"Sheet1"`` using the
    ``xlsxwriter`` engine. Each column's width is set to the longer of its
    header text and its longest cell rendered as a string.

    Args:
        df: pandas.DataFrame to export.
        output_file: Destination passed to ``pd.ExcelWriter``.

    Returns:
        None. A confirmation line is printed after the file is written.
    """
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
        worksheet = writer.sheets["Sheet1"]

        # Set column widths for better readability
        for i, col in enumerate(df.columns):
            # str() so non-string column labels can be measured as well.
            header_width = len(str(col))
            cell_lengths = df[col].astype(str).map(len)
            # A column with no rows has no maximum (pandas yields NaN, which
            # is not a valid width); fall back to the header width alone.
            content_width = int(cell_lengths.max()) if len(cell_lengths) else 0
            worksheet.set_column(i, i, max(content_width, header_width))

    print(f"Formatted data successfully saved to {output_file}")

# Main execution
def main():
    """Run the pipeline: PDF tables -> required-column DataFrame -> MongoDB and Excel."""
    source_pdf = "input_doc.pdf"  # Replace with your PDF file path
    excel_target = "output_formatted.xlsx"  # Replace with your desired output Excel file path
    mongo_db, mongo_collection = "newPDFData", "DynamicDimensions"

    # Extraction, then shaping into the fixed column layout.
    rows, header_row = extract_dimensions_with_sizes(source_pdf)
    table_df = process_dimensions(rows, header_row)

    # Persist to MongoDB first, then to the formatted workbook.
    save_to_mongodb(table_df, mongo_db, mongo_collection)
    save_to_excel_formatted(table_df, excel_target)

if __name__ == "__main__":
    main()


Extracted Headers: ['Dim', 'Description', 'Comment', 'Tol\n(-)', 'Tol\n(+)', 'XS', None, None, 'S', None, None]
Row mismatch with headers: ['Incremen\nt', 'Sampl\ne', 'Deviatio\nn', 'Incremen\nt', 'Sampl\ne', 'Deviatio\nn', 'Incremen\nt', 'Sampl\ne', 'Deviatio\nn']
Row mismatch with headers: ['54.00', '', '', '58.00', '', '', '64.00', '', '']
Row mismatch with headers: ['40.00', '', '', '42.00', '', '', '44.00', '', '']
Row mismatch with headers: ['2.50', '', '', '2.50', '', '', '2.50', '', '']
Row mismatch with headers: ['41.00', '', '', '43.00', '', '', '45.00', '', '']
Row mismatch with headers: ['22.80', '', '', '23.60', '', '', '24.40', '', '']
Row mismatch with headers: ['19.25', '', '', '20.50', '', '', '21.75', '', '']
Row mismatch with headers: ['17.25', '', '', '18.50', '', '', '20.50', '', '']
Row mismatch with headers: ['17.80', '', '', '18.60', '', '', '19.40', '', '']
Row mismatch with headers: ['Displaying 1-9 results', None, None, None, None, None, None, None, None]
Row

In [36]:
import pdfplumber
import pandas as pd
from pymongo import MongoClient

# Module-level MongoDB handles, created once when this cell runs.
client = MongoClient("mongodb://localhost:27017/")  # Update with your MongoDB URI

# Database "pdf_data" and its "dimensions" collection. The functions below in
# this cell use `collection` as a global: extract_and_store_in_mongo writes to
# it and process_data_from_mongo reads every document back from it.
db = client["pdf_data"]
collection = db["dimensions"]

def extract_and_store_in_mongo(pdf_path):
    """Store every well-formed table row of a PDF in the module-level collection.

    Each table's first row is treated as its header row. A header cell that
    is empty, ``None`` or whitespace-only is replaced by ``Column_<index>``
    so that no document ends up with an empty field name. Rows whose cell
    count differs from the header count are skipped. The rows of one table
    are written with a single ``insert_many`` call.

    Args:
        pdf_path: Path passed to ``pdfplumber.open``.

    Returns:
        None. Documents are written to the global ``collection``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    continue

                cleaned_headers = []
                for i, header in enumerate(table[0]):  # First row is assumed to be headers
                    text = str(header).strip() if header else ""
                    # Fall back to a positional name when nothing usable is left.
                    cleaned_headers.append(text or f"Column_{i}")

                documents = [
                    dict(zip(cleaned_headers, row))
                    for row in table[1:]
                    if len(row) == len(cleaned_headers)
                ]
                # insert_many rejects an empty list, so header-only tables are skipped.
                if documents:
                    collection.insert_many(documents)

def process_data_from_mongo():
    """Load all documents from the global collection into a trimmed DataFrame.

    For every document the ``_id`` field is dropped, as is any field whose
    value is ``None``, ``""`` or ``"N/A"``. Columns that are entirely
    missing are removed. When the frame has more than 10 rows it is cut down
    to positional rows 1 through 9; otherwise it is returned unchanged.

    Returns:
        pandas.DataFrame built from the cleaned documents.
    """
    ignored_values = (None, "", "N/A")
    cleaned_docs = [
        {
            field: content
            for field, content in document.items()
            if field != "_id" and content not in ignored_values
        }
        for document in collection.find()
    ]

    frame = pd.DataFrame(cleaned_docs).dropna(axis=1, how="all")

    # Only frames longer than 10 rows are sliced; shorter ones pass through.
    return frame.iloc[1:10] if len(frame) > 10 else frame

def save_to_excel(df, output_file):
    """Write ``df`` to ``output_file`` as an Excel workbook, without the index.

    Args:
        df: pandas.DataFrame to export.
        output_file: Destination passed to ``DataFrame.to_excel``.

    Returns:
        None. A confirmation line is printed after the write.
    """
    df.to_excel(output_file, index=False)
    confirmation = f"Data successfully saved to {output_file}"
    print(confirmation)

def main():
    """Load PDF table rows into MongoDB, read them back cleaned, and export to Excel."""
    # Stage 1: push the PDF's table rows into the collection.
    extract_and_store_in_mongo("input_doc.pdf")  # Replace with your PDF path

    # Stage 2: pull the rows back out as a cleaned DataFrame and write the workbook.
    cleaned_df = process_data_from_mongo()
    save_to_excel(cleaned_df, "mongo_output_cleaned.xlsx")

if __name__ == "__main__":
    main()


Data successfully saved to mongo_output_cleaned.xlsx
