# In [2]:
import pdfplumber
import pandas as pd
from pymongo import MongoClient

# Shared MongoDB handles, created once when this module is loaded.
client = MongoClient("mongodb://localhost:27017/")  # Update with your MongoDB URI
db = client.get_database("pdf_data")
collection = db.get_collection("dimensions")

def _clean_headers(headers):
    """Turn a raw header row into unique, non-empty string keys."""
    cleaned = []
    seen = set()
    for i, header in enumerate(headers):
        # None, "" and whitespace-only cells all fall back to a positional
        # placeholder so that no document ends up with an empty key.
        name = str(header).strip() if header else ""
        if not name:
            name = f"Column_{i}"

        # dict(zip(...)) keeps only the last value for a repeated key, so
        # repeated header names get a numeric suffix instead of silently
        # dropping a column.
        key = name
        suffix = 2
        while key in seen:
            key = f"{name}_{suffix}"
            suffix += 1

        seen.add(key)
        cleaned.append(key)
    return cleaned


def extract_and_store_in_mongo(pdf_path):
    """Extract every table in a PDF and store its rows in MongoDB.

    The first row of each table is treated as the header row. Every later
    row whose cell count matches the header count becomes one document in
    the module-level ``collection``; rows with a different cell count are
    skipped.

    Args:
        pdf_path: PDF file to read; passed straight to ``pdfplumber.open``.

    Returns:
        The number of documents inserted.
    """
    inserted = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    continue

                keys = _clean_headers(table[0])
                documents = [
                    dict(zip(keys, row))
                    for row in table[1:]
                    if len(row) == len(keys)
                ]

                # insert_many rejects an empty list, and one batched call per
                # table avoids a server round trip per row.
                if documents:
                    collection.insert_many(documents)
                    inserted += len(documents)
    return inserted


def process_data_from_mongo(columns=("M", "L", "XL")):
    """Load selected fields of the stored rows into a DataFrame.

    Args:
        columns: Field names to fetch from every document in the module-level
            ``collection``. Defaults to "M", "L" and "XL". A single name may
            be given as a plain string. If empty, the projection only
            excludes ``_id``, so MongoDB returns every other field.

    Returns:
        A pandas DataFrame built from the matching documents, one row per
        document, without the ``_id`` field.
    """
    # Iterating a bare string would yield its characters, not the field name.
    if isinstance(columns, str):
        columns = (columns,)

    projection = {name: 1 for name in columns}
    projection["_id"] = 0

    return pd.DataFrame(list(collection.find({}, projection)))

def save_to_excel(df, output_file):
    """Write *df* to an Excel file and report the destination on stdout.

    Args:
        df: Object exposing ``to_excel`` (a pandas DataFrame in this script).
        output_file: Target handed to ``to_excel``; the index is not written.
    """
    df.to_excel(output_file, index=False)
    confirmation = f"Data successfully saved to {output_file}"
    print(confirmation)

def main(
    pdf_path=r"E:\git\Automated-Email-Parsing-and-Document-Generation\input_doc.pdf",
    output_file="mongo_processed_output.xlsx",
):
    """Run the whole pipeline: PDF tables -> MongoDB -> Excel workbook.

    Args:
        pdf_path: PDF to read tables from. The default is a machine-specific
            path; pass your own location when calling.
        output_file: Destination of the Excel export.

    NOTE: every call stores the PDF's rows in the collection again; nothing
    in this function removes rows left by earlier runs.
    """
    extract_and_store_in_mongo(pdf_path)

    # Read the stored rows back as a DataFrame.
    processed_df = process_data_from_mongo()

    # Export the DataFrame to Excel.
    save_to_excel(processed_df, output_file)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


# Output: Data successfully saved to mongo_processed_output.xlsx
