In [1]:
import os
import PyPDF2
import concurrent.futures
from pymongo import MongoClient

# Connect to the local MongoDB server, then select the database and the
# collection that the PDF metadata documents are written to.
client = MongoClient(host='mongodb://localhost:27017/')
db = client.get_database('pdf_db')
collection = db.get_collection('pdf_metadata')

# Function to extract metadata from a PDF
def extract_metadata(pdf_path):
    """Extract size, page count and text from one PDF and store them in MongoDB.

    Args:
        pdf_path: Filesystem path of the PDF to process.

    Returns:
        The metadata dict handed to ``collection.insert_one`` on success, or
        ``None`` if any step failed. Failures are printed, never raised, so
        one bad file cannot abort a batch run.

    NOTE(review): pymongo documents that ``insert_one`` adds an ``_id`` key to
    the inserted dict in place, so the returned dict presumably carries it --
    confirm if callers serialize the result.
    """
    pdf_name = os.path.basename(pdf_path)

    try:
        # Kept inside the try so a missing or unreadable file is reported
        # like every other failure instead of raising out of the worker.
        size = os.path.getsize(pdf_path)

        # PDF parsing
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            pages = reader.pages
            num_pages = len(pages)
            # join() avoids quadratic string growth on large documents;
            # "or ''" guards against a page that yields no text (None).
            text = "".join(page.extract_text() or "" for page in pages)

        # Store metadata in MongoDB
        metadata = {
            "document_name": pdf_name,
            "path": pdf_path,
            "size": size,
            "num_pages": num_pages,
            "text": text
        }
        collection.insert_one(metadata)

        print(f"Processed {pdf_name} with {num_pages} pages.")
        return metadata

    except Exception as e:
        # Broad on purpose: this is the per-file boundary of a batch job.
        print(f"Failed to process {pdf_name}: {e}")
        return None

# Function to process a folder of PDFs
def process_pdfs(folder_path):
    """Run ``extract_metadata`` on every PDF directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one entry per PDF found, in the order the files were
        listed; each entry is whatever ``extract_metadata`` returned for
        that file.
    """
    pdf_files = [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        # Case-insensitive match so files such as "REPORT.PDF" are not skipped.
        if entry.lower().endswith('.pdf')
    ]

    # Concurrent processing of PDFs; map() yields results in input order.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(executor.map(extract_metadata, pdf_files))

    return results




In [2]:
# Usage: process every PDF in the folder below. As the last expression in the
# cell, the list returned by process_pdfs is echoed as the cell's output.
folder_path = r"C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs"
process_pdfs(folder_path)

Processed pdf5.pdf with 1 pages.
Processed pdf12.pdf with 7 pages.
Processed pdf14.pdf with 11 pages.
Processed pdf8.pdf with 8 pages.
Processed pdf1.pdf with 11 pages.
Processed pdf9.pdf with 8 pages.
Processed pdf13.pdf with 12 pages.
Processed pdf11.pdf with 14 pages.
Processed pdf15.pdf with 13 pages.
Processed pdf7.pdf with 12 pages.
Processed pdf6.pdf with 13 pages.
Processed pdf18.pdf with 24 pages.
Processed pdf16.pdf with 41 pages.
Processed pdf17.pdf with 106 pages.
Processed pdf10.pdf with 54 pages.
Processed pdf4.pdf with 102 pages.
Processed pdf2.pdf with 131 pages.
Processed pdf3.pdf with 402 pages.


[{'document_name': 'pdf1.pdf',
  'path': 'C:\\Users\\pv437\\Desktop\\Data Scince Folder\\Pankaj Assignments\\Wasserstoff\\Deployment\\Flask\\Downloaded_pdfs\\pdf1.pdf',
  'size': 650025,
  'num_pages': 11,
  'text': '1950 \nPatntalal \nJankid-:u \nv, \nMohanlal and \nAnother, \nPata1tjali \nS11stri J. \n!950 \nDeo, 21. 1008 SUPREME COURT REPORTS [1950] \nof section 14, it seems to me, they would be bringing \nthemselves under the bar of section 18 (2). The \nrespondents cannot therefore claim that the loss of the \ngoods was explosion damage within the meaning of the \nOrdinance so as to bring the case within section 14 and \nat the same time contend that the loss was not "due \nto or did not in any way arise ont of the explosion" in \norder to a void the bar under section 18. Both sec\xad\ntion 14 and section 18 have in view the physical cause \nfor the loss or damage to property for which compen\xad\nsation is claimed and not the cause of action in rela\xad\ntion to the person agains