In [9]:
import os
import logging
import fitz  # PyMuPDF
import concurrent.futures
from pymongo import MongoClient
import os 

> **Parsing**

In [10]:
# pdf parsing function

logging.basicConfig(filename='pdf_pipeline.log', level=logging.ERROR)

def par_pdf(file_path):
    try:
        doc = fitz.open(file_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        logging.error("Error processing {file_path} : " + str(e))
        return None
    
# import pdfs from a folder
def import_pdfs(folder_path):
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".pdf")]
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(par_pdf, pdf_files)
    
    return list(results)

> **Store Metadata**

In [11]:
#mongoDB setup on localhost
client = MongoClient("mongodb://localhost:27017/")
db = client['pdf_summarization']
collection = db['pdf_documents']

# function to store metadata
def store(file_path, text):
    try:
        metadata = {
            "file_name": os.path.basename(file_path),
            "file_path": file_path,
            "size": os.path.getsize(file_path),
            "summary": None,
            "keywords": None
        }
        collection.insert_one(metadata)
        print("metadata stored for" + str({metadata ['file_name']}))
    except Exception as e:
        print(" error storing metadata" + str(e))

# funtion to update summary and keywords function
def update(file_name,summary, keywords):
    try:
        collection.update_one(
            {"file_name": file_name},
            {"$set": {"summary":summary ,"keywords": keywords}}
        )
    except Exception as e:
        print("error updating metadata: " + str(e))    

In [12]:
import spacy
from collections import Counter
import PyPDF2
from collections import defaultdict

def summarize_text(text, num_sentences=3):
    # Process the text with spaCy
    doc = nlp(text)

    # Create a dictionary to hold sentence scores
    sentence_scores = defaultdict(int)

    # Score each sentence based on noun and adjective frequencies
    for sent in doc.sents:
        for token in sent:
            if token.pos_ in ["NOUN", "PROPN", "ADJ"] and not token.is_stop:
                sentence_scores[sent.text] += 1  # Score sentences based on important words

    # Select the top N sentences as the summary
    summarized_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    
    return ' '.join(summarized_sentences)

# Loading the English NLP model
nlp = spacy.load("en_core_web_sm")

def extract_keywords(text):
    doc = nlp(text.lower())
    keywords = [token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "ADJ"] and not token.is_stop]
    return [keyword for keyword, _ in Counter(keywords).most_common(10)]

> **Use**

In [13]:
def process_pdf(file_path):
    text = par_pdf(file_path)
    if text:
        summary = summarize_text(text)
        keywords = extract_keywords(text)
        # Store 
        store(file_path, text)
        # Update 
        update(os.path.basename(file_path),summary, keywords)

def main(folder_path):
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".pdf")]
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(process_pdf, pdf_files)

# Example run
main("pdfs")

metadata stored for{'Biological_pretreatment_of_lignocellulosic_biomass_An_environment.pdf'}
metadata stored for{'Persons with partial work ability at work - PDF Room.pdf'}
metadata stored for{'Joy at Work Work at Joy_ Living and Working Mindfully Every Day - PDF Room.pdf'}
metadata stored for{'Don’t Sweat the Small Stuff at Work - PDF Room.pdf'}
