## Internship Task: Domain-Specific PDF Summarization & Keyword Extraction Pipeline

### 1. PDF Ingestion & Parsing

In [2]:
# First import the necessary libraries
import os # for file management
import fitz # fitz or PyMuPDF for PDF parsing
import concurrent.futures # for concurrency
import logging # for error handling and logging

In [6]:
# Configure logging for the pipeline: records at INFO level and above are
# written to a log file, each prefixed with a timestamp and its level name.
logging.basicConfig(
    filename='pdf_pipline.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def get_pdf_info(pdf_path):
    """Extract basic metadata from a PDF file.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        A dict with the keys ``document_name`` (file name without its
        directory), ``path``, ``size`` (bytes on disk) and ``num_pages``,
        or ``None`` if the file could not be read (the error is logged).
    """
    try:
        # The context manager closes the document even when reading fails,
        # so file handles are not leaked while many PDFs are processed.
        with fitz.open(pdf_path) as doc:
            num_pages = doc.page_count
        return {
            'document_name': os.path.basename(pdf_path),
            'path': pdf_path,
            'size': os.path.getsize(pdf_path),
            'num_pages': num_pages,
        }
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        logging.error(f"Error reading {pdf_path}: {e}")
        return None


def process_pdf(pdf_path):
    """Process a single PDF document.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        The metadata dict produced by ``get_pdf_info``, or ``None`` when the
        metadata could not be extracted.
    """
    metadata = get_pdf_info(pdf_path)
    if metadata:
        # Further per-document steps (e.g. summarization, keyword extraction)
        # can be added here.
        return metadata
    return None

def process_pdfs_from_folder(folder_path):
    """Process all PDF documents in the specified folder concurrently.

    Args:
        folder_path: Directory whose direct entries are scanned for PDFs.

    Returns:
        A list with one result per successfully processed PDF. Results are
        appended as tasks complete, so the order is not the directory order.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
    pdf_files = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
    ]
    results = []

    # A thread pool lets the per-file work of several PDFs overlap.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_pdf = {executor.submit(process_pdf, pdf): pdf for pdf in pdf_files}
        for future in concurrent.futures.as_completed(future_to_pdf):
            pdf_path = future_to_pdf[future]
            try:
                result = future.result()
            except Exception as e:
                # A failure in one document is logged; the others continue.
                logging.error(f"Error processing {pdf_path}: {e}")
            else:
                if result:
                    results.append(result)

    return results

# Main function to trigger the pipeline
def main():
    """Run the PDF pipeline on the folder named in the dataset JSON file.

    The JSON file is expected to hold the PDF folder under the ``path_file``
    key. If the key is missing or empty, an error is logged and nothing is
    processed.
    """
    # Local import: this function is defined above the file's `import json`.
    import json

    json_location = "C:\\Users\\Subhash Gupta\\OneDrive\\Desktop\\Dataset.json"
    with open(json_location, "r") as f:
        file_location_dict = json.load(f)

    folder_path = file_location_dict.get("path_file")
    if not folder_path:
        logging.error("No 'path_file' key found in JSON.")
        return

    process_pdfs_from_folder(folder_path)

### 2. MongoDB Dataset Storage & JSON Updates

In [10]:
from pymongo import MongoClient

# MongoDB handles shared by the storage helpers below: a client for the
# local server, the `pdf_database` database and its `pdf_metadata` collection.
client = MongoClient('mongodb://localhost:27017/')
db = client.pdf_database
collection = db.pdf_metadata

def store_metadata(metadata):
    """Store a PDF's metadata in MongoDB, tracking its basic details.

    Args:
        metadata: Mapping describing one document; a ``document_name`` entry
            is used in the log message when present.

    Returns:
        None. Failures are logged rather than raised.
    """
    if not metadata:
        # Nothing to insert (e.g. metadata extraction returned None upstream).
        logging.error("Error storing metadata in mongoDB: no metadata provided")
        return
    try:
        # insert_one() adds an `_id` field to the dict it receives; insert a
        # shallow copy so the caller's dict is left untouched and remains
        # JSON-serializable.
        collection.insert_one(dict(metadata))
        logging.info(f"Stored metadata for {metadata.get('document_name')}")
    except Exception as e:
        logging.error(f"Error storing metadata in mongoDB: {e}")

def update_metadata_with_summary(document_name, summary, keywords):
    """Update the MongoDB entry for a document with its summary and keywords.

    Args:
        document_name: Value of the ``document_name`` field to match.
        summary: Summary text to store under ``summary``.
        keywords: Keywords to store under ``keywords``.

    Returns:
        None. Failures are logged rather than raised.
    """
    try:
        result = collection.update_one(
            {'document_name': document_name},
            {'$set': {'summary': summary, 'keywords': keywords}}
        )
        # update_one() succeeds even when the filter matches nothing; report
        # that case instead of logging a misleading success.
        if result.matched_count == 0:
            logging.warning(f"No MongoDB entry found for {document_name}; nothing was updated.")
            return
        logging.info(f"Update {document_name} with summary and keywords.")
    except Exception as e:
        logging.error(f"Error updating {document_name} in MongoDB: {e}")

### 3. Summarization & Keyword Extraction

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

def summarize_document(text, num_sentences=2):
    """Generate a simple placeholder summary of the document.

    This is a stand-in for real summarization logic: it returns the first
    ``num_sentences`` whitespace-separated tokens of ``text`` joined by
    single spaces.

    Args:
        text: Full document text.
        num_sentences: Number of leading tokens to keep; values below zero
            are treated as zero.

    Returns:
        The placeholder summary string (empty for empty input).
    """
    # Clamp so a negative count cannot slice from the end of the token list.
    limit = max(num_sentences, 0)
    return ' '.join(text.split()[:limit])
def extract_keywords(text, num_keywords=5):
    """Extract keywords from the document using a TF-IDF vocabulary.

    Args:
        text: Full document text.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A list of at most ``num_keywords`` terms, in the vectorizer's feature
        order. An empty list is returned when the text is empty or
        whitespace-only, when ``num_keywords`` is not positive, or when no
        term survives stop-word removal.
    """
    if not text or not text.strip() or num_keywords <= 0:
        return []

    # max_features limits the vocabulary to the most frequent terms.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=num_keywords)
    try:
        vectorizer.fit([text])
    except ValueError:
        # Raised when the vocabulary is empty, e.g. the text contains only
        # stop words or no tokens at all.
        return []
    return vectorizer.get_feature_names_out().tolist()

def process_document(pdf_path):
    """Fully process a PDF: text extraction, summarization, keyword extraction.

    The summary and keywords are also written to the document's MongoDB
    entry via ``update_metadata_with_summary``.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        A ``(summary, keywords)`` tuple on success, or ``None`` if any step
        failed (the error is logged).
    """
    try:
        # The context manager closes the document once its text is read.
        with fitz.open(pdf_path) as doc:
            text = ''.join(page.get_text() for page in doc)

        summary = summarize_document(text, num_sentences=3)
        keywords = extract_keywords(text, num_keywords=5)

        # Update MongoDB with summary and keywords
        document_name = os.path.basename(pdf_path)
        update_metadata_with_summary(document_name, summary, keywords)

        return summary, keywords
    except Exception as e:
        logging.error(f"Error processing document {pdf_path}: {e}")
        return None

### 4. JSON Structure & MongoDB Updates

In [18]:
import json

def format_json(metadata, summary, keywords):
    """Serialize a document's metadata, summary and keywords to a JSON string.

    The resulting JSON object has the keys ``metadata``, ``summary`` and
    ``keywords``, in that order.
    """
    return json.dumps(dict(metadata=metadata, summary=summary, keywords=keywords))

def srore_json_output(pdf_path):
    """Build the JSON output for a PDF and store it in MongoDB.

    The JSON string combines the document's metadata, summary and keywords
    and is written to the ``json_output`` field of the entry whose
    ``document_name`` matches the file.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        None. Failures are logged rather than raised.
    """
    try:
        metadata = get_pdf_info(pdf_path)
        if not metadata:
            logging.error(f"Error storing JSON output for {pdf_path}: metadata unavailable")
            return

        processed = process_document(pdf_path)
        if not processed:
            logging.error(f"Error storing JSON output for {pdf_path}: summary and keywords unavailable")
            return
        summary, keywords = processed

        json_output = format_json(metadata, summary, keywords)

        # Now update MongoDB
        collection.update_one(
            {'document_name': metadata['document_name']},
            {'$set': {'json_output': json_output}}
        )
    except Exception as e:
        logging.error(f"Error storing JSON output for {pdf_path}: {e}")


# Correctly spelled alias; the original (misspelled) name stays available
# for existing callers.
store_json_output = srore_json_output

### 5. Concurrency & Performance

In [21]:
import time

def measure_performance(json_file_path):
    """Measure how long it takes to process every PDF in a folder.

    Args:
        json_file_path: Path to a JSON file whose ``path_file`` entry names
            the folder containing the PDFs.

    Returns:
        None. The number of processed PDFs and the elapsed time are logged;
        if ``path_file`` is missing or empty, an error is logged instead.
    """
    # Load the JSON file to get the folder path
    with open(json_file_path, "r") as f:
        data = json.load(f)

    folder_path = data.get("path_file")
    if not folder_path:
        logging.error("No 'path_file' key found in JSON.")
        return

    # perf_counter() is monotonic, so the measured interval is not affected
    # by system clock adjustments.
    start_time = time.perf_counter()

    # Process the PDFs and collect their metadata
    pdf_metadata_list = process_pdfs_from_folder(folder_path)

    total_time = time.perf_counter() - start_time

    logging.info(f"Processed {len(pdf_metadata_list)} PDFs in {total_time:.2f} seconds.")

# Location of the JSON file whose "path_file" entry names the PDF folder;
# the timing run is started immediately with it.
json_file_path = r"C:\Users\Subhash Gupta\OneDrive\Desktop\Dataset.json"
measure_performance(json_file_path=json_file_path)