In [2]:
!pip install pymongo


Collecting pymongo
  Downloading pymongo-4.10.1-cp311-cp311-win_amd64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp311-cp311-win_amd64.whl (876 kB)
   ---------------------------------------- 0.0/876.5 kB ? eta -:--:--
   ----------- ---------------------------- 245.8/876.5 kB 5.0 MB/s eta 0:00:01
   ---------------- ----------------------- 358.4/876.5 kB 4.4 MB/s eta 0:00:01
   ---------------- ----------------------- 368.6/876.5 kB 2.5 MB/s eta 0:00:01
   ------------------------- -------------- 553.0/876.5 kB 2.7 MB/s eta 0:00:01
   ------------------------------- -------- 686.1/876.5 kB 2.9 MB/s eta 0:00:01
   ----------------------------------- ---- 768.0/876.5 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------  870.4/876.5 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 876.5/876.5 kB 2.3 MB/s eta 0:00:00
Downloading dnspytho

In [3]:
import os  # For interacting with the file system
import PyPDF2  # For reading and extracting text from PDFs
from pymongo import MongoClient  # For connecting to MongoDB and storing results
import nltk  # For natural language processing
from nltk.corpus import stopwords  # To filter out common stopwords for keyword extraction
from sklearn.feature_extraction.text import TfidfVectorizer  # For keyword extraction using TF-IDF


In [4]:

# Fetch the NLTK stop-word corpus that the keyword-extraction step relies on
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Windows\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:

# Connection settings for MongoDB.
# The server is assumed to be running locally on the default port.
MONGO_URI = 'mongodb://localhost:27017/'
DB_NAME = 'pdf_db'  # Database that holds the PDF processing results
COLLECTION_NAME = 'pdf_documents'  # One record per processed document

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]


In [6]:

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF file.

    Parameters:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with a newline between pages, or None if
        the file could not be opened or parsed (the error is printed).
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)  # Parse the PDF structure
            # `or ''` guards against a page that yields no text object at all,
            # so one empty page does not discard the whole document.
            page_texts = [page.extract_text() or '' for page in reader.pages]
        # Separate pages with a newline so the last word of one page does not
        # fuse with the first word of the next.
        return '\n'.join(page_texts)
    except Exception as e:
        # Best-effort by design: report the problem and let the caller skip the file.
        print(f"Error reading {pdf_path}: {str(e)}")
        return None

In [7]:
# Function to generate a basic summary of the extracted text
def generate_summary(text, max_sentences=2):
    """
    Build a very basic summary from the first 'max_sentences' sentences of the text.

    Sentences are found by splitting on '.', so abbreviations and decimal
    numbers (e.g. "3.14") are split as well; this is a deliberately naive
    summarizer.

    Parameters:
        text: The text to summarize.
        max_sentences: Maximum number of leading sentences to keep.

    Returns:
        The selected sentences joined by '. ' and terminated with a single '.',
        or an empty string when the text contains no sentence.
    """
    if not text:
        return ''

    # Collapse line breaks and repeated spaces so the summary reads as one
    # clean line.
    normalized = ' '.join(text.split())

    # Strip each fragment and drop empty ones; otherwise the join below yields
    # double spaces and a trailing ". ." for text that already ends in a period.
    sentences = [part.strip() for part in normalized.split('.')]
    sentences = [part for part in sentences if part]

    selected = sentences[:max(max_sentences, 0)]
    if not selected:
        return ''
    return '. '.join(selected) + '.'


In [8]:

# Function to extract keywords using TF-IDF (Term Frequency - Inverse Document Frequency)
def extract_keywords(text, num_keywords=5):
    """
    Extract the highest-scoring keywords from the text using TF-IDF.

    The vectorizer is fitted on this single text only, so the inverse document
    frequency is the same for every term and the ranking is effectively by
    normalized term frequency after stop-word removal.

    Parameters:
        text: The text to analyse.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A plain list of at most 'num_keywords' strings, highest score first.
        An empty list is returned when the text contains no usable terms.
    """
    # scikit-learn validates `stop_words` and accepts 'english', a list or None;
    # a set is rejected by recent versions, so pass a (deduplicated) list.
    stop_words = sorted(set(stopwords.words('english')))
    vectorizer = TfidfVectorizer(stop_words=stop_words)

    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError as exc:
        # Raised when nothing is left after tokenizing and stop-word removal
        # (e.g. whitespace-only or punctuation-only text).
        if 'empty vocabulary' in str(exc):
            return []
        raise

    terms = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray().ravel()
    top_indices = scores.argsort()[::-1][:num_keywords]  # Highest score first

    # Return built-in str objects in a list: a NumPy array cannot be encoded
    # by BSON when the result is stored in MongoDB.
    return [str(terms[index]) for index in top_indices]

In [9]:
# Function to store the extracted summary and keywords in MongoDB
def store_in_mongo(file_name, summary, keywords):
    """
    Store the summary and keywords of one PDF as a document in MongoDB.

    Parameters:
        file_name: Name of the PDF file the results belong to.
        summary: The generated summary text.
        keywords: Iterable of keywords (list, tuple, NumPy array, ...) or None.

    The record is inserted into the module-level 'collection'; nothing is returned.
    """
    # BSON cannot encode NumPy arrays, so normalise the keywords to a plain
    # list of built-in strings before inserting.
    if keywords is not None:
        keywords = [str(keyword) for keyword in keywords]

    document = {
        'file_name': file_name,  # Store the name of the PDF file
        'summary': summary,  # Store the generated summary
        'keywords': keywords  # Store the extracted keywords
    }
    collection.insert_one(document)  # Insert the document into MongoDB
    print(f"Stored in MongoDB: {file_name}")  # Confirm that the data has been stored

In [10]:

# Function to process all PDFs in a folder
def process_pdfs_from_folder(folder_path):
    """
    Process every PDF file found directly inside the given folder.

    For each PDF the text is extracted, summarized, mined for keywords, and the
    results are stored in MongoDB. Files whose text cannot be extracted (or is
    empty) are skipped.

    Parameters:
        folder_path: Folder that contains the PDF files.

    Raises:
        FileNotFoundError: If 'folder_path' does not exist (from os.listdir).
    """
    # Match the extension case-insensitively (".PDF" is common on Windows),
    # ignore directories that merely end in ".pdf", and sort so the processing
    # order is the same on every run.
    pdf_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    for pdf_file in pdf_files:
        file_path = os.path.join(folder_path, pdf_file)  # Full path of the PDF file
        print(f"Processing: {file_path}")  # Print the file being processed

        # Step 1: Extract text from the PDF
        text = extract_text_from_pdf(file_path)
        if not text:
            continue  # Unreadable or empty document: nothing to summarize

        # Step 2: Generate a summary from the extracted text
        summary = generate_summary(text)

        # Step 3: Extract keywords from the extracted text
        keywords = extract_keywords(text)

        # Step 4: Store the summary and keywords in MongoDB
        store_in_mongo(pdf_file, summary, keywords)

# Define the folder where the PDFs are stored.
# A raw string keeps the Windows backslashes literal instead of treating them
# as escape sequences.
pdf_folder = r"C:\Users\Windows\Documents\html.i"

# Start processing the PDFs in the folder
process_pdfs_from_folder(pdf_folder)


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'path_to_your_pdf_folder'