In [1]:
from pymongo import MongoClient

# Connect to the MongoDB server at localhost:27017 and select the 'documents'
# collection of the 'pdf_pipeline' database. `collection` is the module-level
# handle that process_pdf() inserts processed documents into.
client = MongoClient('mongodb://localhost:27017/')
db = client['pdf_pipeline']
collection = db['documents']

In [2]:
import nltk
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (for sent_tokenize / word_tokenize) and the 'stopwords' corpus.
# NOTE(review): newer NLTK releases may look for a 'punkt_tab' resource
# instead of 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to C:\Users\Rajkumar
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Rajkumar
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def compute_tf(sentence,word):
    """Return the term frequency of ``word`` within ``sentence``.

    The sentence is lower-cased and split with ``word_tokenize``; the result
    is the number of tokens equal to ``word.lower()`` divided by the total
    number of tokens.

    Args:
        sentence: Text of a single sentence.
        word: Term to count (compared case-insensitively).

    Returns:
        The relative frequency as a float, or ``0.0`` when the sentence
        produces no tokens.
    """
    tokens = word_tokenize(sentence.lower())
    # An empty or whitespace-only sentence has no tokens; avoid dividing by zero.
    if not tokens:
        return 0.0
    return tokens.count(word.lower()) / len(tokens)

def compute_idf(word,corpus):
    """Return the smoothed inverse document frequency of ``word``.

    Each sentence of ``corpus`` is treated as a document. The value is
    ``log((N + 1) / (df + 1)) + 1`` where ``N`` is the number of sentences and
    ``df`` is the number of sentences whose lower-cased tokens contain the
    lower-cased word. The ``+ 1`` terms keep the result finite when the word
    is absent or the corpus is empty.

    Args:
        word: Term to look up (compared case-insensitively).
        corpus: Sequence of sentence strings.

    Returns:
        The IDF weight as a float.
    """
    # Sentences are lower-cased before tokenizing, so the term must be too.
    target = word.lower()
    num_sentences = len(corpus)
    num_sentences_with_word = sum(
        1 for sentence in corpus if target in word_tokenize(sentence.lower())
    )
    return math.log((num_sentences + 1) / (num_sentences_with_word + 1)) + 1

def summarize_text(text,summary_length=3):
    """Build an extractive summary from the highest-scoring sentences.

    The text is split into sentences with ``sent_tokenize``. Every sentence is
    scored as the sum, over its non-stop-word tokens, of ``tf * idf`` where
    ``tf`` is the token's count divided by the sentence's token count and
    ``idf`` is ``log((N + 1) / (df + 1)) + 1`` (``N`` = number of sentences,
    ``df`` = number of sentences containing the token).

    Args:
        text: Full document text.
        summary_length: Maximum number of sentences to keep.

    Returns:
        The top ``summary_length`` sentences, ordered by descending score and
        joined with single spaces. An empty string when the text contains no
        sentences.
    """
    corpus = sent_tokenize(text)
    # Build the stop-word set once; a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    # Tokenize every sentence exactly once and reuse the tokens below.
    tokenized = [word_tokenize(sentence.lower()) for sentence in corpus]

    # Document frequency: in how many sentences does each token occur?
    doc_freq = {}
    for tokens in tokenized:
        for token in set(tokens):
            doc_freq[token] = doc_freq.get(token, 0) + 1

    num_sentences = len(corpus)
    tf_idf_score = {}
    for sentence, tokens in zip(corpus, tokenized):
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

        sentence_score = 0
        # Iterate over token occurrences (not unique tokens) so repeated words
        # contribute once per occurrence.
        for token in tokens:
            if token in stop_words:
                continue
            tf = counts[token] / len(tokens)
            idf = math.log((num_sentences + 1) / (doc_freq[token] + 1)) + 1
            sentence_score += tf * idf
        # Keyed by sentence text, so identical sentences share one entry.
        tf_idf_score[sentence] = sentence_score

    # Rank only after every sentence has been scored.
    top_sentences = sorted(tf_idf_score, key=tf_idf_score.get, reverse=True)[:summary_length]
    return ' '.join(top_sentences)
            

In [4]:
from collections import Counter
from nltk.util import ngrams


In [5]:
def extract_keywords(text,num_keywords=5):
    """Return the most frequent unigrams and bigrams in ``text``.

    The text is lower-cased and tokenized; tokens that are not alphanumeric or
    that are English stop words are dropped. Unigram and bigram counts (bigrams
    are formed from adjacent remaining tokens, joined by a space) are merged
    and the ``num_keywords`` most common entries are returned.

    Args:
        text: Document text to analyse.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A list of keyword strings, most frequent first.
    """
    # Build the stop-word set once instead of reloading the list per token.
    stop_words = set(stopwords.words('english'))
    words = [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    unigrams = Counter(words)
    bigrams = Counter(' '.join(gram) for gram in ngrams(words, 2))
    combined = unigrams + bigrams
    return [keyword for keyword, _ in combined.most_common(num_keywords)]

In [6]:
from pdfminer.high_level import extract_text
def extract_text_from_pdf(filepath):
    """Extract the text content of a PDF file.

    Args:
        filepath: Path of the PDF to read.

    Returns:
        The text returned by ``extract_text``. If extraction raises, the error
        is printed and an empty string is returned so that callers can detect
        the failure with a simple truthiness check.
    """
    try:
        return extract_text(filepath)
    except Exception as e:
        print(f'error extracting text from {filepath}:{e}')
        # Empty (falsy) sentinel: a " " placeholder would pass `if not text`.
        return ""

In [7]:
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
class pdfhandler(FileSystemEventHandler):
    """Watchdog event handler that runs the pipeline on newly created PDFs."""

    def on_created(self,event):
        """Process the created path when it is a file ending in ``.pdf``.

        watchdog dispatches creation events to ``on_created``; a method with
        any other name is never invoked by the observer.
        """
        # Creation events are also emitted for directories; ignore those.
        if event.is_directory:
            return
        if event.src_path.endswith('.pdf'):
            print(f'pdf detected:{event.src_path}')
            process_pdf(event.src_path)

    # Backward-compatible alias for the original (misnamed) method.
    on_create = on_created
def monitor_folder(folder_path):
    """Watch ``folder_path`` (non-recursively) and block until interrupted.

    Schedules a ``pdfhandler`` on a new ``Observer``, starts it, and then
    sleeps in one-second steps. A ``KeyboardInterrupt`` ends the wait
    normally; any other exception propagates to the caller. In both cases the
    observer is stopped and joined first so its thread is not left running.

    Args:
        folder_path: Directory to monitor.
    """
    observer = Observer()
    observer.schedule(pdfhandler(), folder_path, recursive=False)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Normal way to end monitoring; cleanup happens in `finally`.
        pass
    finally:
        observer.stop()
        observer.join()
            

In [None]:
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import time

class TestHandler(FileSystemEventHandler):
    """Diagnostic handler that only reports creation events."""

    def on_created(self, event):
        """Print the source path carried by the creation event."""
        message = f"File detected: {event.src_path}"
        print(message)

# Scratch check that watchdog delivers events for the watched folder.
# NOTE: this cell blocks in the loop below until the kernel is interrupted.
folder_to_watch = "D:\\pdf_folder"

# Watch only the top level of the folder (recursive=False) with TestHandler.
observer = Observer()
observer.schedule(TestHandler(), folder_to_watch, recursive=False)
observer.start()

print("Observer started. Waiting for file events...")
try:
    # Sleep in one-second steps until a KeyboardInterrupt arrives.
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    observer.stop()

# Wait for the observer thread to finish after stop() has been requested.
observer.join()

Observer started. Waiting for file events...


In [None]:
import os
def process_pdf(filepath):
    """Extract, summarize and store one PDF in MongoDB.

    Runs ``extract_text_from_pdf``, ``summarize_text`` and
    ``extract_keywords`` on the file and inserts a document holding the file
    name, file path, keywords, summary and processing timestamp into the
    module-level ``collection``.

    Args:
        filepath: Path of the PDF to process.

    Returns:
        None. Nothing is stored when no usable text could be extracted.
    """
    text = extract_text_from_pdf(filepath)
    # Treat None, empty and whitespace-only results alike: there is nothing to
    # summarize, so do not store a junk document.
    if not text or not text.strip():
        print(f'no text extracted from {filepath}; skipping')
        return
    summary = summarize_text(text)
    keywords = extract_keywords(text)
    document = {
        'file_name': os.path.basename(filepath),
        'file_path': filepath,
        'keywords': keywords,
        'summary': summary,
        'processed_at': time.strftime('%Y-%m-%d %H:%M:%S'),
    }
    collection.insert_one(document)
    print(f'stored document:{os.path.basename(filepath)} in MongoDB')

In [None]:
# Entry point: start monitoring the PDF drop folder. monitor_folder() blocks
# in its wait loop until it is interrupted.
if __name__ == "__main__":
    folder_to_watch="D:\\pdf_folder"
    print(f"Monitoring folder: {folder_to_watch}")
    monitor_folder(folder_to_watch)