In [18]:
import PyPDF2
import pdfminer
import spacy
import sys
import io
from pdfminer.high_level import extract_text
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import fitz

In [8]:
def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

In [16]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def highlight_key_sentences(text, nlp, threshold=None):
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_vectors = [sent.vector for sent in sentences if sent.vector_norm > 0]

    if len(sentence_vectors) == 0:
        return []

    if threshold is None:
        avg_cosine_similarity = np.mean(cosine_similarity(sentence_vectors))
        threshold = avg_cosine_similarity * 1.1

    key_sentences = []
    for sent in sentences:
        if sent.vector_norm > 0:
            similarity = cosine_similarity([sent.vector], sentence_vectors).mean()
            if similarity > threshold:
                key_sentences.append(sent)

    return key_sentences

In [10]:
def add_highlights_to_pdf(input_pdf, output_pdf, key_sentences):
    doc = fitz.open(input_pdf)
    for sentence in key_sentences:
        for page in doc:
            areas = page.search_for(str(sentence))
            for area in areas:
                highlight = page.add_highlight_annot(area)
                highlight.update()

    doc.save(output_pdf)
    doc.close()

In [11]:
def key_sentence_highlighter(input_pdf, output_pdf):
    nlp = spacy.load("en_core_web_sm")
    text = extract_text_from_pdf(input_pdf)
    key_sentences = highlight_key_sentences(text, nlp)
    add_highlights_to_pdf(input_pdf, output_pdf, key_sentences)

In [22]:
input_pdf = "Impromptu.pdf"
output_pdf = "Impromptu-highlighter.pdf"

In [23]:
import time
time_begin = time.time()
key_sentence_highlighter(input_pdf, output_pdf)
print("Number of seconds taken = ",time.time()-time_begin)

Number of seconds taken =  375.79170417785645
