In [5]:
!pip install PyPDF2 python-docx spacy





In [6]:
import os
import sys
import PyPDF2
import docx
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from collections import defaultdict

In [7]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF and join the pages with spaces.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A single string holding each page's extracted text, in page order,
        separated by one space.
    """
    page_texts = []
    with open(file_path, "rb") as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        for page in reader.pages:
            page_texts.append(page.extract_text())
    return " ".join(page_texts)

In [8]:
def read_docx(file_path):
    """Return the text of a .docx file's paragraphs joined by single spaces.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, in
        order, separated by one space.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)

In [9]:
def read_txt(file_path):
    """Return the full contents of a UTF-8 encoded plain-text file.

    Defined here because read_document dispatches ``.txt`` files to this
    name, which was otherwise missing and raised NameError.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def read_document(file_path):
    """Read a document's text, choosing the reader from the file extension.

    Args:
        file_path: Path of a ``.pdf``, ``.docx`` or ``.txt`` file. The
            extension is matched case-insensitively, so ``REPORT.PDF`` works.

    Returns:
        The document text as returned by read_pdf, read_docx or read_txt.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    _, ext = os.path.splitext(file_path)
    # Normalize so that ".PDF" / ".Docx" are treated like their lowercase forms.
    ext = ext.lower()
    if ext == ".pdf":
        return read_pdf(file_path)
    elif ext == ".docx":
        return read_docx(file_path)
    elif ext == ".txt":
        return read_txt(file_path)
    else:
        raise ValueError("Unsupported file format.")

In [10]:
def summarize(text, compression_ratio=0.2):
    """Build an extractive summary from the highest-scoring sentences.

    Each token that is neither a stop word nor found in the ASCII punctuation
    string is counted (lowercased), and counts are divided by the largest
    count. A sentence's score is the sum of the normalized frequencies of its
    tokens, and the top-scoring sentences are joined with spaces.

    Args:
        text: The document text to summarize.
        compression_ratio: Fraction of the sentences to keep. At least one
            sentence is kept whenever the text has any countable word.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score (not document order). Returns an empty string when
        the text has no countable words, e.g. it is empty or consists only
        of stop words and punctuation.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    word_frequencies = defaultdict(int)
    for token in doc:
        key = token.text.lower()
        if key not in STOP_WORDS and key not in punctuation:
            word_frequencies[key] += 1

    # Nothing to score: avoid max() on an empty sequence below.
    if not word_frequencies:
        return ""

    max_frequency = max(word_frequencies.values())
    normalized = {word: count / max_frequency for word, count in word_frequencies.items()}

    # Materialize once; the sentences are needed for both scoring and counting.
    sentences = list(doc.sents)
    sentence_scores = defaultdict(int)
    for sentence in sentences:
        for token in sentence:
            # .get() keeps uncounted tokens at 0 without inserting new keys.
            sentence_scores[sentence] += normalized.get(token.text.lower(), 0)

    n_sentences = max(1, int(len(sentences) * compression_ratio))
    top_sentences = nlargest(n_sentences, sentence_scores, key=sentence_scores.get)
    return " ".join(str(sentence) for sentence in top_sentences)

In [16]:
def save_summary_to_txt(input_file, output_file="SummaryOfDocument.txt", compression_ratio=0.2):
    """Summarize a document and write the summary to a UTF-8 text file.

    Args:
        input_file: Path of the document, passed to read_document.
        output_file: Path of the text file to write; overwritten if present.
        compression_ratio: Fraction of sentences to keep, forwarded to
            summarize.

    Returns:
        None. The summary is written to ``output_file`` and the absolute path
        of that file is printed.
    """
    text = read_document(input_file)
    summary = summarize(text, compression_ratio=compression_ratio)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(summary)
    # Report the file actually written, not the default file name.
    print(f"Summary saved to {os.path.abspath(output_file)}")

In [18]:
# Document to summarize; its extension determines which reader read_document uses.
input_file = "Impromptu.pdf"

In [19]:
# Summarize input_file and write the result to the default output file, SummaryOfDocument.txt.
save_summary_to_txt(input_file)

Summary saved to C:\Users\somu.teja\Downloads\Experiment\SummarizingADocument\SummaryOfDocument.txt
