In [None]:
!pip install pymupdf transformers torch torchvision torchaudio
!pip install PyMuPDF


Collecting pymupdf
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.24.14


In [5]:
import fitz
from transformers import BartForConditionalGeneration, BartTokenizer
import textwrap


def extract_text_from_pdf(pdf_path):
    #Extract text from a PDF file.
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(doc.page_count):
            page = doc[page_num]
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

def text_summarizer_from_pdf(pdf_path):
    #Summarize the text extracted from a PDF file.
    pdf_text = extract_text_from_pdf(pdf_path)
    if not pdf_text:
        return "No text extracted from PDF."

    model_name = "facebook/bart-large-cnn"
    try:
        model = BartForConditionalGeneration.from_pretrained(model_name)
        tokenizer = BartTokenizer.from_pretrained(model_name)
    except Exception as e:
        print(f"Error loading model: {e}")
        return "Model loading failed."

    inputs = tokenizer.encode("summarize: " + pdf_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    formatted_summary = "\n".join(textwrap.wrap(summary, width=80))
    return formatted_summary

def save_summary_as_pdf(pdf_path, summary):
    #Save the summary as a new PDF file.
    try:
        doc = fitz.open()
        page = doc.new_page()
        page.insert_text((10, 100), summary, fontname="helv", fontsize=12)  # Adjust the vertical position as needed

        output_pdf_path = pdf_path.replace(".pdf", "_summary.pdf")
        doc.save(output_pdf_path)
        doc.close()
        return output_pdf_path
    except Exception as e:
        print(f"Error saving summary as PDF: {e}")
        return ""

# Main execution
pdf_file_path = r"/content/Paper1131_NikitaAhire_summary.pdf"
summary = text_summarizer_from_pdf(pdf_file_path)
print("\n--- Generated Summary ---\n")
print(summary)
output_pdf_path = save_summary_as_pdf(pdf_file_path, summary)
print()
print("Summary saved as PDF:", output_pdf_path)


--- Generated Summary ---

summarize: No text extracted from PDF. Use the weekly Newsquiz to test your
knowledge of stories you saw on CNN.com and CNN iReport.com. Today's Daily
Discussion includes the latest news stories from CNN and CNN Tech.

Summary saved as PDF: /content/Paper1131_NikitaAhire_summary_summary.pdf
