<a href="https://colab.research.google.com/github/subhashjprasad/pdf-summarizer/blob/main/PDFSummarizerEasyUse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run the following cell once per session to install the dependencies and load the models.

In [None]:
%%capture
!pip install levenshtein

!pip install datasets
!pip install transformers

from huggingface_hub import hf_hub_download
import re
from PIL import Image

from transformers import NougatProcessor, VisionEncoderDecoderModel
from datasets import load_dataset
import torch

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The processor and the model are loaded from the same pretrained checkpoint.
nougat_checkpoint = "facebook/nougat-base"
processor = NougatProcessor.from_pretrained(nougat_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(nougat_checkpoint).to(device)

!apt-get install poppler-utils
!pip install pdf2image

from pdf2image import convert_from_path, convert_from_bytes
from IPython.display import display, Image

import textwrap

# Shared text wrapper used when printing page text and summaries:
# breaks long strings into lines of at most 100 characters.
wrapper = textwrap.TextWrapper(
    width=100,
)

import torch
from transformers import pipeline

# Checkpoint used by the summarization pipeline.
hf_name = 'pszemraj/led-large-book-summary'

# The pipeline takes a device index: 0 when CUDA is available, otherwise -1.
summarizer_device = -1
if torch.cuda.is_available():
    summarizer_device = 0

summarizer = pipeline(
    task="summarization",
    model=hf_name,
    device=summarizer_device,
)

Rerun the following cells for each new PDF.

In [None]:
%%capture
pdf_path = '_____.pdf' # replace with pdf path
images = convert_from_bytes(open(pdf_path, 'rb').read(), size=800)

# Preprocess every page image into the pixel tensor the model consumes.
pixel_values = [
    processor(page_image, return_tensors="pt").pixel_values
    for page_image in images
]

# Run generation one page at a time on the selected device. The tokenizer's
# unknown token is listed in bad_words_ids so it is never generated.
outputs = [
    model.generate(
        page_pixels.to(device),
        min_length=1,
        max_new_tokens=5000,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )
    for page_pixels in pixel_values
]

# Decoded, post-processed text of each page, in page order.
full_text = []

for page_index, generated_ids in enumerate(outputs, start=1):
    # Each entry of `outputs` is one page's generation result; decode it and
    # take the first decoded string.
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    page_text = processor.post_process_generation(decoded, fix_markdown=False)
    full_text.append(page_text)

    # Print the page text wrapped to the shared line width.
    print(f"Page {page_index}:", '\n')
    for wrapped_line in wrapper.wrap(text=page_text):
        print(wrapped_line)
    print('\n')

# Lines of the final report: a header, the wrapped summary and a divider for
# each page. The next cell prints this list.
summary_text = []

# Generation settings applied to every page's summary.
summary_options = dict(
    min_length=16,
    max_length=512,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=3,
    repetition_penalty=3.5,
    num_beams=4,
    early_stopping=True,
)
divider = "\n------------------------\n"

page_number = 1
for page in full_text:
    header = f"Page {page_number} Summary: \n------------------------\n"
    print(header)
    summary_text.append(header)

    result = summarizer(page, **summary_options)

    # Wrap the summary to the shared line width; print it and record it.
    wrapped_summary = wrapper.wrap(text=result[0]['summary_text'])
    for summary_line in wrapped_summary:
        print(summary_line)
    summary_text.extend(wrapped_summary)

    print(divider)
    summary_text.append(divider)
    page_number += 1

In [None]:
# Show the collected per-page summaries, one stored entry per printed line.
# Nothing is printed when there are no entries.
if summary_text:
    print("\n".join(summary_text))