In [7]:
import os
import re
from xml.sax.saxutils import escape

import requests
import torch
import transformers
from bs4 import BeautifulSoup
from langchain.document_loaders import PyPDFLoader
from llmsherpa.readers import LayoutPDFReader
from PIL import Image as PILImage
from reportlab.lib.enums import TA_CENTER
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Spacer, Image as ReportLabImage, Paragraph
from spire.pdf import PdfDocument
from spire.pdf.common import ImageFormat
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

##### Downloading PDF file and initialization of tokenizer and model

In [2]:
# Paper to summarise; the loader is handed the URL directly.
pdf_url = 'https://arxiv.org/pdf/1706.03762.pdf'
loader = PyPDFLoader(pdf_url)

# `text` is the string form of the whole list returned by the loader, not the
# concatenated page contents; later cells strip the literal "\\n" sequences
# that this representation contains.
pages = loader.load()
text = str(pages)

# Tokenizer and seq2seq model are loaded from the same checkpoint.
checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

If you don't need image extraction as well, just change this value to False.

In [3]:
# Switch for the image steps: the photo extraction and photo adding cells
# below only do their work when this flag is True.
photo_extr = True

#### Headers extraction and preparation
1) extract headers by using web scraping
2) format headers

In [4]:
# Render the PDF as HTML through the llmsherpa endpoint so that headings can be
# located by tag name (<h1>, <h2>, ...); <td> cells are collected as well.
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)
doc = pdf_reader.read_pdf(pdf_url).to_html()
soup = BeautifulSoup(doc, 'html.parser')
headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td'])

# Stripped text of every collected tag, de-duplicated while keeping the
# first-seen order (dict keys preserve insertion order).
headers_text = list(dict.fromkeys(tag.get_text(strip=True).strip() for tag in headers))

# Keep only the entries that start with a section number such as "3 " or "3.2 ".
pattern = re.compile(r'^\d+(\.\d+)*\s')
numbered_headers = list(filter(pattern.match, headers_text))

# The "1 Introduction" heading can be missing for arXiv documents. When it is
# absent, insert it in front of the first "2 ..." entry, or at position 0 when
# no such entry exists.
intro_pattern = re.compile(r'^1\s+Introduction', re.IGNORECASE)
intro_present = False
for candidate in numbered_headers:
    if intro_pattern.match(candidate):
        intro_present = True
        break
if not intro_present:
    index_for_intro = 0
    for position, candidate in enumerate(numbered_headers):
        if candidate.startswith('2 '):
            index_for_intro = position
            break
    numbered_headers.insert(index_for_intro, '1 Introduction')

# Final filter: numbered section titles, or rows made of space-separated
# decimal numbers (second alternative of the pattern).
updated_pattern = re.compile(r'''
    ^                         # Start of line
    (\d+(\.\d+)*)             # Section or subsection number (e.g., "3", "3.2", "3.2.1")
    (\s+[A-Za-z].*)           # Space and section title starting with a letter
    $                         # End of line
    |                         # OR
    ^\d+\s\d+\.\d+\s          # Starts with numbers separated by spaces, with a dot between numbers
    (\d+\.\d+\s+)*            # Followed by a series of numbers with dots and spaces
    \d+K?\.\d+                # Ends with a number with a decimal part, possibly with "K"
    (\s\d+)*                  # Followed by spaces and numbers
    $                         # End of line
''', re.VERBOSE)
headers = list(filter(updated_pattern.match, numbered_headers))

# Same problem as with the Introduction: the references heading is appended by
# hand, using whichever spelling occurs in the extracted text.
if "REFERENCES" in text:
    headers.append("REFERENCES")
else:
    headers.append("References")

#### Photo extraction

In [8]:
if photo_extr:
    # The PDF is saved to disk first because the document object below is
    # loaded from a local file path.
    response = requests.get(pdf_url, timeout=60)

    # Fail loudly on a bad download. Previously the cell carried on and used
    # whatever `doc` an earlier cell had left behind (an HTML string), which
    # produced an unrelated AttributeError.
    if response.status_code != 200:
        raise RuntimeError(f"Could not download {pdf_url}: HTTP {response.status_code}")

    with open('arXiv_doc.pdf', 'wb') as file:
        file.write(response.content)

    images_folder = 'images'
    os.makedirs(images_folder, exist_ok=True)

    images = []
    doc = PdfDocument()
    try:
        doc.LoadFromFile('arXiv_doc.pdf')

        # Collect every image of every page, in page order.
        for i in range(doc.Pages.Count):
            page = doc.Pages.get_Item(i)
            for image in page.ExtractImages():
                images.append(image)

        # Save as images/image_<n>.png. The path is built with os.path.join so
        # the files land inside the folder on every OS; the photo-adding cell
        # reads them back as 'images/image_<n>.png'.
        for index, image in enumerate(images):
            imageFileName = os.path.join(images_folder, 'image_{0}.png'.format(index))
            image.Save(imageFileName, ImageFormat.get_Png())
    finally:
        # Release the document even if extraction or saving failed.
        doc.Close()

#### Text summarization

In [10]:
def _split_token_ids(token_ids, max_len, space_token_id):
    """Split a flat list of token ids into consecutive chunks of at most ``max_len`` ids.

    When more tokens remain than fit into one chunk, the chunk is cut at the
    last occurrence of ``space_token_id`` inside the window (if there is one
    after the first position); that separator token is dropped. If no such
    token is found the chunk is cut at the window edge and no token is lost.
    Every returned chunk is non-empty.
    """
    chunks = []
    total = len(token_ids)
    start = 0
    while start < total:
        end = min(start + max_len, total)
        skip = 0
        if end < total:
            # Only look for a break point when the rest does not fit as a whole.
            window = token_ids[start:end]
            try:
                last_space = len(window) - 1 - window[::-1].index(space_token_id)
            except ValueError:
                last_space = 0
            if last_space > 0:
                end = start + last_space
                skip = 1  # step over the separator itself
        chunks.append(token_ids[start:end])
        start = end + skip
    return chunks


def summerize(text):
    """Summarise ``text`` of arbitrary length with the notebook's ``tokenizer`` and ``model``.

    The text is tokenized without truncation, split into chunks of at most
    ``tokenizer.model_max_length`` tokens (1024 for BART according to the
    original comment), each chunk is summarised independently with beam
    search, and the partial summaries are joined with newlines.

    Args:
        text: The text to summarise.

    Returns:
        The concatenated chunk summaries, one per line. An empty string is
        returned for empty or whitespace-only input so that the model is never
        asked to summarise nothing.
    """
    if not text or not text.strip():
        return ''

    encoded = tokenizer(text, max_length=None, return_tensors='pt', truncation=False)
    token_ids = encoded['input_ids'][0].tolist()

    # Id used as the preferred break point between chunks; None (never found)
    # if the tokenizer yields no token for a single space.
    space_ids = tokenizer.encode(' ', add_special_tokens=False)
    space_token_id = space_ids[0] if space_ids else None

    chunks = _split_token_ids(token_ids, tokenizer.model_max_length, space_token_id)

    summaries = []
    for chunk in chunks:
        inputs = torch.tensor([chunk])  # batch of one sequence
        summary_ids = model.generate(inputs, num_beams=4, max_length=1024, early_stopping=True)
        summaries.append(
            tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        )
    return '\n'.join(summaries)

#### Preparation for adding text into a PDF

In [11]:
# arXiv papers contain citation markers such as [2], [3, 1] or [12, 45];
# this pattern matches them so they can be deleted from the text.
patternForBrackets = re.compile(r'\[\s*\d+(?:,\s*\d+)*\s*\]')

# Flowables collected by the following cells and rendered by the final build call.
elements = []

# Stock style sheet; body text uses its 'Normal' style.
styles = getSampleStyleSheet()
style = styles['Normal']

# A4 output document written to pdf_path.
pdf_path = "summary.pdf"
doc = SimpleDocTemplate(pdf_path, pagesize=A4)

#### Text adding

In [12]:
# Walk over adjacent heading pairs: the text of a section is whatever lies
# between its own heading and the next one in `headers`.
for startHeader, endHeader in zip(headers, headers[1:]):
    # Headings without a dot ("3 Title") are top-level sections; anything with
    # a dot ("3.2 Title") is rendered as a smaller sub-heading.
    header_style = styles['Heading2'] if startHeader.count('.') == 0 else styles['Heading5']

    # Non-greedy capture of everything between the two headings.
    pattern = re.compile(re.escape(startHeader) + "(.*?)" + re.escape(endHeader), re.DOTALL)
    match = pattern.search(text)
    if not match:
        # Heading pair not found in the extracted text; nothing to summarise.
        continue

    text_between = match.group(1)
    text_without_brackets = patternForBrackets.sub('', text_between)  # drop citation markers
    # `text` holds both literal backslash-n sequences and real newlines; remove both.
    text_final = text_without_brackets.replace("\\n", "").replace("\n", "")

    # Paragraph parses its text as XML-like markup, so '<', '>' and '&' coming
    # from the paper or the model output must be escaped.
    elements.append(Paragraph(escape(startHeader), header_style))
    if text_final.strip():
        # Sections with no text of their own are not sent to the model.
        summary_text = summerize(text_final)
        elements.append(Paragraph(escape(summary_text), style))
    elements.append(Spacer(1, 12))

Token indices sequence length is longer than the specified maximum sequence length for this model (1656 > 1024). Running this sequence through the model will result in indexing errors


#### Photo adding

In [13]:
if photo_extr:
    styles = getSampleStyleSheet()
    # Caption style is the same for every figure, so build it once.
    centered_style = ParagraphStyle(name='CenteredStyle', parent=styles['Normal'], alignment=TA_CENTER)

    # Largest image that still fits on a page: the template's usable area
    # minus an allowance for frame padding.
    # NOTE(review): the 12 pt allowance assumes ReportLab's default 6 pt frame
    # padding on each side - confirm if the template is customised.
    max_width = doc.width - 12
    max_height = doc.height - 12

    for img_num in range(len(images)):
        image_path = f'images/image_{img_num}.png'
        # Only the pixel size is needed; close the file handle right away.
        with PILImage.open(image_path) as pil_image:
            real_width, real_height = pil_image.size

        # Convert the pixel size to points with a fixed enlargement factor.
        dpi = 72
        width_in_points = float(real_width / dpi * 7)  # in case your image is too large just change 7 to 5 or 3
        height_in_points = float(real_height / dpi * 7)

        # Shrink proportionally when the image would not fit into the page
        # frame; otherwise building the PDF fails for that flowable.
        if width_in_points > max_width or height_in_points > max_height:
            fit = min(max_width / width_in_points, max_height / height_in_points)
            width_in_points *= fit
            height_in_points *= fit

        img = ReportLabImage(image_path, width_in_points, height_in_points)
        elements.append(img)

        # Centered caption under the photo
        elements.append(Paragraph(f"Figure {img_num}", centered_style))
        elements.append(Spacer(1, 12))

#### Building PDF summary

In [14]:
# Render everything collected in `elements` (section headings and summaries,
# plus figures when photo_extr is enabled) into the PDF created for pdf_path.
doc.build(elements)