# REQUIRED PACKAGE INSTALL

In [None]:
# pip install function
import subprocess
import sys

def install(package):
    """Install *package* with pip into the environment of the running interpreter.

    pip is invoked as ``<sys.executable> -m pip install <package>`` so the
    package lands in the same environment the notebook kernel runs in.
    This is a best-effort helper: a failed install is reported on stdout
    instead of being raised.

    Args:
        package: Requirement specifier handed to ``pip install``.
    """
    command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError as err:
        # pip exits non-zero for many reasons (unknown package, no network,
        # failed build), so report the exit code rather than guessing a cause.
        print(f"Failed to install {package!r}: pip exited with code {err.returncode}")
    except OSError as err:
        # The interpreter itself could not be launched.
        print(f"Failed to install {package!r}: could not run pip ({err})")

In [None]:
# pip install packages
# Install every third-party dependency of the notebook, in order
for _pkg in ('pytesseract', 'pdf2image', 'PyPDF2', 'PyMuPDF'):
    install(_pkg)

In [7]:
import pytesseract
from pdf2image import convert_from_path
import PyPDF2
import io
import fitz # imports the pymupdf library

In [None]:
# Uncomment the line below if the Tesseract executable is not on your PATH
# pytesseract.pytesseract.tesseract_cmd = r"Tesseract-OCR/tesseract.exe"

# INPUT LAYER

In [8]:
# Path of the PDF that the parsing layer processes.
# Exactly one assignment should be active; the commented lines are alternative sample files.
# pdf_doc = r'files/woodBoy.pdf'
pdf_doc = r'files/Brabender_P1.pdf'
# pdf_doc = r'files/Once upon a time.pdf'

# PARSING LAYER

In [9]:
# imagePDF_to_text function
def imagePDF_to_text(file, poppler_path=r"Release-24.02.0-0/poppler-24.02.0/Library/bin"):
    """OCR an image-based (scanned) PDF and return its text.

    Every page is rasterised with pdf2image, pytesseract renders the image
    as a one-page searchable PDF, and the text layer of that page is read
    back with PyPDF2.

    Args:
        file: Path of the PDF to OCR.
        poppler_path: Directory containing the Poppler binaries used by
            pdf2image. Defaults to the relative path this function
            previously hard-coded, so existing calls behave the same.

    Returns:
        The text of all pages concatenated in page order (an empty string
        when the conversion yields no pages).
    """
    # convert every page to an image
    images = convert_from_path(file, poppler_path=poppler_path)

    page_texts = []
    for image in images:
        # pytesseract returns the bytes of a single-page searchable PDF
        ocr_pdf = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
        ocr_page = PyPDF2.PdfReader(io.BytesIO(ocr_pdf)).pages[0]
        page_texts.append(ocr_page.extract_text())

    # join once instead of growing a string inside the loop
    return ''.join(page_texts)

# PDF_to_text function
def PDF_to_text(file):
    """Return the text of a PDF, falling back to OCR when direct extraction fails.

    The text layer of every page is read with PyMuPDF. If any page has no
    text (the document is treated as scanned), or if anything else goes
    wrong while reading it, the whole file is passed to
    ``imagePDF_to_text`` instead.

    Args:
        file: Path of the PDF to read.

    Returns:
        The text of all pages concatenated in page order.
    """
    try:
        # works only for text-based pdf
        page_texts = []
        # the context manager closes the document even when a page raises
        with fitz.open(file) as doc:
            for page in doc:
                page_text = page.get_text()  # plain text of this page
                if not page_text.strip():
                    raise Exception('PDF is a scanned document')
                # keep every page; previously each page overwrote the last,
                # so only the final page's text was returned
                page_texts.append(page_text)
        text = ''.join(page_texts)
        print('Done!')
        return text
    except Exception as e:
        # Deliberately broad: any failure of direct extraction triggers the OCR path.
        print(f"Error while processing PDF: {e}")
        print('Running OCR engine...')
        text = imagePDF_to_text(file)
        print('Done!\n')
        return text

In [10]:
# Extract the text of the selected PDF (PDF_to_text falls back to OCR when direct extraction fails)
text = PDF_to_text(pdf_doc)

Error while processing PDF: PDF is a scanned document
Running OCR engine...
Done!



In [11]:
# Show the extracted text for a visual check
print(text)

Brabender'’ 
... where quality is measured. 
UY scl £ 
4 : 
. . iq 4 - £ 2 . Ps , 5} pt 3s . . ie cs 
, Instruction Manuat —~ . * : ~ ~>*s ; 
: ee "Pe . A. s < . 
rr Be ‘> “" ie ot Ss 7 
eet tae 7 ‘ > = fy P ; 7 
45 4 > Pees ‘ 
MetaStation 4E 
ID no. 8 156 70.xxx 
3 x 400 V, 50/60 Hz Copyright 
All content, pictures, texts and graphics are protected by copyright. All kind of 
translation, forwarding to third persons, reproduction and distribution - even of 
extracts - is prohibited without our prior express written consent. 
Brabender® and other brands not specially marked are registered trademarks of 
Brabender GmbH & Co. KG. 
© Copyright by 
Brabender GmbH & Co. KG 
KulturstraRe 49-51 
47055 Duisburg 
Germany 
Proprietary rights, brands and trademarks of third parties 
Any products registered as trademarks are not particularly marked in the present 
documentation. Existing property rights (patents, trademarks, registered or industrial 
designs) must be observed by all means. 
Origina

## Transformers

In [None]:
# export the searchable PDF to "<input name>_searchable.pdf" (needs a module-level `pdf_writer`, which this notebook never defines)
# out_file = f'{pdf_doc[:-4]}_searchable.pdf'
# with open(out_file, "wb") as f:
#     pdf_writer.write(f)

In [None]:
# # pip install 'git+https://github.com/huggingface/transformers.git'
# from transformers import AutoTokenizer, AutoModelForCausalLM

# model_id = "CohereForAI/c4ai-command-r-plus"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id)

# # Format message with the command-r-plus chat template
# messages = [{"role": "user", "content": "Hello, how are you?"}]
# input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
# ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>

# gen_tokens = model.generate(
#     input_ids, 
#     max_new_tokens=100, 
#     do_sample=True, 
#     temperature=0.3,
#     )

# gen_text = tokenizer.decode(gen_tokens[0])
# print(gen_text)


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pretrained T5 model and its tokenizer from the same checkpoint
_T5_CHECKPOINT = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(_T5_CHECKPOINT)
tokenizer = T5Tokenizer.from_pretrained(_T5_CHECKPOINT)


In [None]:
# Use the Hugging Face `pipeline` helper with Facebook's BART-large-CNN summarization model
from transformers import pipeline

# Build a summarization pipeline backed by the facebook/bart-large-cnn checkpoint
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

In [None]:
# Number of characters in the extracted text (displayed as the cell result)
len(text)

In [None]:
# Report whether the extracted text contains two consecutive newlines
print('yes' if '\n\n' in text else 'no')

In [None]:
# # Initialize variables
# max_length = 512  # Maximum sequence length supported by the model
# segments = []
# current_segment = []

# # Split the input into segments
# for word in words:
#     current_segment.append(word)
#     if len(current_segment) == max_length:
#         segments.append(current_segment)
#         current_segment = []
# # Add the last segment if it's not empty
# if current_segment:
#     segments.append(current_segment)


# # Process each segment separately
# for segment in segments:
#     # Convert segment tokens back to text
#     # segment_text = tokenizer.convert_tokens_to_string(segment)
#     # Encode segment
#     # inputs = tokenizer(segment_text, return_tensors='pt')
#     # Forward pass through the model
#     print(text+'\n')
#     outputs = summarizer(text, max_length=100, min_length=50, do_sample=False)
#     # outputs = model(**inputs)
#     print(outputs)
#     # Process model outputs as needed


In [None]:
# Summarise `text` with T5: split it into chunks that fit the model's input
# window, summarise each chunk, then join the partial summaries.

max_segment_length = 512  # t5-base is trained on inputs of at most 512 tokens
task_prefix = "summarize: "  # T5 selects its task from a text prefix on the input

# Every encoded chunk also carries the prefix tokens and the EOS token that
# encode() appends, so reserve room for them; otherwise the tail of each full
# chunk is silently truncated away.
prefix_length = len(tokenizer.tokenize(task_prefix))
chunk_size = max_segment_length - prefix_length - 1

# Split the input document into token chunks of at most chunk_size tokens
tokens = tokenizer.tokenize(text)
segments = [tokens[start:start + chunk_size] for start in range(0, len(tokens), chunk_size)]

# Generate a summary for each segment
summaries = []
for segment in segments:
    # Convert segment tokens back to text and mark the task for T5
    segment_text = task_prefix + tokenizer.convert_tokens_to_string(segment)
    # Truncation stays on as a safety net: re-tokenising the rebuilt text is
    # not guaranteed to reproduce exactly the same number of tokens.
    segment_tokens = tokenizer.encode(segment_text, return_tensors="pt", max_length=max_segment_length, truncation=True)
    # Beam search with a length penalty, bounded to 40-150 generated tokens
    summary_ids = model.generate(segment_tokens, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summaries.append(summary)

# Combine summaries to form the final summary
final_summary = "\n".join(summaries)
print(final_summary)