# Test Processor

In [None]:
from process_pdf import PDFProcessor

In [None]:
chunker = PDFProcessor()

In [None]:
# Inputs for a purely local run: a PDF path on disk, with the S3 client and
# bucket arguments both left as None.
s3 = input_bucket = None
file_content = r"D:\exxonpocaws\datafiles\a500.pdf"

In [None]:
# Run the processor on the local PDF path; s3 and input_bucket are None for this run.
chunked_content = chunker.process_pdf(file_content, s3, input_bucket)

In [None]:
chunked_content

# Test Lambda Function

In [None]:
from lambda_function import lambda_handler
import json
from dotenv import load_dotenv


In [None]:

# Sample context (mock it if needed)
class Context:
    """Minimal stand-in for the context object handed to ``lambda_handler``.

    Only ``aws_request_id`` is provided. Add further attributes here if the
    handler under test reads them.

    Args:
        aws_request_id: Request identifier exposed on the instance.
            Defaults to ``"test"``.
    """

    def __init__(self, aws_request_id: str = "test") -> None:
        self.aws_request_id = aws_request_id


In [None]:
context = Context()

In [None]:
# Load the sample event from test_event.json.
# The file is decoded as UTF-8 explicitly so the parsed event does not depend
# on the platform's default text encoding.
with open('test_event.json', 'r', encoding='utf-8') as file:
    event = json.load(file)

In [None]:
# Load environment variables via python-dotenv before invoking the handler.
# NOTE(review): this runs after `lambda_function` has already been imported;
# if that module reads environment variables at import time they would not
# see these values — confirm.
load_dotenv()

In [None]:
# Call the handler directly with the loaded event and the mock context,
# then print whatever it returns.
response = lambda_handler(event, context)
print(response)

# Download Models

In [None]:
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Directory handed to from_pretrained as cache_dir, and the checkpoint id to fetch
local_folder = "./local_models/clip-vit-base-patch32"
clip_repo = "openai/clip-vit-base-patch32"

# Load the model and its processor, using local_folder as the cache directory
model = CLIPModel.from_pretrained(clip_repo, cache_dir=local_folder)
processor = CLIPProcessor.from_pretrained(clip_repo, cache_dir=local_folder)

# Write each loaded object to its own directory (model first, then processor)
for artifact, target_dir in (
    (model, "./local_models/model/clip-vit-base-patch32"),
    (processor, "./local_models/processor/clip-vit-base-patch32"),
):
    artifact.save_pretrained(target_dir)

In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

In [9]:
local_folder = "./local_models/blip-image-captioning-base/"

In [7]:
# Build an image-to-text pipeline from the Salesforce/blip-image-captioning-base checkpoint
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

In [10]:
# Save the pipeline to the directory named by local_folder
pipe.save_pretrained(local_folder)

In [1]:
# Document length and number of chunks to split it into
num_pages, num_threads = 133, 4

In [None]:
# Split page indices 0..num_pages-1 into at most num_threads consecutive chunks.
if num_threads < 1:
    raise ValueError(f"num_threads must be at least 1, got {num_threads}")

# Ceiling division for chunk size; never below 1 so that an empty document
# (num_pages == 0) yields an empty list instead of a zero range() step.
chunk_size = max(1, -(-num_pages // num_threads))
page_ranges = [
    list(range(start, min(start + chunk_size, num_pages)))
    for start in range(0, num_pages, chunk_size)
]

print(page_ranges)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67], [68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101], [102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132]]


In [3]:
page_ranges

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33],
 [34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67],
 [68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101],
 [102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132]]

In [38]:
# Four consecutive page ranges covering pages 0..132, built from their boundaries
_bounds = (0, 34, 68, 102, 133)
page_ranges = [range(lo, hi) for lo, hi in zip(_bounds, _bounds[1:])]

In [17]:
import pymupdf4llm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [19]:
# Submit one pymupdf4llm.to_markdown call per page range to a thread pool and
# keep the futures in submission order.
# NOTE(review): PyMuPDF's documentation states it does not support
# multithreaded use — confirm this fan-out is safe or switch to processes.
pdf_path = r"C:\Users\prana\Downloads\archive\Manuals\coffee machine1.pdf"
futures = []
with ThreadPoolExecutor() as pool:
    futures.extend(
        pool.submit(pymupdf4llm.to_markdown, pdf_path, page_chunks=True, pages=page_span)
        for page_span in page_ranges
    )

Processing C:\Users\prana\Downloads\archive\Manuals\coffee machine1.pdf...
[                                        ] (0/3[=                                       ] ( 1/3Processing C:\Users\prana\Downloads\archive\Manuals\coffee machine1.pdf...
[                                        ] (0/34)Processing C:\Users\prana\Downloads\archive\Manuals\coffee machine1.pdf...
[                                        ] (0/3[=                                       ] ( 1/34)[=                                       ] ( 1/34)Processing C:\Users\prana\Downloads\archive\Manuals\coffee machine1.pdf...


In [33]:
# Take the result of the last submitted page range and show its first page chunk
futures[-1].result()[0]

{'metadata': {'format': 'PDF 1.4',
  'title': 'ManualsLib - Makes it easy to find manuals online!',
  'author': 'Provided By MANUALSLIB.COM - http://www.manualslib.com/',
  'subject': 'Search through 700.000 manuals online & and download pdf manuals.',
  'keywords': 'manuals, instruction manuals, user manuals, service manuals, user guides, pdf manuals, owners manuals, installation guides ',
  'creator': 'pdftk 1.44 - www.pdftk.com',
  'producer': 'itext-paulo-155 (itextpdf.sf.net-lowagie.com)',
  'creationDate': 'D:20150810100907Z',
  'modDate': 'D:20150810100907Z',
  'trapped': '',
  'encryption': None,
  'file_path': 'C:\\Users\\prana\\Downloads\\archive\\Manuals\\coffee machine1.pdf',
  'page_count': 133,
  'page': 103},
 'toc_items': [],
 'tables': [],
 'images': [{'number': 29,
   'bbox': Rect(78.53807830810547, 306.070068359375, 167.00527954101562, 522.8566284179688),
   'transform': (88.46719360351562,
    0.0,
    -0.0,
    216.7865753173828,
    78.53807830810547,
    306.0700