In [2]:
import os
import sys
sys.path.append('../..')

from py3810.myUtils import pickle_dump, pickle_load

# Base directory used for the Lumen pickle dumps, and its "docs/" subdirectory
path_lumen_dump = "../langchain/docs/lumen/"
path_lumen_docs = f"{path_lumen_dump}docs/"

In [2]:
# Load the previously saved list of Lumen PDF URLs and display it.
# NOTE(review): pickle_load presumably unpickles the file - only load dumps from a trusted source.
lumen_urls_pdf = pickle_load(
  path_pickle_dump=path_lumen_dump,
  filename_pickle='lumen_urls_pdf',
)
lumen_urls_pdf

['https://www.lumenoptometric.com/wp-content/uploads/2020/05/Intro_ScleralLenses.pdf',
 'https://www.lumenoptometric.com/wp-content/uploads/2020/07/Lumen-HPA300-NP-Template.pdf']

In [3]:
import re

def remove_newlines(text):
  """
  Collapse line breaks and runs of whitespace in ``text`` into single spaces.

  Any whitespace run that contains a newline, or that is two or more
  characters long, is replaced by exactly one space. The result is then
  stripped of leading and trailing whitespace. A lone non-newline whitespace
  character (for example a single tab between two words) is left unchanged.

  Args:
      text: The string to clean.

  Returns:
      The cleaned string.
  """
  # r'\s*\n\s*' consumes a newline together with the whitespace around it, so
  # "a\n\nb" and "a\n b" become "a b" instead of leaving doubled spaces behind
  # (a bare r'\n' alternative would replace each newline separately).
  # r'\s{2,}' then handles the remaining newline-free runs.
  return re.sub(r'\s*\n\s*|\s{2,}', ' ', text).strip()

# Demo: show a messy string next to its cleaned version
text = "   \n  This is a string  \n  with   multiple \n \n \nnewlines.  "
text_cleaned = remove_newlines(text)
print(
  f'text: {text}',
  '=========',
  f'text_cleaned: {text_cleaned}',
  sep='\n',
)

text:    
  This is a string  
  with   multiple 
 
 
newlines.  
text_cleaned: This is a string with multiple newlines.


In [4]:
import langchain  # Assuming langchain is installed
from langchain_community.document_loaders import PyPDFLoader

def convert_pdfs_to_langchain(pdf_urls):
  """
  Load PDFs with PyPDFLoader and return their non-empty, cleaned documents.

  Each URL is handed to ``PyPDFLoader``; every document the loader returns
  has its ``page_content`` normalised with ``remove_newlines``. Documents
  whose content is empty after cleaning (blank or whitespace-only pages) are
  skipped. A failure while processing one URL is reported with ``print`` and
  does not stop the remaining URLs from being processed.

  Args:
      pdf_urls: A list of URLs pointing to PDF files.

  Returns:
      A list of Langchain Document objects, in URL order and then in the
      order the loader returned them, with ``page_content`` cleaned in place.
  """
  langchain_docs = []
  for url in pdf_urls:
    try:
      # The URL is passed straight to PyPDFLoader, which is responsible for
      # fetching and parsing the file.
      docs = PyPDFLoader(url).load()
      for doc in docs:
        cleaned = remove_newlines(doc.page_content)
        # Check the cleaned text rather than the raw text, so pages that
        # contain only whitespace are dropped instead of being kept as "".
        if cleaned:
          doc.page_content = cleaned
          langchain_docs.append(doc)
    except Exception as e:
      # Deliberately best-effort: report the failing URL and continue with
      # the next one. Documents collected before the error are kept.
      print(f"Error processing PDF from {url}: {e}")
  return langchain_docs

# Example usage (same as before)
# Example: run the converter on the two Lumen PDF URLs
pdf_urls = [
  'https://www.lumenoptometric.com/wp-content/uploads/2020/05/Intro_ScleralLenses.pdf',
  'https://www.lumenoptometric.com/wp-content/uploads/2020/07/Lumen-HPA300-NP-Template.pdf',
]
langchain_documents = convert_pdfs_to_langchain(pdf_urls)

# Preview the first 100 characters of every returned document
for doc in langchain_documents:
  print(f"Langchain document text: {doc.page_content[:100]}...")


Langchain document text: INTRODUCTION TO SCLERAL LENSESCLEAR & COMFORTABLE VISION WITHOUT COMPROMISE...
Langchain document text: EXCEPTIONALLY CLEAR, STABLE, AND COMFORTABLE vision is no longer out of reach. The future is here an...


In [5]:
# Convert every PDF URL in lumen_urls_pdf into Langchain documents
docs_pdfs = convert_pdfs_to_langchain(lumen_urls_pdf)

In [6]:
# Save the converted PDF documents as 'lumen_docs_pdfs' under path_lumen_docs
pickle_dump(
  path_pickle_dump=path_lumen_docs,
  filename_pickle='lumen_docs_pdfs',
  file_to_pickle=docs_pdfs,
)

### Add metadata to lumen_docs_pdfs

In [3]:
# _docs_pdfs = pickle_load(filename_pickle='lumen_docs_pdfs', path_pickle_dump=path_lumen_docs)

In [4]:
# _docs_pdfs

[Document(page_content='INTRODUCTION TO SCLERAL LENSESCLEAR & COMFORTABLE VISION WITHOUT COMPROMISE', metadata={'source': 'https://www.lumenoptometric.com/wp-content/uploads/2020/05/Intro_ScleralLenses.pdf', 'page': 0}),
 Document(page_content='EXCEPTIONALLY CLEAR, STABLE, AND COMFORTABLE vision is no longer out of reach. The future is here and the answer lies with scleral lenses. WILL I SEE A DIFFERENCE BETWEEN SCLERAL AND SOFT LENSES? Scleral lenses are built from highly- customizable, firm lens materials, which offer clearer vision relative to the standard soft lenses. Example: consider how vision is better through a window versus clear, flexible plastic sheeting. CAN SCLERALS BE WORN THE WHOLE DAY? They can be worn for the full course of the day. Some patients may need to remove and clean the lenses mid day to maintain the best possible comfort and vision. WILL THEY HELP WITH MY DRY EYES? The moisture chamber under the scleral lens will greatly diminish dryness, leading to much mor

In [6]:
# for _doc in _docs_pdfs:
#   print(_doc.page_content)
#   print(_doc.metadata)

INTRODUCTION TO SCLERAL LENSESCLEAR & COMFORTABLE VISION WITHOUT COMPROMISE
{'source': 'https://www.lumenoptometric.com/wp-content/uploads/2020/05/Intro_ScleralLenses.pdf', 'page': 0}
EXCEPTIONALLY CLEAR, STABLE, AND COMFORTABLE vision is no longer out of reach. The future is here and the answer lies with scleral lenses. WILL I SEE A DIFFERENCE BETWEEN SCLERAL AND SOFT LENSES? Scleral lenses are built from highly- customizable, firm lens materials, which offer clearer vision relative to the standard soft lenses. Example: consider how vision is better through a window versus clear, flexible plastic sheeting. CAN SCLERALS BE WORN THE WHOLE DAY? They can be worn for the full course of the day. Some patients may need to remove and clean the lenses mid day to maintain the best possible comfort and vision. WILL THEY HELP WITH MY DRY EYES? The moisture chamber under the scleral lens will greatly diminish dryness, leading to much more comfortable vision. HOW LONG DOES A SCLERAL LENS LAST? A se

In [8]:
# metadata = {
#   'source': 'https://www.lumenoptometric.com/wp-content/uploads/2020/05/Intro_ScleralLenses.pdf',
#   'title': 'SCLERAL LENSES CLEAR & COMFORTABLE VISION WITHOUT COMPROMISE vision is no longer out of reach',
#   'description' : 'SCLERAL LENSES FAQ answers to common questions on scleral lenses'
# }

In [9]:
# for _doc in _docs_pdfs:
  # _doc.metadata = metadata

In [10]:
# pickle_dump(file_to_pickle=_docs_pdfs, filename_pickle='lumen_docs_pdfs_', path_pickle_dump=path_lumen_docs)

In [11]:
# _docs_pdfs = pickle_load(filename_pickle='lumen_docs_pdfs_', path_pickle_dump=path_lumen_docs)

In [12]:
# _docs_pdfs

[Document(page_content='INTRODUCTION TO SCLERAL LENSESCLEAR & COMFORTABLE VISION WITHOUT COMPROMISE', metadata={'source': 'https://www.lumenoptometric.com/wp-content/uploads/2020/05/Intro_ScleralLenses.pdf', 'title': 'SCLERAL LENSES CLEAR & COMFORTABLE VISION WITHOUT COMPROMISE vision is no longer out of reach', 'description': 'SCLERAL LENSES FAQ answers to common questions on scleral lenses'}),
 Document(page_content='EXCEPTIONALLY CLEAR, STABLE, AND COMFORTABLE vision is no longer out of reach. The future is here and the answer lies with scleral lenses. WILL I SEE A DIFFERENCE BETWEEN SCLERAL AND SOFT LENSES? Scleral lenses are built from highly- customizable, firm lens materials, which offer clearer vision relative to the standard soft lenses. Example: consider how vision is better through a window versus clear, flexible plastic sheeting. CAN SCLERALS BE WORN THE WHOLE DAY? They can be worn for the full course of the day. Some patients may need to remove and clean the lenses mid day