In [None]:
from langchain.document_loaders import PyPDFLoader

def documents_loader(path: str):
    """
    Read the PDF found at ``path`` and hand back its loaded documents.
    Args:
        path (str): Location of the PDF passed to ``PyPDFLoader``.
    Returns:
        list: Whatever ``PyPDFLoader(path).load()`` produces.
    """
    # Build the loader and read it in one step; nothing else needs the loader.
    return PyPDFLoader(path).load()


In [8]:
# Load the encyclopedia PDF (path is relative to the notebook's working
# directory) and report how many documents the loader returned.
# NOTE(review): presumably one document per PDF page — confirm against the loader.
documents = documents_loader("./Data/PDF_Files/the-gale-encyclopedia-of-medicine_compress.pdf")
print(len(documents))

637


In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Break documents into chunks using a RecursiveCharacterTextSplitter.
    Args:
        documents (list): Documents to be chunked.
        chunk_size (int): Chunk size handed to the splitter, measured with ``len``.
        chunk_overlap (int): Overlap between chunks handed to the splitter.
    Returns:
        list: The chunks returned by the splitter's ``split_documents``.
    """
    # Collect the splitter settings in one place so they are easy to inspect.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

In [17]:
# Chunk the loaded documents (size 1000, overlap 200 — passed explicitly,
# matching split_documents' defaults) and report how many chunks resulted.
chunks = split_documents(documents, chunk_size=1000, chunk_overlap=200)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 3426
