# Document Loading

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

In [5]:
def load_pdf(data: str, glob: str = "*.pdf") -> list:
    """
    Load PDF documents from a specified directory.

    Parameters:
    - data (str): The path to the directory containing PDF files.
    - glob (str, optional): File pattern handed to DirectoryLoader to select
      which files are loaded. Defaults to "*.pdf". A recursive pattern such as
      "**/*.pdf" can be passed to also pick up PDFs in subdirectories
      (see the DirectoryLoader documentation for the exact matching rules).

    Returns:
    - documents (list): Whatever DirectoryLoader.load() returns. With
      PyPDFLoader this is a list of Document objects, one per PDF page, whose
      text is in `page_content` and whose `metadata` holds the source path
      and the page index.

    Example:
    >>> documents = load_pdf("/path/to/pdf_directory")
    >>> for document in documents:
    ...     print(document.page_content)

    Note:
    The function uses the DirectoryLoader to find files matching the glob pattern.
    The PyPDFLoader is used to handle the loading of individual PDF files.

    """
    loader = DirectoryLoader(
        data,
        glob=glob,
        loader_cls=PyPDFLoader,
    )

    # load() reads and parses every matching file eagerly.
    return loader.load()

In [3]:
# Load the PDFs found in the relative "data" directory into `pages`.
pages = load_pdf(data="data")

In [6]:
# Preview only the first few loaded pages; displaying the whole list would
# dump the entire book into the cell output.
pages[:5]

[Document(page_content='mebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com\nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com\nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com\nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com\nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com', metadata={'source': 'data\\Shorter Oxford Textbook of Psychiatry.pdf', 'page': 0}),
 Document(page_content='   i\nShorter Oxford Textbook  \nof\xa0Psychiatry \nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com\nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com\nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com\nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com\nmebooksfree.com mebooksfree.com mebooksfree.com mebooksfree.com', metadata={'source': 'data\\Shorter Oxford Textbook of Psychiatry.pdf', 'page': 1}),
 Document(page_content='ii\nmebooksfree.com mebooksfree.com mebooksfree.com mebo

### Data Cleaning

In [7]:
# Remove the "mebooksfree.com" watermark from the extracted page text.
#
# In the loaded pages the watermark shows up as a trailing block of five
# lines, each repeating the domain four times (5 * 63 characters plus 4
# newlines = 319 characters). Rather than blindly dropping the last 319
# characters, the block is removed only when a page actually ends with it.
# That way pages without the watermark keep their text, and re-running this
# cell does not eat further into the real content.
WATERMARK_LINE = " ".join(["mebooksfree.com"] * 4)
WATERMARK = "\n".join([WATERMARK_LINE] * 5)


def _strip_watermark(text, watermark=WATERMARK):
    """Return `text` without a trailing `watermark`; any other text is returned unchanged."""
    # Guard against an empty watermark: text[:-0] would wipe the whole string.
    if watermark and text.endswith(watermark):
        return text[:-len(watermark)]
    return text


# Documents are updated in place, so `pages` holds the cleaned text afterwards.
for document in pages:
    document.page_content = _strip_watermark(document.page_content)

In [8]:
# Preview the first few pages to confirm the watermark block is gone;
# displaying the whole list would dump the entire book into the cell output.
pages[:5]

[Document(page_content='', metadata={'source': 'data\\Shorter Oxford Textbook of Psychiatry.pdf', 'page': 0}),
 Document(page_content='   i\nShorter Oxford Textbook  \nof\xa0Psychiatry \n', metadata={'source': 'data\\Shorter Oxford Textbook of Psychiatry.pdf', 'page': 1}),
 Document(page_content='ii\n', metadata={'source': 'data\\Shorter Oxford Textbook of Psychiatry.pdf', 'page': 2}),
 Document(page_content='1   iii\nShorter Oxford \nTextbook of \nPsychiatry\nSEVENTH EDITION\nPaul Harrison\nPhilip\xa0Cowen\nTom\xa0Burns\nMina\xa0Fazel  \n', metadata={'source': 'data\\Shorter Oxford Textbook of Psychiatry.pdf', 'page': 3}),
 Document(page_content='1iv\nGreat Clarendon Street, Oxford,  OX2\xa06DP,\nUnited Kingdom\nOxford University Press is a department of the University of Oxford.\nIt furthers the University’s objective of excellence in research, scholarship,\nand education by publishing worldwide. Oxford is a registered trade mark\xa0of\nOxford University Press in the UK and in certai