USE GPU

This notebook implements a simple PDF document search using an open-source generative AI model.

## The Code

### Imports

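Create a Hugging Face account and generate an API access token; it is supplied to LangChain through the `HUGGINGFACEHUB_API_TOKEN` environment variable.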

In [10]:
from langchain.document_loaders import TextLoader  #for textfiles
from langchain.text_splitter import CharacterTextSplitter #text splitter
from langchain.embeddings import HuggingFaceEmbeddings #for using HugginFace models
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader  #load pdf
from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader  #load urls into docoument-loader
import pandas as pd

from warnings import filterwarnings
filterwarnings('ignore')

import os
from getpass import getpass

# Never hard-code a real token in the notebook. Reuse a token that is already
# exported in the environment; otherwise prompt for it without echoing it.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

### Load the dataset into a DataFrame

In [11]:
# Download the zip file from the Kaggle website and load the folder into Colab.
def load_dataset(main_folder='../input/celeba-dataset/'):
    """Load the attribute table ``list_attr_celeba.csv`` into a DataFrame.

    Parameters
    ----------
    main_folder : str
        Directory that contains ``list_attr_celeba.csv``. Defaults to the
        location this notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The table indexed by its ``image_id`` column, with every ``-1`` value
        replaced by ``0``.
    """
    csv_path = os.path.join(main_folder, 'list_attr_celeba.csv')

    # Build the frame without in-place mutation so re-running the cell is
    # idempotent, and return it so the caller can actually use the result.
    df_attr = pd.read_csv(csv_path).set_index('image_id')
    return df_attr.replace(to_replace=-1, value=0)

### Loading the PDF



*   https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
*   Example: use the PyPDFLoader from the LangChain library here to load our PDF file





In [12]:
def load_and_split_pdf(pdf_path):
    """Load a PDF with ``PyPDFLoader`` and return its split documents.

    Parameters
    ----------
    pdf_path : str
        Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns
    -------
    The value of ``PyPDFLoader(pdf_path).load_and_split()``.
    """
    return PyPDFLoader(pdf_path).load_and_split()

### Chunking the text



*   https://python.langchain.com/docs/modules/data_connection/document_transformers/
*   Example: use the RecursiveCharacterTextSplitter here to split the data; it works by taking a large text and splitting it based on a specified chunk size.





In [13]:
def chunk_text(text, chunk_size=512, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Parameters
    ----------
    text :
        The documents to split (despite the name, this is passed to
        ``split_documents``, not a plain string).
    chunk_size : int
        Target maximum chunk length handed to the splitter.
    chunk_overlap : int
        Overlap between consecutive chunks handed to the splitter.

    Returns
    -------
    The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Line-break style separators are tried first. The trailing ' ' and ''
    # entries are fallbacks so that a long run of text containing none of the
    # line separators can still be cut down instead of staying oversized.
    separators = ['\n', '\r\n', '\r', '\f', '\v', '\u2028', '\u2029', ' ', '']

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
    )
    return text_splitter.split_documents(text)

### Storing the Embeddings in a Vector Store:



*    https://python.langchain.com/docs/modules/data_connection/vectorstores/

*   Example: using FAISS. FAISS, short for Facebook AI Similarity Search, is a powerful library designed for efficient searching and clustering of dense vectors.




In [14]:
def store_embeddings(text_chunks, model_name=None, vector_dim=768):
    """Embed the chunks and index them in a FAISS vector store.

    Parameters
    ----------
    text_chunks :
        The document chunks to embed; must not be empty.
    model_name : str or None
        Embedding model passed to ``HuggingFaceEmbeddings``. ``None`` (the
        default) lets ``HuggingFaceEmbeddings`` pick its own default model,
        which is what this function always did before, because the argument
        used to be ignored.
    vector_dim : int
        Not read by this function; kept only so existing calls that pass it
        keep working.

    Returns
    -------
    The vector store returned by ``FAISS.from_documents``.

    Raises
    ------
    ValueError
        If ``text_chunks`` is empty (for example, a PDF with no extractable text).
    """
    if not text_chunks:
        raise ValueError("No text chunks to embed; the document appears to contain no text.")

    # Only forward model_name when the caller chose one, so the default path
    # stays identical to a bare HuggingFaceEmbeddings() call.
    if model_name is None:
        embeddings = HuggingFaceEmbeddings()
    else:
        embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Embed every chunk and build the searchable index in one step.
    return FAISS.from_documents(text_chunks, embeddings)

### Similarity Search with Open Source Model



*    Connect to the Hugging Face Hub to fetch the Flan-T5 XXL model (`google/flan-t5-xxl`).
*    Define a host of model settings for the model, such as temperature and max_length.
*    The load_qa_chain function provides a simple method for feeding documents to an LLM





In [15]:
def similarity_search(repo_id="google/flan-t5-xxl", temperature=0.95,
                      max_length=1000000, chain_type="stuff"):
    """Create the Hugging Face Hub LLM and a document QA chain built on it.

    Despite its name, this function performs no search; it only constructs
    the model wrapper and the chain.

    Parameters
    ----------
    repo_id : str
        Hub repository of the model to use (Flan-T5 XXL by default).
    temperature : float
        Forwarded to the model inside ``model_kwargs``.
    max_length : int
        Forwarded to the model inside ``model_kwargs``.
    chain_type : str
        Chain type handed to ``load_qa_chain``.

    Returns
    -------
    tuple
        ``(llm, qa_chain)``: the ``HuggingFaceHub`` instance and the chain
        returned by ``load_qa_chain``.
    """
    model_settings = {
        "temperature": temperature,
        "max_length": max_length,
    }

    hugging_face_hub = HuggingFaceHub(repo_id=repo_id, model_kwargs=model_settings)
    qa_chain = load_qa_chain(hugging_face_hub, chain_type=chain_type)

    return hugging_face_hub, qa_chain

### Creating QA Chain and Querying



*    Use the RetrievalQA chain to retrieve documents with a retriever and then use a QA chain to answer a question based on the retrieved documents.





In [16]:
def create_qa_chain(model, vector, chain_type="stuff"):
    """Build a ``RetrievalQA`` chain on top of a vector store.

    Parameters
    ----------
    model :
        The LLM handed to the chain as ``llm``.
    vector :
        Vector store; its ``as_retriever`` is called with ``k = 3`` in
        ``search_kwargs``.
    chain_type : str
        Chain type forwarded to ``RetrievalQA.from_chain_type``.

    Returns
    -------
    The chain returned by ``RetrievalQA.from_chain_type``.
    """
    top_three_retriever = vector.as_retriever(search_kwargs={"k": 3})
    return RetrievalQA.from_chain_type(
        retriever=top_three_retriever,
        llm=model,
        chain_type=chain_type,
    )

### Main function to upload a PDF and search over it

In [17]:
def main():
    """Interactively load a PDF, index it, and answer one question about it.

    Prompts for a PDF path and a question on standard input, runs the
    load -> chunk -> embed -> retrieve pipeline, and prints the question
    followed by the model's answer.
    """
    # Strip stray whitespace: a pasted path with a trailing space or newline
    # would otherwise not resolve to the file.
    pdf_path = input("Enter the path to the PDF file: ").strip()

    # Step 1: Load PDF and Split Text
    text_from_pdf = load_and_split_pdf(pdf_path)

    # Step 2: Chunk Text
    chunks_of_text = chunk_text(text_from_pdf)

    # Step 3: Store Embeddings
    vector_store = store_embeddings(chunks_of_text)

    # Step 4: Build the LLM. The standalone QA chain that is also returned is
    # not needed here, because the retrieval chain below builds its own.
    flan_t5_model, _ = similarity_search()

    # Step 5: Create QA Chain
    retrieval_qa_chain = create_qa_chain(flan_t5_model, vector_store)

    # Step 6: Answer Questions
    question = input("Enter your question: ").strip()
    answer = retrieval_qa_chain.run(question)
    print(f"{question}\n{answer}")

In [18]:
# Entry point: run the interactive pipeline only when this code is executed
# directly (its module name is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

what is the title inside the pdf ?
The Transformative Power of Yoga: A Holistic Journey to Mind -Body Wellness
