In [92]:
import os
import time

In [93]:
import logging
from typing import List, Optional
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders.pdf import PyMuPDFLoader
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.chains.llm import LLMChain


In [94]:
index_name = "gen-ai-rag"

In [95]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def pdf_loader(path: str, glob_pattern: str = "*.pdf", loader_cls=PyMuPDFLoader) -> Optional[List[dict]]:

    try:
        logger.info(f"Starting to load documents from '{path}' with pattern '{glob_pattern}'")
        
        # Check if the directory exists
        if not os.path.isdir(path):
            logger.error(f"The directory '{path}' does not exist.")
            return None
        
        # Load the PDF files
        loader = DirectoryLoader(path, glob=glob_pattern, loader_cls=loader_cls)
        documents = loader.load()

        logger.info(f"Successfully loaded {len(documents)} documents.")

        return documents

    except FileNotFoundError as fnf_error:
        logger.error(f"File not found error: {fnf_error}")
    except AttributeError as attr_error:
        logger.error(f"Attribute error: {attr_error}. Check the structure of loaded documents.")
    except Exception as e:
        logger.error(f"An error occurred while loading PDF: {e}")

    return None

extracted_data = pdf_loader("D:\\Gen_AI\\END-TO-END-GenAI-RAG-APP\\data")


INFO:__main__:Starting to load documents from 'D:\Gen_AI\END-TO-END-GenAI-RAG-APP\data' with pattern '*.pdf'


INFO:__main__:Successfully loaded 2 documents.


In [96]:
def text_split(data):
    text_splitter = CharacterTextSplitter(chunk_size=120, chunk_overlap=0)
    text_chunks = text_splitter.split_documents(data)
    
    return text_chunks

docs = text_split(extracted_data)

In [97]:
model = "sentence-transformers/all-MiniLM-L6-v2"
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name=model)
    return embeddings

In [98]:
embeddings = download_hugging_face_embeddings()

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [100]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
        docs,
        index_name=index_name,
        embedding=embeddings
    )

INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Download\\anaconda_exe\\install\\envs\\genairag\\lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone


In [136]:
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
if The data does not exist in the database then just say data is not present on the database,don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [137]:
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [138]:
config = {'max_new_tokens': 100, 'repetition_penalty': 1.1}
llm = CTransformers(
    model="../model/llama-2-7b-chat.ggmlv3.q2_K.bin",
    model_type="llama",
    config=config
)

In [139]:
# llm_chain = llm | PROMPT

In [140]:
qa = RetrievalQA.from_chain_type(  
    llm=llm,  
    chain_type="stuff",  
    retriever=vectorstore_from_docs.as_retriever(),
)

In [141]:
# Prepare the context and query
query = "Who is Asutosh Sidhya?"

# Call the invoke method with the correct input keys
result = qa.invoke({"query": query})  # Ensure you include context if required
print(result)



{'query': 'Who is Asutosh Sidhya?', 'result': " I don's 109 days ago were finalized for Clarify friend sat on the question: I donated their questions about the question at Brain were walking towards fellow students, Asutia Sidhuda discussed their questionThe project on February \nWhat is here walked back to Questions, but they encountered him with the question: I don's were working in ASutosh sat inquired about A TAsutya walked across from a friend. I don's"}
