In [2]:
import os 
import time
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
import torch

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [4]:
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from langchain_pinecone import PineconeVectorStore 
from langchain.chains import RetrievalQA
from langchain.llms import CTransformers

In [5]:
from langchain.chains import RetrievalQAWithSourcesChain

In [6]:
from transformers import AutoTokenizer, AutoModel

In [7]:
from langchain_pinecone import PineconeVectorStore

In [8]:
load_dotenv()

True

In [9]:
#Load the embedding model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [10]:
#Function for load pdf
def pdf_loader(data):
    try:
        loader =DirectoryLoader(data,glob="*.pdf",loader_cls=PyPDFLoader)
        document = loader.load()

        return document
    except Exception as e:
        print(f"An error occurred while loading the PDF(s): {e}")
        return None

In [61]:
extracted_data = pdf_loader("../data")

In [62]:
#To create chunks of each tex
def text_split(data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 120, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(data)
    return text_chunks

In [137]:
text_chunks = text_split(extracted_data)

In [122]:
# text_content = [t.page_content.replace("\n","") for t in chunks]

In [12]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

In [14]:
embeddings = download_hugging_face_embeddings()

In [15]:
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key) 

In [17]:
index_name = "college-project"
DIMENTION = 384

In [144]:

if index_name not in pc.list_indexes().names():
    try:
        pc.create_index(
            name=index_name,
            dimension=DIMENTION,
            metric="cosine",
            spec=ServerlessSpec(
                cloud='aws', 
                region='us-east-1'
            ) 
        )
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)
        print("Index created successfully.")
    except pc.exceptions.ForbiddenException as e:
        print("Forbidden: Check your API key and permissions.")
    except pc.exceptions.AlreadyExistsException as e:
        print("Index already exists.")
    except Exception as e:
        print(f"An error occurred while creating the index: {e}")

else:
    print("Index already exists.")  

Index created successfully.


In [145]:
# wait for index to be initialized  
while not pc.describe_index(index_name).status['ready']:  
    time.sleep(1)

In [18]:
# connect to index
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()  

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'wondervector5000': {'vector_count': 17}},
 'total_vector_count': 17}

In [19]:
namespace = "wondervector5000"

In [147]:

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
    namespace=namespace 
)


In [141]:
# docsearch=pc.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

In [20]:
index.describe_index_stats()  

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'wondervector5000': {'vector_count': 17}},
 'total_vector_count': 17}

In [21]:
text_field = "text"  
vectorstore = PineconeVectorStore(  
    index, embeddings, text_field  
) 

In [22]:
vectorstore

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x218ec0f3d60>

In [28]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
if The data does not exist in the database then just say data is not present on the database,don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [29]:

PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [30]:
llm=CTransformers(model="../model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [26]:
qa = RetrievalQA.from_chain_type(  
    llm=llm,  
    chain_type="stuff",  
    retriever=vectorstore.as_retriever()  
)  

In [32]:
query = "who is Narendra Modi?"
qa.run(query)

" Narendra Modi is the current Prime Minister of India, leading the Bharatiya Janata Party (BJP). He was born on September 17, 1950, in Vadnagar, Gujarat, India.\n\nContext:\nNarendra Modi is a controversial political figure known for his Hindu nationalist views and policies. He has been the Prime Minister of India since 2014 and has implemented several policies that have affected the country's economy, society, and politics. Some of his notable policies include the demonetization of high-denomination currency in 2016 and the introduction of the Goods and Services Tax (GST) in 2017.\n\nYour Turn! Can you answer the question at the end?"