In [3]:
#import all the necessary packages 
from langchain import PromptTemplate
import os
import pinecone
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from pinecone import ServerlessSpec,Pinecone
# from langchain.document_loaders import DirectoryLoader,PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

  from tqdm.autonotebook import tqdm


OpenAI Setup

In [4]:
# Load the .env file into the process environment, then read the OpenAI key
# from it (the value is None when the variable is not set).
load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [5]:
# Instantiate the OpenAI completion LLM.
# The OpenAI class exposed through `langchain.llms` is deprecated and warns on
# construction; the maintained class lives in `langchain_openai`, a package this
# notebook already uses for its embeddings, so import it from there instead.
from langchain_openai import OpenAI

llm = OpenAI(api_key=openai_api_key)

  warn_deprecated(


In [6]:
# Build the OpenAI embedding client. The same object embeds the document
# chunks for indexing and the user's question at query time.
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(api_key=openai_api_key)

Load the Document

In [7]:
def load_book(path):
    """Load the PDF files matched by ``*.pdf`` under ``path``.

    Args:
        path: Directory handed to ``DirectoryLoader`` for scanning.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory, with
        ``PyPDFLoader`` used as the per-file loader class.
    """
    return DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [8]:
# Directory that holds the source PDFs. Set the RAG_DATA_DIR environment
# variable (for example in .env) to point the notebook at another location;
# when it is unset the original machine-specific path is used, so existing
# setups keep working unchanged.
data_dir = os.getenv('RAG_DATA_DIR', 'C:/Users/Hp/OneDrive/Documents/Desktop/RAG/RAG/data/')
document = load_book(data_dir)

In [9]:
# Splitting into chunks
def split_text(documents, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        documents: Documents to split, e.g. the output of ``load_book``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as its
            ``chunk_size``. Defaults to 500, the value previously fixed here.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as its
            ``chunk_overlap``. Defaults to 20, the value previously fixed here.

    Returns:
        The chunks returned by ``split_documents`` for ``documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

In [10]:
# Chunk the loaded documents; the bare expression on the last line makes the
# notebook display how many chunks were produced.
text_chunks = split_text(document)
len(text_chunks)

7

PINECONE SETUP

In [11]:
# Pull the Pinecone settings out of the environment.
# pinecone_env_key is later passed as the ServerlessSpec region when the index is created.
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pinecone_env_key = os.environ.get('PINECONE_ENV_KEY')


In [14]:
# Pinecone integration: name the index that stores the embeddings and create the client.
index_name = 'pdfreader2'
pc = Pinecone(api_key=pinecone_api_key)

In [15]:
# Create the serverless index only when no index with this name exists yet,
# so re-running this cell against an existing index does nothing.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        # NOTE(review): 1536 presumably matches the vector size produced by the
        # OpenAI embedding model in use — confirm before changing models.
        dimension=1536,
        spec=ServerlessSpec(
            # The value read from PINECONE_ENV_KEY is used as the region name here.
            region=pinecone_env_key,
            cloud='aws'
        )
    )

In [None]:
# Create the embeddings for the text chunks.

def create_embedding(query):
    """Embed a single piece of text and append its vector to ``embedding_list``.

    Args:
        query: The text to embed.
    """
    embedding_list.append(embedding.embed_query(query))

# Embed every chunk through the batched embed_documents API instead of paying
# one embed_query round trip per chunk. The result holds one vector per chunk,
# in the same order as text_chunks.
embedding_list = embedding.embed_documents(
    [chunk.page_content for chunk in text_chunks]
)

len(embedding_list)

In [16]:
import itertools

# Handle to the Pinecone index; the upsert, query and fetch calls later in the notebook go through it.
index = pc.Index(index_name)

In [None]:
# One metadata dict per embedding, carrying the text of the chunk at the same position.
metadata_list = [
    {"text": text_chunks[position].page_content}
    for position, _ in enumerate(embedding_list)
]

In [None]:
# Assemble the upsert payload: one record per embedding with ids "id-0", "id-1", ...
# and the metadata entry from the same position attached.
data_to_upsert = [
    {'id': f"id-{idx}", 'values': vector, 'metadata': metadata_list[idx]}
    for idx, vector in enumerate(embedding_list)
]

In [None]:
def chunks(iterable, batch_size=200):
    """Yield successive tuples of at most ``batch_size`` items from ``iterable``.

    The last tuple may be shorter than ``batch_size``; nothing is yielded when
    ``iterable`` is empty.
    """
    source = iter(iterable)

    def next_batch():
        return tuple(itertools.islice(source, batch_size))

    # iter(callable, sentinel) keeps calling next_batch until it returns an empty tuple.
    yield from iter(next_batch, ())

In [None]:
# Send the records to Pinecone in batches of 200 vectors per upsert request.
for batch in chunks(data_to_upsert, batch_size=200):
    index.upsert(vectors=batch)

PromptTemplate

In [18]:
# Prompt that asks the LLM to refine a retrieved answer with respect to the question.
# It is filled with two variables: `answer` and `question`.
prompt = PromptTemplate(
    template = '''You are given a answer for a question. Refine the given answer based on the question and give relevent answer to 
    the question. Give a general answer 
    answer: {answer},
    question: {question}
    
    Please answer clearly
    ''',
    input_variables=['answer', 'question'],
)

In [19]:
# Embed the question and ask the Pinecone index for the 3 closest stored vectors,
# with the vector values included in the response.
query = 'What is Incept AI'
vector = embedding.embed_query(query)
query_result = index.query(vector=vector, top_k=3, include_values=True)

In [21]:
# Keep the id of the first match as a one-element list, the form passed to
# index.fetch(ids=...). An empty result would otherwise fail with a bare
# "list index out of range", so raise the same exception type with a message
# that says what went wrong.
if not query_result.matches:
    raise IndexError(
        f"Pinecone returned no matches for {query!r}; "
        "make sure the embeddings were upserted into the index first"
    )
matched_id = [query_result.matches[0].id]

{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

In [None]:
# Fetch the stored record for the matched id; its metadata is read in the next step.
metadata_results = index.fetch(ids=matched_id)

In [None]:
# Read the chunk text of the record that was matched and fetched. The lookup
# uses the matched id rather than a fixed one such as 'id-1': only the ids in
# matched_id were requested from the index, so any other key fails with
# KeyError whenever the top hit is a different vector.
result = metadata_results.vectors[matched_id[0]].metadata['text']

In [None]:
from langchain.chains import LLMChain

# Run the retrieved text and the original question through the refinement prompt;
# the returned value is the cell's displayed output.
chain = LLMChain(llm=llm, prompt=prompt)
chain.run(question=query, answer=result)

In [None]:
def dummy_function():
    """Print a fixed placeholder message to standard output; returns None."""
    message = 'This is the dummy function from notebook'
    print(message)