In [1]:
import os

Load PDF data

In [2]:
from pypdf import PdfReader

def load_pdf(file_path: str, page_separator: str = "") -> str:
    """
    Reads the text content from a PDF file and returns it as a single string.

    Parameters:
    - file_path (str): The file path to the PDF file.
    - page_separator (str): Text inserted between consecutive pages. Defaults to
      an empty string, so pages are concatenated directly. Pass a newline when
      the result is fed to a line-based splitter, so the first line of a page is
      not glued to the last line of the previous page.

    Returns:
    - str: The text content of all pages in the PDF, joined by `page_separator`.
    """
    reader = PdfReader(file_path)

    # Collect the text of every page first and join once, instead of growing a
    # string with += inside the loop. `or ""` is a defensive guard: a page that
    # yields no text (None) contributes an empty string instead of raising.
    page_texts = [page.extract_text() or "" for page in reader.pages]

    return page_separator.join(page_texts)

In [4]:
# Location of the source PDF. Set the PDF_PATH environment variable to point the
# notebook at a different file; the fallback is the original local path, so
# behavior is unchanged when the variable is not set.
pdf_path = os.environ.get("PDF_PATH", "D:\\chatbot\\pdf-query\\rag-pdf-chat\\We.pdf")
pdf_text = load_pdf(file_path=pdf_path)

Store knowledge in Pinecone

In [5]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Pinecone client, authenticated with the API key read from the environment.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "docs"

# Create the serverless index only if it is not there yet, so re-running this
# cell does not attempt to create it a second time.
if not pc.has_index(index_name):
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # Presumably the vector size produced by the embedding model used later
        # in the notebook -- verify if the model is changed.
        dimension=1024,
        metric="cosine",
        spec=serverless_spec,
    )


  from tqdm.autonotebook import tqdm


Chunk the content based on H2 headers

In [6]:
from langchain_pinecone import PineconeEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import MarkdownHeaderTextSplitter
import os
import time

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk the document based on h2 headers.
markdown_document = pdf_text
headers_to_split_on = [
    ("##", "Header 2")
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Text extracted from a PDF may contain few or no "##" lines, in which case the
# header split yields very large chunks (possibly the whole document as one).
# Bound the chunk size with a second, character-based pass; header metadata from
# the first pass is carried over onto the smaller chunks.
# NOTE(review): chunk_size/chunk_overlap are starting values chosen to keep each
# chunk comfortably within the embedding model's input limit -- tune as needed.
chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = chunk_splitter.split_documents(md_header_splits)

# Initialize a LangChain embedding object.
model_name = "multilingual-e5-large"
embeddings = PineconeEmbeddings(
    model=model_name,
    pinecone_api_key=os.environ.get("PINECONE_API_KEY")
)

# Embed each chunk and upsert the embeddings into your Pinecone index.
# `index_name` is the name defined in the index-creation cell ("docs"), reused
# here so the two cells cannot drift apart.
# NOTE(review): re-running this cell presumably upserts the chunks again;
# clear the namespace first if duplicates matter.
docsearch = PineconeVectorStore.from_documents(
    documents=chunks,
    index_name=index_name,
    embedding=embeddings,
    namespace="wondervector5000"
)

# Brief pause after the upsert -- presumably to let the new records become
# visible before the next cell reads them; confirm whether 1s is enough.
time.sleep(1)


Use Pinecone’s list and query operations to look at one of the records:

In [7]:
index = pc.Index(index_name)
namespace = "wondervector5000"

# index.list() yields pages of record IDs. Only one record is needed here, so
# query the first ID of the first non-empty page and stop, rather than issuing
# one query (and printing one full vector) per page.
for ids in index.list(namespace=namespace):
    if not ids:
        continue
    query = index.query(
        id=ids[0],
        namespace=namespace,
        top_k=1,
        include_values=True,
        include_metadata=True
    )
    print(query)
    break
else:
    # Reached only when no page contained an ID.
    print(f"No records found in namespace {namespace!r}.")

Use the chatbot

In [8]:
from langchain.chains import RetrievalQA 
from langchain_openai import ChatOpenAI

# Plain chat model: answers come from the LLM alone, with no context
# retrieved from Pinecone.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.0,
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
)

# Retrieval-augmented chain: the same LLM, answering with documents fetched
# through the vector store's retriever ("stuff" chain type).
qa = RetrievalQA.from_chain_type(
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

# Sample questions used to compare the two setups.
query1 = """What is main purpose of information given?"""

query2 = """The Neural Fandango Synchronizer is giving me a 
headache. What do I do?"""

# Ask every question twice: first through the retrieval chain (with knowledge
# from Pinecone), then directly to the LLM (without it).
for heading, question in (("Query 1\n", query1), ("\nQuery 2\n", query2)):
    print(heading)
    print("Chat with knowledge:")
    grounded_answer = qa.invoke(question).get("result")
    print(grounded_answer)
    print("\nChat without knowledge:")
    ungrounded_answer = llm.invoke(question).content
    print(ungrounded_answer)

Query 1

Chat with knowledge:
I don't have enough context to provide a specific answer. Could you please provide more details or specify which information you are referring to?

Chat without knowledge:
The main purpose of the information given is to provide knowledge, answer questions, or convey a message to the reader or audience. It may also be intended to educate, inform, persuade, entertain, or inspire. Ultimately, the purpose of the information will depend on the context in which it is presented and the goals of the communicator.

Query 2

Chat with knowledge:
I don't know the answer to that question.

Chat without knowledge:
If the Neural Fandango Synchronizer is giving you a headache, it is important to stop using it immediately and give yourself a break. Take some time to rest and relax, drink plenty of water, and consider taking over-the-counter pain medication if needed. If the headache persists or worsens, it is recommended to consult a healthcare professional for further ad