<a href="https://colab.research.google.com/github/sugarforever/LangChain-Tutorials/blob/main/LangChain_PDF_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

该Python notebook利用LangChain的QA chain，结合Chroma向量数据库，实现对PDF文档`Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf`的语义化搜索。

该PDF文档共61页。通过本notebook，我们演示为这一字数规模的文件建立语义化索引所需的OpenAI API开销。

使用时，在本地创建`.env`，并如`.env.example`所示，设置有效的OpenAI API Key即可。notebook从环境变量`OPENAI_API_KEY`读取该Key，请确保运行前该变量在当前运行环境中可用。

In [None]:
%pip install openai > /dev/null
%pip install chromadb > /dev/null
%pip install langchain > /dev/null

In [None]:
from langchain.document_loaders import PyMuPDFLoader

In [None]:
# Default PDF indexed by this notebook.
PDF_NAME = 'Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf'


def load_pdf(path=None):
    """Load a PDF with PyMuPDFLoader and return the loader's result.

    Args:
        path: Location of the PDF to load. When omitted, the module-level
            ``PDF_NAME`` is used, so a bare ``load_pdf()`` call behaves
            exactly as before.

    Returns:
        Whatever ``PyMuPDFLoader(path).load()`` returns.
    """
    # Resolve the default at call time so that re-assigning PDF_NAME in a
    # later cell is honoured without re-running this definition.
    pdf_path = PDF_NAME if path is None else path
    return PyMuPDFLoader(pdf_path).load()

In [None]:
# Load the PDF once; the following cells read the result through `docs`.
docs = load_pdf()

In [None]:
# Summarise what was loaded: number of documents, size of the first one, and
# the combined character count across all of them.
print(f'You have {len(docs)} document(s) in your data')
print(f'There are {len(docs[0].page_content)} characters in the first page of your document')

total = sum(len(page.page_content) for page in docs)
print(f'There are {total} characters in your document')

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure the splitter with chunk_size=1000 and no overlap between
# consecutive chunks, then split every loaded document into chunks.
# NOTE(review): the size is presumably measured in characters (the splitter's
# default length function) — confirm against the LangChain docs.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
split_docs = text_splitter.split_documents(docs)

In [None]:
# Report how many chunks the splitter produced.
print (f'Now you have {len(split_docs)} documents')

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os

def _read_dotenv_value(name, path=".env"):
    """Return the value assigned to *name* in a dotenv-style file, or None.

    Only the simple ``KEY=VALUE`` form is understood: blank lines and lines
    starting with ``#`` are skipped, an optional leading ``export `` is
    dropped, and one layer of surrounding quotes is removed from the value.
    Inline comments after the value are not stripped.

    Args:
        name: Variable name to look up.
        path: File to read; defaults to ``.env`` in the working directory.

    Returns:
        The value as a string, or ``None`` when the file or the key is absent.
    """
    try:
        with open(path, encoding="utf-8") as env_file:
            for raw_line in env_file:
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue
                key, sep, value = line.partition("=")
                key = key.strip()
                if key.startswith("export "):
                    key = key[len("export "):].strip()
                if sep and key == name:
                    return value.strip().strip("'\"")
    except FileNotFoundError:
        # A missing .env is expected when the key is exported in the shell.
        return None
    return None


# Prefer the process environment; fall back to the local .env file that the
# notebook's instructions ask the user to create, since nothing else loads it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or _read_dotenv_value("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Still a KeyError, as before, but with an actionable message.
    raise KeyError(
        "OPENAI_API_KEY is not set; export it or add it to a local .env file"
    )

In [None]:
# Embedding client, authenticated with the key read above; it is passed to
# Chroma both when building the index and when re-opening it.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [None]:
# Directory and collection name passed to Chroma when the index is built and
# again when it is re-opened, so both steps address the same stored data.
persist_directory = 'starknet'
collection_name = 'starknet_index'

In [None]:
from langchain.callbacks import get_openai_callback

In [None]:
# Build the Chroma index from the split chunks inside the OpenAI callback
# context, persist it to `persist_directory`, and print the usage summary the
# callback collected.
# NOTE(review): confirm that get_openai_callback accounts for embedding
# requests; if it only tracks LLM calls the printed cost may read zero.
with get_openai_callback() as cb:
    vectorstore = Chroma.from_documents(split_docs, embeddings, collection_name=collection_name, persist_directory=persist_directory)
    vectorstore.persist()
    print(cb)


In [None]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [None]:
# Completion model used to answer the question; temperature is fixed at 0.
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

# Question-answering chain of type "stuff", driven by the model above.
chain = load_qa_chain(llm, chain_type="stuff")

# Load the vectorstore from disk
vectordb = Chroma(collection_name=collection_name, persist_directory=persist_directory, embedding_function=embeddings)

query = "What is starknet?"
# Search the store that was just re-opened from disk rather than the
# in-memory `vectorstore` from the indexing cell, so this cell also works in
# a fresh session where the (paid) indexing step is skipped.
# Note: this rebinds `docs` from the loaded PDF pages to the 3 search hits.
docs = vectordb.similarity_search(query, 3, include_metadata=True)

In [None]:
# Inspect the chain's `document_prompt` attribute (presumably the template
# used to render each input document — confirm in the LangChain docs).
print(chain.document_prompt)

In [None]:
# Print the metadata attached to each document returned by the search.
for doc in docs:
    print(doc.metadata)

In [None]:
# Print the chain's prompt length for the retrieved documents and the same
# question that is sent to the model, reusing `query` so the two never drift.
print(chain.prompt_length(docs, question=query))

In [None]:
# Run the QA chain over the retrieved documents inside the OpenAI callback
# context, printing the answer followed by the usage summary the callback
# collected for this call.
with get_openai_callback() as cb:
    print(chain.run(input_documents=docs, question=query))
    print(cb)