In [2]:
# os: access environment variables; pprint: pretty-print data structures
import os, pprint
# PyPDFLoader：加載 PDF 文件並將其內容轉換為可處理的文本或數據結構
from langchain_community.document_loaders import PyPDFLoader
# StrOutputParser：將模型生成的輸出解析為字串
from langchain_core.output_parsers import StrOutputParser
# RunnablePassthrough：將輸入直接傳遞到下一步而不進行任何處理的可運行單元
from langchain_core.runnables import RunnablePassthrough
# 與 MongoDB Atlas Vector Search 集成，以實現向量搜索功能
from langchain_mongodb import MongoDBAtlasVectorSearch
# ChatOpenAI 用於與 OpenAI 的聊天模型交互
# OpenAIEmbeddings 用於生成文本的向量嵌入
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# 定義和管理提示模板，以生成特定的查詢或指令
from langchain.prompts import PromptTemplate
# 將長文本拆分為較小的片段，以便於處理和分析。
from langchain.text_splitter import RecursiveCharacterTextSplitter
# MongoDB
from pymongo import MongoClient
# SSL
import certifi
# Streamlit
import streamlit as st


# Read both credentials from Streamlit's secrets store. The OpenAI key is
# exported as an environment variable; the Atlas URI is kept in a constant.
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
ATLAS_CONNECTION_STRING = st.secrets["MONGODB_URL"]

# Names of the database, collection and vector search index used below.
db_name = "MyDatabase2024"
collection_name = "MyCollection2024"
vector_search_index = "vector_index"

# Open the connection (certifi supplies the CA file for TLS), then select
# the target collection.
client = MongoClient(
    ATLAS_CONNECTION_STRING,
    tlsCAFile=certifi.where(),
)
atlas_collection = client[db_name][collection_name]

# Source PDF, referenced by URL, and the loader that reads it.
pdf_url = "https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP"
loader = PyPDFLoader(pdf_url)
# Alternative local PDFs that can be loaded instead:
# loader = PyPDFLoader("古典測量理論.pdf")
# loader = PyPDFLoader("證書及作品集_原始.pdf")
# loader = PyPDFLoader("論文01.pdf")

# Splitter settings: chunk_size=200 with chunk_overlap=20.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=200,
)

# Load the PDF, then split the loaded documents into chunks.
data = loader.load()
docs = text_splitter.split_documents(data)

# Print the first chunk so the result can be inspected.
print(docs[0], "\n")

# Create the vector store: from_documents is given the chunks, an OpenAI
# embedding model, the target Atlas collection and the search index name.
# NOTE(review): this call runs unconditionally on every execution; presumably
# each run embeds and inserts all chunks again, so repeated runs may
# accumulate duplicate documents in the collection — confirm and guard if so.
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=atlas_collection,
    index_name=vector_search_index,
)

# Run a sample similarity query against the store and pretty-print the hits.
query = "MongoDB Atlas security"
results = vector_search.similarity_search(query)
pprint.pprint(results)

# Expose the Atlas Vector Search store as a retriever
retriever = vector_search.as_retriever(
    # Use plain similarity search
    search_type="similarity",
    # Request the top 10 matches and pass a score threshold of 0.75.
    # NOTE(review): with search_type="similarity" the score_threshold entry
    # may be silently ignored; LangChain documents it for
    # search_type="similarity_score_threshold" — confirm the intended behavior
    # against the installed langchain / langchain-mongodb versions.
    search_kwargs={"k": 10, "score_threshold": 0.75},
)
# Prompt template; {context} and {question} are placeholders filled in per
# query. The Chinese text tells the model to answer the final question from
# the supplied content and to say it does not know rather than make up an
# answer.
template = """
使用以下內容來回答最後的問題。
如果你不知道答案，就說你不知道，不要試圖編造答案。
{context}
問題：{question}
"""

# Build a prompt object from the template
custom_rag_prompt = PromptTemplate.from_template(template)
# Create an OpenAI chat model with default settings
llm = ChatOpenAI()


def format_docs(docs):
    """Return the page_content of every document, joined by blank lines."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# Build the chain that answers questions about the loaded data:
# the retriever output is piped through format_docs to fill "context", the
# incoming question is passed through unchanged as "question", and the
# resulting prompt goes to the chat model and then a string output parser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# Question to ask (alternatives for the other source PDFs are kept commented out)
# question = "如何使用古典真分數理論？"
# question = "簡述這篇論文的研究方法"
question = "MongoDB Atlas auditing"

# Fetch and display the documents the retriever returns for the question.
# The retriever is called through invoke(), the same Runnable interface the
# chain uses; get_relevant_documents() is deprecated in langchain-core.
print("\n相關文檔：")
documents = retriever.invoke(question)
pprint.pprint(documents)


print("\n問題：" + question)

# Run the RAG chain on the question and print its answer
answer = rag_chain.invoke(question)
print("\n回答：" + answer)


page_content='Mong oDB Atlas Best P racticesJanuary 20 19A MongoD B White P aper' metadata={'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 0} 

[Document(page_content='Introduction\nMongoD B Atlasprovides all of the f eatures of MongoD B,\nwithout the operational heavy lifting required for any new\napplication. MongoD B Atlas is available on-demand', metadata={'_id': {'$oid': '665521bd59f1d4c273218c95'}, 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 2}),
 Document(page_content='instance size, region, and f eatures you need. MongoD B\nAtlas provides:\n•Security f eatures to protect access to your data\n•Built in replication for always-on availability , tolerating', metadata={'_id': {'$oid': '665521bd59f1d4c273218c97'}, 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 2}),
 Document(page_content='16 Security\n17 Business Intelligence with MongoD B Atlas\n18 Consideration