In [None]:
# 改用 toml
import toml
import os
from openai import OpenAI
# 下載數據集
from datasets import load_dataset
import pandas as pd


# Read credentials from the local secrets.toml file.
secrets = toml.load("secrets.toml")

# MongoDB Atlas connection string, reused by the later cells.
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]
# OpenAI() is constructed without an explicit key below, so the key is exported
# through the environment first (presumably where the client looks it up).
os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]
openai_client = OpenAI()

# Load the train split of the ragas-wikiqa dataset.
data = load_dataset("explodinggradients/ragas-wikiqa", split="train")
# Convert to a DataFrame and show it as the cell output.
df = pd.DataFrame(data)
df

In [None]:
# 切割文件
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut each context passage into chunks. It is built through
# the tiktoken-based constructor with the "cl100k_base" encoding, so
# chunk_size / chunk_overlap are presumably measured in tokens of that
# encoding rather than characters — TODO confirm against the langchain docs.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    keep_separator=False,
    chunk_size=200,
    chunk_overlap=30
)


# 自定義切割函數
def split_texts(texts):
    """Split every text in ``texts`` and return all chunk strings in one flat list.

    Each text is passed to the module-level ``text_splitter`` on its own; the
    ``page_content`` of every resulting chunk is collected in input order.
    """
    return [
        piece.page_content
        for text in texts
        for piece in text_splitter.create_documents([text])
    ]


# Chunk each row's context passages, then flatten the per-row chunk lists into
# a single list of chunk strings (`docs`) that the embedding cell consumes.
df["chunks"] = df["context"].apply(split_texts)
all_chunks = df["chunks"].tolist()
docs = [item for chunk in all_chunks for item in chunk]
# Show only a preview: echoing the whole list floods the notebook output.
docs[:5]

In [None]:
from pymongo import MongoClient
from tqdm.auto import tqdm
# SSL
import certifi

# MongoDB client; certifi provides the CA bundle used for TLS.
client = MongoClient(ATLAS_CONNECTION_STRING, tlsCAFile=certifi.where())
DB_NAME = "ragas_evals"
db = client[DB_NAME]
# Number of chunks sent per embedding request.
batch_size = 128

# Embedding models to compare.
EVAL_EMBEDDING_MODELS = ["text-embedding-ada-002", "text-embedding-3-small"]

def get_embeddings(docs, model):
    """Return one embedding vector per string in ``docs`` using ``model``.

    Args:
        docs: Sequence of text strings to embed.
        model: Name of the OpenAI embedding model to call.

    Returns:
        A list of embedding vectors, in the order the API response lists them.
        An empty input yields an empty list without calling the API.
    """
    # Nothing to embed: skip the request instead of sending an empty input.
    if not docs:
        return []
    # Newlines are flattened to spaces before the texts are sent.
    docs = [doc.replace("\n", " ") for doc in docs]
    response = openai_client.embeddings.create(input=docs, model=model)
    return [r.embedding for r in response.data]

# For every embedding model: embed all chunks batch by batch, then replace the
# contents of the collection named after the model with the embedded chunks.
for model in EVAL_EMBEDDING_MODELS:
    embedded_docs = []
    print(f"Getting embeddings for the {model} model")
    for start in tqdm(range(0, len(docs), batch_size)):
        # Slicing clamps at the end of the list, so the last batch may be short.
        batch = docs[start:start + batch_size]
        batch_embeddings = get_embeddings(batch, model)
        # Pairing below relies on one embedding per input text; fail loudly
        # rather than silently dropping or misaligning records.
        if len(batch_embeddings) != len(batch):
            raise ValueError(
                f"Expected {len(batch)} embeddings, got {len(batch_embeddings)}"
            )
        embedded_docs.extend(
            {"text": text, "embedding": embedding}
            for text, embedding in zip(batch, batch_embeddings)
        )
    print(f"Finished getting embeddings for the {model} model")

    collection = db[model]
    # Clear previous contents so re-running the cell does not duplicate chunks.
    collection.delete_many({})
    # insert_many rejects an empty document list, so skip it when there is
    # nothing to store.
    if embedded_docs:
        collection.insert_many(embedded_docs)
    print(f"Finished inserting embeddings for the {model} model")

確認連線正常

In [None]:
# import certifi
# from pymongo import MongoClient

# client = MongoClient(
#     ATLAS_CONNECTION_STRING,
#     tlsCAFile=certifi.where(),
#     serverSelectionTimeoutMS=5000,  # 可選配置
#     socketTimeoutMS=5000,           # 可選配置
#     connectTimeoutMS=5000           # 可選配置
# )

# # 測試連接
# try:
#     print(client.list_database_names())
# except Exception as e:
#     print(f"Error connecting to MongoDB Atlas: {e}")


正式進行連線

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_core.vectorstores import VectorStoreRetriever
from datasets import Dataset
from ragas import evaluate, RunConfig
from ragas.metrics import context_precision, context_recall
import nest_asyncio
from pymongo import MongoClient
import certifi

# MongoDB client for the evaluation run. Server certificates are verified
# against certifi's CA bundle; certificate validation is intentionally NOT
# disabled, since that would defeat the purpose of supplying a CA file.
client = MongoClient(
    ATLAS_CONNECTION_STRING,
    tls=True,
    tlsCAFile=certifi.where(),
)
# Basic connectivity check: a ping forces a real round trip to the server.
# The result gets its own name so `db` only ever refers to the database handle.
try:
    ping_result = client.admin.command('ping')
    print("MongoDB 連接成功:", ping_result)
except Exception as e:
    print("MongoDB 連接失敗:", e)

# Database holding one collection per embedding model.
DB_NAME = "ragas_evals"
db = client[DB_NAME]

# Embedding models to evaluate.
EVAL_EMBEDDING_MODELS = [
    "text-embedding-ada-002",
    "text-embedding-3-small",
]


# 定義獲取檢索器的函數
def get_retriever(model, k):
    """Build a similarity retriever over the collection that stores ``model``'s embeddings.

    Args:
        model: OpenAI embedding model name; also the name of the collection
            inside ``DB_NAME`` that is searched.
        k: Number of documents to return per query.

    Returns:
        A retriever backed by MongoDB Atlas Vector Search, using the
        "vector_index" index and the "text" field as document content.
    """
    embeddings = OpenAIEmbeddings(model=model)
    # Build the client explicitly so it uses the same certifi CA bundle as the
    # other connections in this notebook; `from_connection_string` creates its
    # own client and offers no way to pass TLS options.
    mongo_client = MongoClient(
        ATLAS_CONNECTION_STRING,
        tlsCAFile=certifi.where(),
    )
    vector_store = MongoDBAtlasVectorSearch(
        collection=mongo_client[DB_NAME][model],
        embedding=embeddings,
        index_name="vector_index",
        text_key="text",
    )
    return vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )


# Pull the evaluation columns out of the DataFrame as plain lists.
QUESTIONS = df["question"].to_list()
# Debug output
print(f'QUESTIONS 的內容 {QUESTIONS}')
GROUND_TRUTH = df["correct_answer"].tolist()
print(f'GROUND_TRUTH 的內容 {GROUND_TRUTH}')
# Allow nested use of asyncio event loops.
nest_asyncio.apply()
# Evaluate retrieval quality once per embedding model.
for model in EVAL_EMBEDDING_MODELS:
    print(f"進行 model: {model}")
    retriever = get_retriever(model, 2)
    print("Retriever 建立完成")
    # One entry per question, in question order. Kept in a fresh list per model
    # so results from different models never mix.
    contexts = []
    for number, question in enumerate(tqdm(QUESTIONS), start=1):
        print(f"進行 question {number}/{len(QUESTIONS)}")
        try:
            # Distinct name: `docs` is the chunk corpus built earlier in the
            # notebook and must not be overwritten here.
            retrieved_docs = retriever.get_relevant_documents(question)
            print(f"Retrieved {len(retrieved_docs)} documents for question {number}")
            contexts.append([doc.page_content for doc in retrieved_docs])
        except Exception as e:
            print(f"Error retrieving documents for question {number}: {e}")
            # Keep the columns aligned: a failed retrieval still needs a row,
            # otherwise `contexts` ends up shorter than the other columns and
            # later answers are paired with the wrong question.
            contexts.append([])
    print("All questions processed")
    # Separate name so the `data` dataset loaded at the top stays intact.
    eval_data = {
        "question": QUESTIONS,
        "ground_truth": GROUND_TRUTH,
        "contexts": contexts,
    }
    dataset = Dataset.from_dict(eval_data)
    run_config = RunConfig(max_workers=4, max_wait=180)
    # Score the retrieved contexts with ragas.
    result = evaluate(
        dataset=dataset,
        metrics=[context_precision, context_recall],
        run_config=run_config,
        raise_exceptions=False,
    )
    print(f"{model} 模型的結果：{result}")

檢查向量維度

In [None]:
from langchain_openai import OpenAIEmbeddings

# Model whose embedding size is being checked; swap in
# "text-embedding-ada-002" to check the other model.
model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=model_name)

# Embed one short sample and report the length of the returned vector.
sample_text = "Hello, world!"
embedding_vector = embeddings.embed_query(sample_text)
print(f"向量維度: {len(embedding_vector)}")


檢查數據集位置

In [None]:
from datasets import load_dataset

# Load the dataset under its own name instead of relying on the global `data`,
# which the evaluation cell rebinds to a plain dict.
wikiqa_dataset = load_dataset("explodinggradients/ragas-wikiqa", split="train")

# Report the first local cache file backing the dataset, if there is one.
cache_files = wikiqa_dataset.cache_files
if cache_files:
    dataset_path = cache_files[0]['filename']
    print(dataset_path)
else:
    dataset_path = None
    print("Dataset has no local cache files")


基本連線測試

In [None]:
from pymongo import MongoClient
import certifi
# 改用 toml
import toml

# Load the MongoDB Atlas connection string from secrets.toml.
secrets = toml.load("secrets.toml")
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]

try:
    client = MongoClient(ATLAS_CONNECTION_STRING, tlsCAFile=certifi.where())
    db = client.test
    # MongoClient connects lazily, so constructing it proves nothing; a ping
    # forces a real round trip before success is reported.
    client.admin.command("ping")
    print("成功連接到 MongoDB Atlas")
except Exception as e:
    print(f"連接失敗：{e}")


異步連線測試

In [None]:
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
import certifi
# 改用 toml
import toml

# Load the MongoDB Atlas connection string from secrets.toml.
secrets = toml.load("secrets.toml")
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]


async def test_connection():
    """Check that the async (Motor) client can actually reach MongoDB Atlas.

    Prints a success message after a successful ping, or the error otherwise.
    Errors are reported, not raised.
    """
    client = None
    try:
        client = AsyncIOMotorClient(ATLAS_CONNECTION_STRING, tlsCAFile=certifi.where())
        # Creating the client does not contact the server; awaiting a ping does.
        await client.admin.command("ping")
        print("成功連接到 MongoDB Atlas")
    except Exception as e:
        print(f"連接失敗：{e}")
    finally:
        # Release the client's connections once the check is done.
        if client is not None:
            client.close()


# NOTE(review): inside a running Jupyter event loop, run_until_complete
# presumably only works after nest_asyncio.apply() has been called (as the
# evaluation cell does) — confirm when running this cell on a fresh kernel.
loop = asyncio.get_event_loop()
loop.run_until_complete(test_connection())


禁用 IPv6 連線測試

In [None]:
from pymongo import MongoClient
import certifi
# 改用 toml
import toml

# Load the MongoDB Atlas connection string from secrets.toml.
secrets = toml.load("secrets.toml")
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]

try:
    # All three timeouts are set to 5 seconds so a failing connection is
    # reported quickly.
    client = MongoClient(
        ATLAS_CONNECTION_STRING,
        tlsCAFile=certifi.where(),
        serverSelectionTimeoutMS=5000,
        socketTimeoutMS=5000,
        connectTimeoutMS=5000
    )
    db = client.test
    # MongoClient connects lazily; without a real command the timeouts above
    # are never exercised and success would be printed unconditionally.
    client.admin.command("ping")
    print("成功連接到 MongoDB Atlas")
except Exception as e:
    print(f"連接失敗：{e}")


當前腳本中連線測試

In [None]:
from pymongo import MongoClient
import certifi
import toml
# Load the MongoDB Atlas connection string from secrets.toml.
secrets = toml.load("secrets.toml")
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]
try:
    client = MongoClient(
        ATLAS_CONNECTION_STRING,
        tlsCAFile=certifi.where(),
    )
    # DB_NAME is not defined in this cell; it must already exist from an
    # earlier cell, otherwise the resulting NameError is reported below as a
    # connection failure.
    db = client[DB_NAME]
    # MongoClient connects lazily, so ping to verify the server is reachable
    # before reporting success.
    client.admin.command("ping")
    print("成功連接到 MongoDB Atlas")
except Exception as e:
    print(f"連接失敗：{e}")
