In [1]:
# 改用 toml
import toml
import os
from openai import OpenAI
# 下載數據集
from datasets import load_dataset
import pandas as pd


# 載入 secrets.toml 文件
secrets = toml.load("secrets.toml")

# MongoDB URI
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]
# 設置環境變數
os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]
# 初始化 OpenAI 物件
openai_client = OpenAI()

# 數據
data = load_dataset(
    "explodinggradients/ragas-wikiqa", split="train"
)
# 顯示
df = pd.DataFrame(data)
df

Unnamed: 0,question,correct_answer,incorrect_answer,question_id,generated_with_rag,context,generated_without_rag
0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,"As such, African immigrants are to be distingu...",From the Immigration and Nationality Act of 19...,Q0,\nAfrican Americans were immigrated to the Uni...,[African immigration to the United States refe...,African Americans were immigrated to the US in...
1,what are points on a mortgage,"Points, sometimes also called a ""discount poin...",Discount points may be different from originat...,Q1012,\nPoints on a mortgage are a form of pre-paid ...,"[Discount points, also called mortgage points ...",A mortgage point is a fee equal to 1% of the l...
2,how does interlibrary loan work,The user makes a request with their local libr...,Although books and journal articles are the mo...,Q102,\nInterlibrary loan works by allowing patrons ...,"[Interlibrary loan (abbreviated ILL, and somet...",Interlibrary loan is a service that allows lib...
3,WHAT IS A FY QUARTER,"A fiscal year (or financial year, or sometimes...",Fiscal years vary between businesses and count...,Q1027,\nA FY quarter is a three-month period within ...,[April.\n\n\n=== United States ===\n\n\n==== F...,A FY Quarter is a three-month period in the fi...
4,who wrote a rose is a rose is a rose,"The sentence ""Rose is a rose is a rose is a ro...",I know that in daily life we don't go around s...,Q1032,"\nGertrude Stein wrote the sentence ""A rose is...","[The sentence ""Rose is a rose is a rose is a r...","Gertrude Stein wrote ""A Rose is a Rose is a Ro..."
...,...,...,...,...,...,...,...
227,What happened during the Starving Time in Jame...,There is scientific evidence that the settlers...,"Also, the water that the colonists drank was b...",Q900,"\nDuring the Starving Time in Jamestown, the c...",[The Starving Time at Jamestown in the Colony ...,The Starving Time in Jamestown was a period of...
228,what food is in afghan,Accompanying these staples are dairy products ...,Afghanistan 's culinary specialties reflect it...,Q910,"\nAfghan cuisine includes mutton, beef, poultr...","[Afghan cuisine (Pashto: افغان پخلی, romanized...",Afghan food typically consists of dishes such ...
229,how kimberlite pipes form,Volcanic pipes are relatively rare.,Volcanic pipes are geological structures forme...,Q911,\nKimberlite pipes form as the result of viole...,[Volcanic pipes or volcanic conduits are subte...,Kimberlite pipes form when molten magma from t...
230,what county is coatesville indiana located in,"Coatesville is a town in Clay Township , Hendr...",The population was 523 at the 2010 Census .,Q967,"\nCoatesville, Indiana is located in Hendricks...","[Coatesville is a town in Clay Township, Hendr...",Coatesville Indiana is located in Putnam Count...


In [2]:
# 切割文件
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 切割器
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    keep_separator=False,
    chunk_size=200,
    chunk_overlap=30
)


# 自定義切割函數
def split_texts(texts):
    chunked_texts = []
    for text in texts:
        chunks = text_splitter.create_documents([text])
        chunked_texts.extend([chunk.page_content for chunk in chunks])
    return chunked_texts


# 顯示切割後的文件
df["chunks"] = df["context"].apply(lambda x: split_texts(x))
all_chunks = df["chunks"].tolist()
docs = [item for chunk in all_chunks for item in chunk]
# docs

插入向量

In [6]:
from pymongo import MongoClient
from tqdm.auto import tqdm
# SSL
import certifi

# 資料庫物件
client = MongoClient(
    ATLAS_CONNECTION_STRING,
    tlsCAFile=certifi.where()
)
DB_NAME = "MyDatabase2024"
db = client[DB_NAME]
batch_size = 128

EVAL_EMBEDDING_MODELS = [
    "text-embedding-ada-002",
    "text-embedding-3-small"
]

def get_embeddings(docs, model):
    docs = [doc.replace("\n", " ") for doc in docs]
    response = openai_client.embeddings.create(input=docs, model=model)
    return [r.embedding for r in response.data]

for model in EVAL_EMBEDDING_MODELS:
    embedded_docs = []
    print(f"Getting embeddings for the {model} model")
    for i in tqdm(range(0, len(docs), batch_size)):
        end = min(len(docs), i + batch_size)
        batch = docs[i:end]
        batch_embeddings = get_embeddings(batch, model)
        batch_embedded_docs = [
            {"text": batch[i], "embedding": batch_embeddings[i]}
            for i in range(len(batch))
        ]
        embedded_docs.extend(batch_embedded_docs)
    print(f"Finished getting embeddings for the {model} model")

    collection = db["MyCollection2024"]
    collection.delete_many({})
    collection.insert_many(embedded_docs)
    print(f"Finished inserting embeddings for the {model} model")

Getting embeddings for the text-embedding-ada-002 model


  0%|          | 0/30 [00:00<?, ?it/s]

Finished getting embeddings for the text-embedding-ada-002 model
Finished inserting embeddings for the text-embedding-ada-002 model
Getting embeddings for the text-embedding-3-small model


  0%|          | 0/30 [00:00<?, ?it/s]

Finished getting embeddings for the text-embedding-3-small model
Finished inserting embeddings for the text-embedding-3-small model


進行評估

In [7]:
from langchain_openai import OpenAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_core.vectorstores import VectorStoreRetriever
from datasets import Dataset
from ragas import evaluate, RunConfig
from ragas.metrics import context_precision, context_recall
import nest_asyncio
from pymongo import MongoClient
import certifi

# 配置 MongoDB 連接
# client = MongoClient(
#     ATLAS_CONNECTION_STRING,
#     tlsCAFile=certifi.where()
# )
# 測試基本連接
try:
    db = client.admin.command('ping')
    print("MongoDB 連接成功:", db)
except Exception as e:
    print("MongoDB 連接失敗:", e)

# 指定資料庫名稱
# DB_NAME = "MyDatabase2024"
COLLECTION_NAME = "MyCollection2024"
# db = client[DB_NAME]

# 定義嵌入模型
# EVAL_EMBEDDING_MODELS = [
#     # OpenAI 的 text-embedding-ada-002 模型
#     "text-embedding-ada-002",
#     # OpenAI 的 text-embedding-3-small 模型
#     "text-embedding-3-small"
# ]


# 定義獲取檢索器的函數
def get_retriever(model, k):
    # 創建嵌入模型對象
    embeddings = OpenAIEmbeddings(model=model)
    # 創建 MongoDB 向量檢索對象
    vector_store = MongoDBAtlasVectorSearch.from_connection_string(
        connection_string=ATLAS_CONNECTION_STRING,
        # 指定命名空間（資料庫和集合）
        # namespace=f"{DB_NAME}.{model}",
        namespace=f"{DB_NAME}.{COLLECTION_NAME}",
        # 指定嵌入模型
        embedding=embeddings,
        # 向量索引名稱
        index_name="vector_index",
        # 文本鍵名
        text_key="text",
    )
    # 返回檢索器對象
    retriever = vector_store.as_retriever(
        # 指定檢索類型為相似度檢索
        search_type="similarity",
        # 指定檢索參數
        search_kwargs={"k": k}
    )
    return retriever


# 轉換數據框中的列為列表
# 指定檢索參數
QUESTIONS = df["question"].to_list()
# 除錯
print(f'QUESTIONS 的內容 {QUESTIONS}')
# 提取正確答案列表
GROUND_TRUTH = df["correct_answer"].tolist()
print(f'GROUND_TRUTH 的內容 {GROUND_TRUTH}')
# 允許嵌套使用 asyncio
nest_asyncio.apply()
# 遍歷所有嵌入模型進行評估
for model in EVAL_EMBEDDING_MODELS:
    #
    print(f"進行 model: {model}")
    # 構建數據字典
    data = {
        # 問題
        "question": [],
        # 正確答案
        "ground_truth": [],
        # 上下文
        "contexts": []
    }
    data["question"] = QUESTIONS
    data["ground_truth"] = GROUND_TRUTH
    # 獲取檢索器
    retriever = get_retriever(model, 2)
    # 除錯
    print("Retriever 建立完成")
    # 遍歷所有問題進行檢索
    for i in tqdm(range(0, len(QUESTIONS))):
        # 除錯
        print(f"進行 question {i+1}/{len(QUESTIONS)}")
        # 改寫用以除錯
        try:
            docs = retriever.get_relevant_documents(QUESTIONS[i])
            print(f"Retrieved {len(docs)} documents for question {i+1}")
            data["contexts"].append([doc.page_content for doc in docs])
        except Exception as e:
            print(f"Error retrieving documents for question {i+1}: {e}")
            continue
        # 先註解        
        # data["contexts"].append(
        #     [doc.page_content for doc in retriever.get_relevant_documents(QUESTIONS[i])]
        # )
    # 除錯用
    print("All questions processed")
    # 將數據字典轉換為 Dataset 對象
    dataset = Dataset.from_dict(data)
    # 配置運行參數
    run_config = RunConfig(max_workers=4, max_wait=180)
    # 使用 ragas 進行評估
    result = evaluate(
        dataset=dataset,
        metrics=[context_precision, context_recall],
        run_config=run_config,
        raise_exceptions=False,
    )
    # 輸出評估結果
    print(f"{model} 模型的結果：{result}")

MongoDB 連接成功: {'ok': 1}
QUESTIONS 的內容 ['HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US', 'what are points on a mortgage', 'how does interlibrary loan work', 'WHAT IS A FY QUARTER', 'who wrote a rose is a rose is a rose', 'what is a notary for', 'what bacteria grow on macconkey agar', 'who played the lead roles in the movie leaving las vegas', 'what is a CMM machine', 'what percentage of water in in the body', 'what is a day care for?', 'where does a flea live', 'what is a group of deer called', 'what state was john mccain a senator in during the 2008 election', 'how did harmon killebrew get strong', 'where do crocodiles live', 'what relates to erosion', 'where in oregon is albany', 'what year did isaac newton die', 'How much did Waterboy grossed', 'where fourth of july came from', 'what are layers of the ionosphere', 'what is a newsgroup message', 'what are some legal uses of meth', 'Where does the word baptism come from', 'what school did Zach Thomas play for before making it in to t

  0%|          | 0/232 [00:00<?, ?it/s]

進行 question 1/232
Error retrieving documents for question 1: ac-8uuuote-shard-00-02.yhwvqqt.mongodb.net:27017: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),ac-8uuuote-shard-00-01.yhwvqqt.mongodb.net:27017: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),ac-8uuuote-shard-00-00.yhwvqqt.mongodb.net:27017: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 665526e0e845ecfd21b9d5e2, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('ac-8uuuote-shard-00-00.yhwvqqt.mongodb.net', 27017) server

## 確認連線正常

In [None]:
import certifi
from pymongo import MongoClient

client = MongoClient(
    ATLAS_CONNECTION_STRING,
    tlsCAFile=certifi.where(),
    serverSelectionTimeoutMS=5000,  # 可選配置
    socketTimeoutMS=5000,           # 可選配置
    connectTimeoutMS=5000           # 可選配置
)

# 測試連接
try:
    print(client.list_database_names())
except Exception as e:
    print(f"Error connecting to MongoDB Atlas: {e}")


## 檢查向量維度

In [None]:
from langchain_openai import OpenAIEmbeddings

# model_name = "text-embedding-ada-002"
model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=model_name)

sample_text = "Hello, world!"
embedding_vector = embeddings.embed_query(sample_text)
print(f"向量維度: {len(embedding_vector)}")


## 檢查數據集位置

In [None]:
from datasets import load_dataset

# 加載數據集
# data = load_dataset("explodinggradients/ragas-wikiqa", split="train")

# 獲取數據集的本地路徑
dataset_path = data.cache_files[0]['filename']
print(dataset_path)


## 基本連線測試

In [None]:
from pymongo import MongoClient
import certifi
# 改用 toml
import toml

# 載入 secrets.toml 文件
secrets = toml.load("secrets.toml")

# 使用您的 MongoDB Atlas 超連結
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]

try:
    client = MongoClient(ATLAS_CONNECTION_STRING, tlsCAFile=certifi.where())
    db = client.test
    print("成功連接到 MongoDB Atlas")
except Exception as e:
    print(f"連接失敗：{e}")


## 異步連線測試

In [None]:
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
import certifi
# 改用 toml
import toml

# 載入 secrets.toml 文件
secrets = toml.load("secrets.toml")

# 使用您的 MongoDB Atlas 超連結
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]

async def test_connection():
    try:
        client = AsyncIOMotorClient(ATLAS_CONNECTION_STRING, tlsCAFile=certifi.where())
        db = client.test
        print("成功連接到 MongoDB Atlas")
    except Exception as e:
        print(f"連接失敗：{e}")

loop = asyncio.get_event_loop()
loop.run_until_complete(test_connection())


## 禁用 IP V6 連線測試

In [None]:
from pymongo import MongoClient
import certifi
# 改用 toml
import toml

# 載入 secrets.toml 文件
secrets = toml.load("secrets.toml")

# 使用您的 MongoDB Atlas 超連結
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]

try:
    client = MongoClient(
        ATLAS_CONNECTION_STRING,
        tlsCAFile=certifi.where(),
        serverSelectionTimeoutMS=5000,
        socketTimeoutMS=5000,
        connectTimeoutMS=5000
    )
    db = client.test
    print("成功連接到 MongoDB Atlas")
except Exception as e:
    print(f"連接失敗：{e}")


## 當前腳本中連線測試

In [None]:
from pymongo import MongoClient
import certifi
import toml
# 載入 secrets.toml 文件
secrets = toml.load("secrets.toml")

# 使用您的 MongoDB Atlas 超連結
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]
try:
    client = MongoClient(
        ATLAS_CONNECTION_STRING,
        tlsCAFile=certifi.where(),
    )
    # 指定資料庫名稱
    db = client[DB_NAME]
    print("成功連接到 MongoDB Atlas")
except Exception as e:
    print(f"連接失敗：{e}")
