# 使用 Milvus 构建 RAG 系统

## 1. 依赖关系和环境

In [1]:
! pip install --upgrade pymilvus sentence-transformers huggingface-hub langchain_community langchain-text-splitters pypdf tqdm



## 2. 数据

In [2]:
%%bash
# Download the PDF once; stop the cell with a non-zero status on any failure.
set -euo pipefail

PDF="The-AI-Act.pdf"
URL="https://artificialintelligenceact.eu/wp-content/uploads/2021/08/The-AI-Act.pdf"

if [ ! -f "$PDF" ]; then
    # Fetch into a temporary name and rename only after wget succeeds, so an
    # interrupted download never leaves a truncated file that the -f check
    # above would accept on the next run.
    wget -q -O "$PDF.part" "$URL"
    mv "$PDF.part" "$PDF"
fi

In [3]:
from langchain_community.document_loaders import PyPDFLoader
# Extract the text of the downloaded PDF with LangChain's PyPDFLoader.
loader = PyPDFLoader("The-AI-Act.pdf")
docs = loader.load()
# Report how many documents the loader returned.
print(len(docs))

108


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Configure the splitter with a chunk size of 1000 and an overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Split the loaded documents; later cells read `page_content` from each chunk.
chunks = text_splitter.split_documents(docs)

In [5]:
# Keep only the text content of every chunk, in the original order.
text_lines = [piece.page_content for piece in chunks]

## 3. embedding 模型

In [6]:
from sentence_transformers import SentenceTransformer
# Load the "BAAI/bge-small-en-v1.5" sentence-embedding model on the CPU.
# TODO: GPU acceleration is possible by passing device="cuda".
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cpu")


def emb_text(text):
    """Embed a single piece of text with ``embedding_model``.

    The text is encoded as a one-item batch with ``normalize_embeddings=True``;
    the single resulting row is returned converted to a plain Python list.
    """
    vectors = embedding_model.encode([text], normalize_embeddings=True)
    return vectors[0].tolist()

In [7]:
# Embed a sample sentence to find out the length of the vectors the model produces.
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)  # reused as `dimension` when the collection is created
print(f"embedding_dim: {embedding_dim}")
# Show only the first 10 components to keep the output short.
print(test_embedding[:10])

embedding_dim: 384
[-0.07660678774118423, 0.025316720828413963, 0.012505537830293179, 0.004595226142555475, 0.025779981166124344, 0.0381670817732811, 0.08050811290740967, 0.003035373752936721, 0.02439219132065773, 0.004880373831838369]


## 3. 数据加载到 Milvus 中
> 对于 MilvusClient 的参数：
> - 将 uri 设置为本地文件，例如 ./hf_milvus_demo.db ，是最方便的方法，因为它会自动使用 Milvus Lite 将所有数据存储在此文件中。
> - 如果您有大量数据，例如超过一百万个向量，您可以在 Docker 或 Kubernetes 上设置性能更高的 Milvus 服务器。在此设置中，请使用服务器 uri，例如 http://localhost:19530 作为您的 uri 。
> - 如果您想使用 Milvus 的全托管云服务 Zilliz Cloud，请调整 uri 和 token，它们分别对应 Zilliz Cloud 中的 Public Endpoint 和 API key。

In [8]:
from pymilvus import MilvusClient

# Connect using a local file as the URI (the notes in this section describe
# this as the Milvus Lite setup that stores all data in that file).
milvus_client = MilvusClient(uri="./hf_milvus_demo.db")

# Name of the collection that the following cells drop, create, fill and search.
collection_name = "rag_collection"

In [9]:
# If the collection already exists, drop it so the next cell can create it anew.
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [None]:
# Create a new collection whose vector dimension matches the embedding model.
# NOTE(review): emb_text requests normalize_embeddings=True, so the inner-product
# metric should behave like cosine similarity — confirm if the model changes.
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # inner product
    consistency_level="Strong",  # strong consistency
)

In [11]:
# Embed every chunk and insert the resulting rows into the collection.
from tqdm import tqdm

# One row per chunk: its position as the id, its embedding, and the raw text.
data = [
    {"id": idx, "vector": emb_text(chunk_text), "text": chunk_text}
    for idx, chunk_text in enumerate(tqdm(text_lines, desc="Creating embeddings"))
]

insert_res = milvus_client.insert(collection_name=collection_name, data=data)
# Display the number of inserted rows reported by Milvus.
insert_res["insert_count"]

Creating embeddings: 100%|██████████| 424/424 [00:09<00:00, 42.81it/s]


424

## 4. 构建 RAG

In [12]:
# The user's query that the retrieval step below searches for.
question = "What is the legal basis for the proposal?"

In [None]:
# Retrieval: search the collection for the entries most similar to the question.
search_res = milvus_client.search(
    collection_name=collection_name,
    data=[emb_text(question)],  # encode the question with the same embedding function
    limit=3,  # return the 3 most similar results
    search_params={"metric_type": "IP", "params": {}},  # inner-product metric
    output_fields=["text"],  # fields to return with each hit
)

In [14]:
import json

# Pair each hit's text with its distance, using the result list of the first
# query in `search_res`, then pretty-print the pairs as JSON.
retrieved_lines_with_distances = [
    (hit["entity"]["text"], hit["distance"])
    for hit in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        "EN 6  EN \n2. LEGAL BASIS, SUBSIDIARITY AND PROPORTIONALITY \n2.1. Legal basis \nThe legal basis for the proposal is in the first place Article 114 of the Treaty on the \nFunctioning of the European Union (TFEU), which provides for the adoption of measures to \nensure the establishment and functioning of the internal market.  \nThis proposal constitutes a core part of the EU digital single market strategy. The primary \nobjective of this proposal is to ensure the proper functioning of the internal market by setting \nharmonised rules in particular on the development, placing on the Union market and the use \nof products and services making use of AI technologies or provided as stand -alone AI \nsystems. Some Member States are already considering national rules to ensure that AI is safe \nand is developed and used in compliance with fundamental rights obligations. This will likely \nlead to two main problems: i) a fragmentation of the internal market on essential elemen