In [1]:
from pymilvus import MilvusClient, DataType
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from pymilvus.model.dense import JinaEmbeddingFunction

# # 初始化 BGE-M3 嵌入函数
embedding_model = BGEM3EmbeddingFunction(
    model_name='E:\CExperiment\ThreatRAG\models\embedding_model\\bge-m3',
    device='cpu',
    use_fp16=False
)

# embedding_model = JinaEmbeddingFunction(
#     model_name="jina-embeddings-v3", # Defaults to `jina-embeddings-v3`
#     api_key='jina_d1a2dcf77d894b6e82d185ed49316c1bOMnqubY9dJy0j21TXTgl216IR1VJ', # Provide your Jina AI API key
#     task="retrieval.passage", # Specify the task
# )
# 3.1. Create schema
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)

# 3.2. Add fields to schema
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=1024)


# 初始化 Milvus 客户端
client = MilvusClient(
    uri="http://localhost:19530"  # 修改为配置文件中指定的端口
)

# ... existing code ...

# 创建集合（如果已存在则先删除）
collection_name = "text_collection"
if client.has_collection(collection_name=collection_name):
    client.drop_collection(collection_name=collection_name)

# Prepare index parameters
index_params = client.prepare_index_params()

# Add indexes
index_params.add_index(
    field_name="vector",
    index_name="vector_index",
    index_type="IVF_FLAT",
    metric_type="IP",
    params={
        "nlist": 1024  # 添加 nlist 参数，值可以根据数据规模调整
    }
)

# 创建新的集合，维度为1024
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params
)

# 处理文本文件并插入到 Milvus
def process_and_insert_text(file_path,file_index):
    print("开始处理文本文件...")
    with open(file_path, 'r', encoding='utf-8') as file:
        texts = [line.strip() for line in file if line.strip()]
    
    # 生成向量
    vectors = embedding_model.encode_documents(texts)
    print(vectors)
    # 准备数据
    data = [
        {
            "id": file_index+i,
            "vector": vectors[i].tolist(),  # 只使用密集向量
            "text": texts[i],
            "file_name": file_path
        }
        for i in range(len(texts))
    ]
    
    # 插入数据到 Milvus
    res = client.insert(collection_name=collection_name, data=data)
    print(f"插入了 {res['insert_count']} 条数据")
    return res



# # 1. 插入数据
import os
for i, file in enumerate(os.listdir("./cti_text")):
    file_path = os.path.join("./cti_text", file)
    insert_result = process_and_insert_text(file_path,file_index=i)
# file_path = os.path.join("./cti_text", 'cti_1.txt')
# insert_result = process_and_insert_text(file_path)


  model_name='E:\CExperiment\ThreatRAG\models\embedding_model\\bge-m3',
  from .autonotebook import tqdm as notebook_tqdm
2025-03-12 19:23:55,198 [ERROR][_create_connection]: Failed to create new connection using: 64d4153fe1474746a56539ab5c2cf234 (milvus_client.py:920)
  model_name='E:\CExperiment\ThreatRAG\models\embedding_model\\bge-m3',


MilvusException: <MilvusException: (code=2, message=Fail connecting to server on localhost:19530, illegal connection params or server unavailable)>

In [11]:
# 向量搜索函数
def search_similar_texts(query_text, limit=5):
    print("开始搜索相似文本...")
    # 将查询文本转换为向量
    query_vector = embedding_model.encode_documents([query_text])
    print(f"query_vector: {query_vector}")
    # 执行向量搜索
    results = client.search(
        collection_name=collection_name,
        data=query_vector,  # 只使用密集向量
        limit=limit,
        output_fields=["text","file_name"],
        anns_field="vector",
        search_params={
            "metric_type": "IP"
        },
    )
    return results

# 2. 搜索示例
query = "流量中出现ip:12.25.13.2,这应该是一个IOC???/、我不知道是不是这个ip"
search_results = search_similar_texts(query, limit=5)
print(f"search_results: {search_results}")
print(search_results[0])
# 打印搜索结果
print("\n搜索结果：")
for item in search_results[0]:
    print(f"相似度得分: {item['distance']:.4f}")
    print(f"文本内容: {item['entity']['text']}")
    print(f"文件名: {item['entity']['file_name']}")
    print("---")


开始搜索相似文本...
query_vector: [array([ 0.00603214, -0.03136554, -0.02342976, ...,  0.00101231,
        0.03615713,  0.00715181])]
search_results: data: ['[{\'id\': 9, \'distance\': 0.6809192895889282, \'entity\': {\'text\': "ioc.ip: \'12.25.13.2\'", \'file_name\': \'./cti_text\\\\cti_2.txt\'}}, {\'id\': 8, \'distance\': 0.6761175394058228, \'entity\': {\'text\': "ioc.ip: \'12.25.13.1\'", \'file_name\': \'./cti_text\\\\cti_1.txt\'}}, {\'id\': 20, \'distance\': 0.6085014343261719, \'entity\': {\'text\': "traffic.ip: \'12.25.13.2\'", \'file_name\': \'./cti_text\\\\cti_2.txt\'}}, {\'id\': 19, \'distance\': 0.605629563331604, \'entity\': {\'text\': "traffic.ip: \'12.25.13.1\'", \'file_name\': \'./cti_text\\\\cti_1.txt\'}}, {\'id\': 15, \'distance\': 0.5724168419837952, \'entity\': {\'text\': "ioc.ipfs_address: \'QmS4ghgMgfFvqPjB4WKXHaN15ZDiS4J4Q4K9JgQ2J89FwN\'", \'file_name\': \'./cti_text\\\\cti_2.txt\'}}]']
[{'id': 9, 'distance': 0.6809192895889282, 'entity': {'text': "ioc.ip: '12.25.13.2'", 