In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# 加载预训练的BGE-M3模型
model = SentenceTransformer('E:\CExperiment\ThreatRAG\models\embedding_model\\bge-m3')

def load_documents(directory):
    """加载目录中的所有txt文档"""
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                documents.append((filename, content))
    return documents

def encode_documents(documents):
    """将文档内容转换为向量"""
    document_texts = [doc[1] for doc in documents]
    embeddings = model.encode(document_texts)
    return embeddings

def search_similar_documents(query_embedding, document_embeddings, top_k=5):
    """在文档向量中搜索最相似的文档"""
    # 创建FAISS索引
    d = query_embedding.shape[0]  # 向量维度
    index = faiss.IndexFlatIP(d)  # 使用内积作为相似度度量
    index.add(document_embeddings.astype(np.float32))

    # 搜索最相似的文档
    distances, indices = index.search(np.array([query_embedding]).astype(np.float32), top_k)
    return indices[0], distances[0]

def main(query_sentence, document_directory):
    # 加载文档
    documents = load_documents(document_directory)

    # 将文档内容转换为向量
    document_embeddings = encode_documents(documents)

    # 将查询句子转换为向量
    query_embedding = model.encode([query_sentence])[0]

    # 在文档向量中搜索最相似的文档
    top_indices, top_distances = search_similar_documents(query_embedding, document_embeddings, top_k=5)

    # 输出结果
    print("Top-5 matching documents:")
    for idx, distance in zip(top_indices, top_distances):
        print(f"Document: {documents[idx][0]}, Similarity Score: {distance}")

# 示例用法
query_sentence = "ioc.ip: '12.25.13.1'"
document_directory = "./cti_text"  # 包含txt文件的目录
main(query_sentence, document_directory)

  model = SentenceTransformer('E:\CExperiment\ThreatRAG\models\embedding_model\\bge-m3')


Top-5 matching documents:
Document: cti_3.txt, Similarity Score: 0.502585232257843
Document: cti_2.txt, Similarity Score: 0.5020726919174194
Document: cti_1.txt, Similarity Score: 0.501251220703125
Document: cti_3.txt, Similarity Score: -3.4028234663852886e+38
Document: cti_3.txt, Similarity Score: -3.4028234663852886e+38
