In [1]:
from sentence_transformers import SentenceTransformer, util
import os
import csv
import pickle
import time
import torch
import faiss
import numpy as np
import pandas as pd

In [2]:
# Compute device: Apple-silicon MPS when PyTorch reports it as available, otherwise the CPU.
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

In [2]:
# Both locations are machine-specific, so each can be overridden through an
# environment variable; the defaults point at the author's local directories.
pretrain_model_path = os.environ.get(
    "PRETRAIN_MODEL_PATH",
    "/Users/zyl/Desktop/LLM/pretrain_models/quora-distilbert-multilingual",
)
# Alternative encoder: "/Users/zyl/Desktop/LLM/pretrain_models/all-MiniLM-L6-v2"

# Directory holding the alert CSV exports and the embedding cache.
path = os.environ.get(
    "ALERT_DATA_DIR",
    "/Users/zyl/Desktop/告警标签/第三次训练-辽宁农信/辽宁农信告警清单/tb_event/",
)

# 加载 sentence transformer 预训练模型

In [4]:
# Load the pretrained SentenceTransformer from the local directory given by pretrain_model_path.
model = SentenceTransformer(pretrain_model_path)
# Moving the model onto `device` is left disabled: model = model.to(device)

In [7]:
# Print the repr of every module that model.modules() yields.
# (model.named_children() would give (name, module) pairs instead.)
for layer in model.modules():
    print(layer)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=76

# 输入数据

In [None]:
# Read the merged alert export and keep one row per distinct alert text ('告警内容').
# NOTE(review): all_df.csv is presumably the concatenation of the per-file CSVs under
# `path`; the loop that would build it is not part of this notebook any more — confirm.
merged_csv = os.path.join(path, 'all_df.csv')
df = pd.read_csv(merged_csv, encoding='utf8')
all_df = df.drop_duplicates(subset=['告警内容'])
# Shapes before and after de-duplication.
print(df.shape, all_df.shape)

In [12]:
# Write the de-duplicated frame once so the cell that reads 'all_df_duplicates.csv'
# also works on a fresh run; an existing file is never overwritten.
dedup_csv_path = os.path.join(path, 'all_df_duplicates.csv')
if not os.path.exists(dedup_csv_path):
    all_df.to_csv(dedup_csv_path, index=False)

In [5]:
dataset_path = os.path.join(path, 'all_df_duplicates.csv')   # path of the de-duplicated alert CSV
all_df = pd.read_csv(dataset_path)

# Unique alert texts in first-seen (file) order. A plain set() would give a different
# order in every interpreter session because str hashing is randomised, and would let
# NaN / non-string cells through to the encoder.
corpus_sentences = (
    all_df['告警内容']
    .dropna()
    .astype(str)
    .drop_duplicates()
    .tolist()
)
print(len(corpus_sentences))

175371


# encode编码

In [6]:
# Encoding / cache settings.
# Output dimension of the pretrained encoder (the Pooling layer printed above reports 768).
embedding_size = 768
# How many dataset rows to use for the candidate corpus.
max_corpus_size = 175371
# Local file in which the encoded candidate corpus is stored.
embedding_cache_path = os.path.join(path, 'ln-embeddings.pkl')
# embedding_cache_path = os.path.join(path, 'ln-MiniLM-embeddings.pkl')

In [7]:
# Encode the whole corpus; the result is requested as a NumPy array and a progress bar is shown.
corpus_embeddings = model.encode(
    corpus_sentences,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/5481 [00:00<?, ?it/s]

# 保存Embedding向量

In [8]:
# Store sentences and vectors in one pickle so their positions stay aligned on reload.
cache_payload = {'sentences': corpus_sentences, 'embeddings': corpus_embeddings}
with open(embedding_cache_path, "wb") as fout:
    pickle.dump(cache_payload, fout)

# 加载Embedding向量

In [38]:
# Restore the cached corpus. pickle.load can execute arbitrary code, so only open
# cache files that this notebook produced itself.
with open(embedding_cache_path, 'rb') as file:
    loaded_data = pickle.load(file)
corpus_sentences, corpus_embeddings = loaded_data['sentences'], loaded_data['embeddings']
print(len(corpus_sentences))

176805


In [39]:
# Retrieval settings.
n_clusters = 200     # number of clusters the IVF index partitions the corpus into
_nprobe = 3          # how many of the closest clusters are searched per query; larger = more complete but slower
top_N = 300          # number of nearest results requested per query

# 创建 Faiss 

In [40]:
# Coarse quantizer: flat inner-product index over embedding_size-dimensional vectors.
quantizer = faiss.IndexFlatIP(embedding_size)
# IVF index built on that quantizer with n_clusters clusters, scored by inner product.
index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters, faiss.METRIC_INNER_PRODUCT)
# Number of clusters searched per query.
index.nprobe = _nprobe

# 训练

In [41]:
# Recall uses the inner product, so every corpus vector is L2-normalised first; the
# inner product of unit vectors is then their cosine similarity.
# The norm is floored so an all-zero row stays zero instead of turning into NaN (0/0).
corpus_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings = corpus_embeddings / np.maximum(corpus_norms, 1e-12)
print(corpus_embeddings.shape)

index.train(corpus_embeddings)
# Re-running this cell must not append the corpus a second time: the extra ids would
# point past the end of corpus_sentences when search results are looked up.
if index.ntotal > 0:
    index.reset()
index.add(corpus_embeddings)

(176805, 768)


# 检验

In [45]:
# Interactive check: encode one alert text typed by the user, retrieve its nearest
# neighbours from the FAISS index and collect them in `df` (columns: cos_sim, content).
similarity_threshold = 0.8   # only hits scoring strictly above this are kept

input_question = input("请输入告警content: ")
start_time = time.time()

# Normalise the query like the corpus so the inner product is a cosine similarity,
# then add a leading axis because index.search takes a 2-D batch of queries.
question_embedding = model.encode(input_question)
question_embedding = question_embedding / np.linalg.norm(question_embedding)
question_embedding = np.expand_dims(question_embedding, axis=0)

# Search with FAISS.
distances, corpus_ids = index.search(question_embedding, top_N)

# FAISS pads the result with id -1 when the probed clusters hold fewer than top_N
# vectors; those slots are dropped explicitly so they can never index
# corpus_sentences[-1], whatever the threshold is set to.
hits = [
    {'corpus_id': int(corpus_id), 'score': score}
    for corpus_id, score in zip(corpus_ids[0], distances[0])
    if corpus_id >= 0 and score > similarity_threshold
]
hits = sorted(hits, key=lambda x: x['score'], reverse=True)
end_time = time.time()

print("输入告警内容:", input_question)
print("查询Faiss向量数据库 (耗时 {:.3f} 秒):".format(end_time-start_time))

# Scores are kept as text and sentences are stripped; rows that become identical
# after stripping are collapsed by drop_duplicates.
content = [str(corpus_sentences[hit['corpus_id']]).strip() for hit in hits]
sim = [str(hit['score']).strip() for hit in hits]
df = pd.DataFrame({'cos_sim': sim, 'content': content})
df = df.drop_duplicates().reset_index(drop=True)

请输入告警content:  CPU 使用率超过95%
输入告警内容: CPU 使用率超过95%
查询Faiss向量数据库 (耗时 0.038 秒):


In [46]:
# Show the first 50 rows of the search-result frame.
display(df.head(50))

Unnamed: 0,cos_sim,content
0,1.0,CPU 使用率超过95%
1,1.0,CPU使用率超过95%
2,0.962934,CPU 使用率超过90%
3,0.96293396,CPU 使用率超过90%
4,0.9601265,CPU使用率过高，超过75%
5,0.9467776,CPU 使用率超过70%
6,0.9467775,CPU 使用率超过70%
7,0.9391147,CPU使用率过高
8,0.9345511,华为交换机CPU平均使用率超过 70%
9,0.9335337,华为交换机CPU使用率超过70%


In [None]:
# 多GPU
# model = SentenceTransformer('all-MiniLM-L6-v2')

# #Start the multi-process pool on all available CUDA devices
# pool = model.start_multi_process_pool()

# for i, batch in enumerate(tqdm(dataloader)):
#     #Compute the embeddings using the multi-process pool
#     sentences = batch['best_answer']
#     batch_emb = model.encode_multi_process(sentences, pool, chunk_size=chunk_size, batch_size=encode_batch_size)
#     print("Embeddings computed for 1 batch. Shape:", batch_emb.shape)

# #Optional: Stop the processes in the pool
# model.stop_multi_process_pool(pool)

In [3]:
# Scratch check of slicing: the first three items of a four-item list.
l = list(range(1, 5))
l[:3]

[1, 2, 3]