In [1]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load the model (recommended: initialize it once and reuse it afterwards).
# get_sbert_augmented_sentences reads this module-level object for every encode call.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_sbert_augmented_sentences(original_sentence: str, corpus_sentences: list, top_k: int = 5, score_threshold: float = 0.7) -> list:
    """
    Use SBERT to find the corpus sentences most semantically similar to a
    sentence, for use as data-augmentation candidates.

    Args:
        original_sentence: str, the original short sentence (the query).
        corpus_sentences: list[str], corpus to search for similar sentences.
        top_k: int, maximum number of similar sentences to return.
        score_threshold: float, minimum cosine similarity a candidate must reach.

    Returns:
        list[str]: at most ``top_k`` corpus sentences, ordered from most to
        least similar, each different from ``original_sentence`` and scoring
        at least ``score_threshold``. Empty when the corpus is empty or
        ``top_k`` is not positive.
    """
    # Nothing to search or nothing requested: return before touching the model.
    if top_k <= 0 or len(corpus_sentences) == 0:
        return []

    # Encode the corpus and the query sentence.
    corpus_embeddings = sbert_model.encode(corpus_sentences, convert_to_tensor=True)
    query_embedding = sbert_model.encode(original_sentence, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus sentence (row 0 of a 1 x N matrix).
    cosine_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]

    # The query itself may appear in the corpus, possibly several times. Fetch
    # that many extra hits so that skipping them still leaves up to top_k
    # genuine candidates.
    self_matches = sum(1 for sentence in corpus_sentences if sentence == original_sentence)
    fetch_count = min(top_k + self_matches, len(corpus_sentences))
    top_scores, top_indices = torch.topk(cosine_scores, k=fetch_count)

    # Collect results, dropping copies of the query and anything below the threshold.
    results = []
    for score, idx in zip(top_scores, top_indices):
        candidate = corpus_sentences[int(idx)]
        if candidate != original_sentence and score >= score_threshold:
            results.append(candidate)
            # Enforce the documented cap: the extra hits fetched above must
            # not leak into the output when the query is not in the corpus.
            if len(results) == top_k:
                break

    return results

In [2]:
# Example corpus (this can be all the short sentences in your dataset)
corpus = [
    "login failed",
    "user cannot login",
    "system crash occurred",
    "password reset not working",
    "email authentication failed",
    "cannot access server",
]

original = "login issue in server"
# Look up corpus sentences similar to the query, with a looser threshold than the default.
augmented = get_sbert_augmented_sentences(
    original_sentence=original,
    corpus_sentences=corpus,
    top_k=3,
    score_threshold=0.6,
)

print("原句:", original)
print("擴增候選:")
# Number the candidates starting from 1 for display.
for rank, sentence in enumerate(augmented, start=1):
    print(f"{rank}. {sentence}")

原句: login issue in server
擴增候選:
1. login failed
2. user cannot login
3. cannot access server
