In [1]:
import torch
import tqdm

# Pick the compute backend in priority order: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
# 1. Load a pretrained Sentence Transformer model
# Checkpoints noted for comparison:
#   baai/bge-large-zh
#   baai/bge-small-zh
#   intfloat/multilingual-e5-large
#
# Prompts are model-specific. The E5 model card states that every input must
# start with "query: " or "passage: " (also for non-English text) and that
# "query: " is the prefix to use for symmetric tasks such as semantic
# similarity; omitting the prefix degrades embedding quality. The free-form
# instruction prompts are kept so that existing `prompt_name=...` calls still
# resolve, but they are not the prefixes this checkpoint was trained with.
# NOTE(review): revisit the prompts if the checkpoint is swapped for a BGE model.
model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    device=device,
    prompts={
        # Prefixes documented for the E5 family.
        "query": "query: ",
        "passage": "passage: ",
        # Free-form instruction prompts retained for backward compatibility.
        "classification": "Classify the following text: ",
        "retrieval": "Retrieve semantically similar text: ",
        "clustering": "Identify the topic or theme based on the text: ",
    },
    # Passed to the constructor (rather than assigned afterwards) so the name
    # is validated against `prompts` when the model is loaded.
    default_prompt_name="query",
)

In [4]:
# Input texts to embed: "the weather is nice", "plenty of sunshine",
# "today is a sunny day", and "he goes to the gym".
sentences = ["天气很好", "阳光充沛", "今天是晴天", "他去体育馆"]

In [None]:
# 2. Calculate embeddings by calling model.encode() on the whole list of
#    sentences; no prompt_name is passed, so the model's default prompt applies.
embeddings = model.encode(sentences)
# Print the shape of the result
# (presumably (number of sentences, embedding dimension) — confirm on first run).
print(embeddings.shape)

In [None]:
# 3. Calculate the embedding similarities. The same embeddings are passed as
#    both arguments, so every sentence is compared with every sentence
#    (including itself).
similarities = model.similarity(embeddings, embeddings)
print(similarities)

In [None]:
# Inspect the model's max_seq_length attribute; as the last expression in the
# cell, its value is rendered as the cell output.
model.max_seq_length