# In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Load the tokenizer and the encoder from the checkpoint at /root/m3e-base.
# Both objects are module-level globals; process_file() reads them directly
# rather than taking them as arguments.
tokenizer = AutoTokenizer.from_pretrained("/root/m3e-base")
model = AutoModel.from_pretrained("/root/m3e-base")

def get_embedding(phrase, tokenizer, model):
    """Return a mean-pooled embedding of ``phrase`` as a NumPy array.

    The phrase is tokenized (with padding and truncation enabled), run
    through ``model`` without gradient tracking, and the token vectors of
    ``last_hidden_state`` are averaged over the sequence dimension.

    The average is weighted by the tokenizer's ``attention_mask`` so that
    padding positions do not contribute. For a single phrase the mask is
    all ones and the result equals a plain mean; the masking only matters
    when a list of phrases of different lengths is passed.

    Args:
        phrase: Text (or list of texts) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping of PyTorch tensors that can
            be unpacked into ``model``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        The pooled embedding with size-1 dimensions squeezed out.
    """
    inputs = tokenizer(phrase, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to averaging every position.
        pooled = hidden.mean(dim=1)
    else:
        # Broadcast the mask over the hidden dimension and average only the
        # real (non-padding) tokens.
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1.0)  # avoid 0-division
        pooled = (hidden * weights).sum(dim=1) / token_counts

    # .cpu() is a no-op for CPU tensors and required before .numpy() otherwise.
    return pooled.squeeze().cpu().numpy()

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors.

    Computed as ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either
    vector has zero norm the quotient is undefined; 0.0 is returned in that
    case instead of ``nan`` so that the result can still be sorted and
    compared against a threshold.

    Note: this definition replaces the ``cosine_similarity`` name imported
    from ``sklearn.metrics.pairwise`` at the top of the file.

    Args:
        vec1: First vector (array-like).
        vec2: Second vector (array-like), same length as ``vec1``.

    Returns:
        The similarity score, or 0.0 if either vector has zero norm.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

def read_phrases(file_path):
    """Read one phrase per line from a UTF-8 text file.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping are skipped so that blank phrases are never embedded or
    ranked. The file is decoded with ``utf-8-sig`` so a leading byte-order
    mark, if present, does not end up attached to the first phrase; files
    without a BOM are read exactly as with plain ``utf-8``.

    Args:
        file_path: Path of the phrase file.

    Returns:
        List of non-empty, stripped phrases in file order.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        # Iterate the file lazily instead of materializing readlines().
        stripped_lines = (line.strip() for line in f)
        return [phrase for phrase in stripped_lines if phrase]

def compute_similarities(phrases, keyword, tokenizer, model):
    """Score every phrase against ``keyword`` and rank the results.

    The keyword is embedded once; each phrase is embedded in turn and
    compared to it with ``cosine_similarity``.

    Args:
        phrases: Iterable of phrase strings.
        keyword: Reference text the phrases are compared to.
        tokenizer: Tokenizer forwarded to ``get_embedding``.
        model: Model forwarded to ``get_embedding``.

    Returns:
        List of ``(phrase, similarity)`` tuples, highest similarity first.
    """
    reference = get_embedding(keyword, tokenizer, model)

    scored = [
        (text, cosine_similarity(reference, get_embedding(text, tokenizer, model)))
        for text in phrases
    ]

    # Stable descending sort on the score component.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

def process_file(file_path, keyword, threshold=0.75):
    """Print the phrases in ``file_path`` that are similar to ``keyword``.

    Phrases are loaded with ``read_phrases``, ranked with
    ``compute_similarities`` using the module-level ``tokenizer`` and
    ``model``, and every phrase whose similarity is at least ``threshold``
    is printed, in ranked order, with the score shown to four decimals.

    Args:
        file_path: Path of the phrase file.
        keyword: Reference text to compare against.
        threshold: Minimum similarity for a phrase to be printed.
    """
    ranked = compute_similarities(
        read_phrases(file_path), keyword, tokenizer, model
    )

    matches = [(phrase, similarity) for phrase, similarity in ranked
               if similarity >= threshold]
    for phrase, similarity in matches:
        print(f"Phrase: {phrase}, Similarity: {similarity:.4f}")


# In [None]:
file_path = "words.txt"  # key-phrase bank, read one phrase per line
keyword = "美国"  # use "China" for the TikTok data or "美国" for the Douyin data
# Print every phrase whose similarity to the keyword is at least the threshold.
process_file(file_path, keyword, threshold=0.75)