In [6]:
from rouge import Rouge 

hypothesis = """
  Anna searches for her favorite book 'The Silent Stars', and Ben admits 
  he borrowed it without asking, leading to a plan to discuss it later
"""
reference = """
  Anna realizes Ben borrowed her favorite book without asking, and they 
  agree to discuss it once he's finished reading
"""

rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)

print(scores[0]["rouge-1"])
print(scores[0]["rouge-2"])
print(scores[0]["rouge-l"])

{'r': 0.631578947368421, 'p': 0.5217391304347826, 'f': 0.571428566473923}
{'r': 0.2777777777777778, 'p': 0.20833333333333334, 'f': 0.23809523319727902}
{'r': 0.47368421052631576, 'p': 0.391304347826087, 'f': 0.42857142361678}


In [7]:
# 使用
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, hypothesis)
print(scores)

{'rouge1': Score(precision=0.52, recall=0.65, fmeasure=0.5777777777777778), 'rouge2': Score(precision=0.20833333333333334, recall=0.2631578947368421, fmeasure=0.23255813953488372), 'rougeL': Score(precision=0.36, recall=0.45, fmeasure=0.39999999999999997)}


In [22]:
# BLEU评估
import sacrebleu

references = ["今天天气真不错"]
hypothesis = "今天天气真不错"

# BLEU
bleu = sacrebleu.corpus_bleu(
    [hypothesis],
    [references],
    tokenize='zh'
)
print(f"BLEU分数: {bleu.score}")

BLEU分数: 100.00000000000004


In [31]:
import nltk
from nltk.translate import meteor_score

# 准备英文测试文本
reference = "The weather is nice today"
hypothesis = "The weather is nice today"

# 简单分词（按空格分）
reference_tokens = reference.split()
hypothesis_tokens = hypothesis.split()

# 计算METEOR分数
score = meteor_score.meteor_score([reference_tokens], hypothesis_tokens)
print(f"METEOR score: {score}")

METEOR score: 0.996


In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import cosine_similarity

class TextSimilarity:
    def __init__(self):
        # 初始化 TF-IDF
        self.tfidf = TfidfVectorizer()
        
        # 初始化预训练模型（可选）
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
    
    def tfidf_similarity(self, text1: str, text2: str) -> float:
        """使用TF-IDF计算文本相似度"""
        # 构建TF-IDF矩阵
        tfidf_matrix = self.tfidf.fit_transform([text1, text2])
        # 计算余弦相似度
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        return float(similarity[0][0])
    
    def embedding_similarity(self, text1: str, text2: str) -> float:
        """使用预训练模型计算文本嵌入相似度"""
        # 编码文本
        inputs1 = self.tokenizer(text1, return_tensors="pt", 
                               truncation=True, max_length=512).to(self.device)
        inputs2 = self.tokenizer(text2, return_tensors="pt", 
                               truncation=True, max_length=512).to(self.device)
        
        # 获取文本嵌入
        with torch.no_grad():
            embedding1 = self.model(**inputs1).last_hidden_state.mean(dim=1)
            embedding2 = self.model(**inputs2).last_hidden_state.mean(dim=1)
        
        # 计算余弦相似度
        similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2)
        return float(similarity[0])

# 测试代码
if __name__ == "__main__":
    # 初始化相似度计算器
    similarity_calculator = TextSimilarity()
    
    # 测试用例
    test_cases = [
        # 完全相同的句子
        ("今天天气真好", "今天天气真好"),
        # 相似的句子
        ("今天天气真好", "今天天气不错"),
        # 部分相关的句子
        ("我喜欢吃苹果", "我喜欢吃香蕉"),
        # 不相关的句子
        ("今天天气真好", "我喜欢吃苹果"),
        # 英文测试
        ("The weather is nice today", "The weather is good today"),
        # 长文本测试
        ("这是一个很长的句子，包含了很多内容，说明了很多事情", 
         "这也是一个很长的句子，同样包含很多内容，也说明了很多事情"),
    ]
    
    # 运行测试
    print("测试结果：")
    print("-" * 50)
    for text1, text2 in test_cases:
        print(f"\n文本1: {text1}")
        print(f"文本2: {text2}")
        
        # TF-IDF 相似度
        tfidf_sim = similarity_calculator.tfidf_similarity(text1, text2)
        print(f"TF-IDF 相似度: {tfidf_sim:.4f}")
        
        # 预训练模型相似度
        emb_sim = similarity_calculator.embedding_similarity(text1, text2)
        print(f"预训练模型相似度: {emb_sim:.4f}")
        print("-" * 50)


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def compute_perplexity(model, tokenizer, text):
    # 将文本转换为tensor
    encodings = tokenizer(text, return_tensors="pt").to(model.device)
    
    # 计算
    with torch.no_grad():
        outputs = model(**encodings)
        loss = outputs.loss
        
    # 返回perplexity
    return torch.exp(loss).item()

# 使用示例
model_path = "Qwen/Qwen2.5-0.5B-Instruct"  # 你的模型路径

# 加载模型和tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True
)

# 测试文本
test_texts = [
    "今天天气真不错",
    "人工智能正在快速发展",
    "这是一个测试句子"
]

# 计算每个文本的perplexity
print("各句子的困惑度:")
for text in test_texts:
    ppl = compute_perplexity(model, tokenizer, text)
    print(f"文本: {text}")
    print(f"Perplexity: {ppl:.2f}")
    print("-" * 50)

# 计算平均perplexity
avg_ppl = sum(compute_perplexity(model, tokenizer, text) for text in test_texts) / len(test_texts)
print(f"\n平均困惑度: {avg_ppl:.2f}")

各句子的困惑度:


TypeError: exp(): argument 'input' (position 1) must be Tensor, not NoneType

In [None]:
import pandas as pd
from rouge import Rouge
import torch
from tqdm import tqdm

def generate_answer(model, tokenizer, instruction):
    """生成回答"""
    messages = [
        {"role": "user", "content": instruction}
    ]
    
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids, 
            max_new_tokens=128,
            pad_token_id=tokenizer.eos_token_id
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

def evaluate_model(model, tokenizer, test_data, num_samples=None):
    """评估模型并保存详细结果"""
    rouge = Rouge()
    
    # 创建结果列表
    results = []
    
    # 如果需要抽样
    if num_samples and num_samples < len(test_data):
        test_data = test_data.sample(n=num_samples, random_state=42)
    
    # 对每个样本进行评估
    for idx, row in tqdm(test_data.iterrows(), total=len(test_data)):
        instruction = row['instruction']
        reference = row['output']
        
        # 生成回答
        generated = generate_answer(model, tokenizer, instruction)
        
        try:
            # 计算ROUGE分数
            scores = rouge.get_scores(generated, reference)[0]
            
            # 保存该样本的所有信息
            result = {
                'instruction': instruction,
                'reference': reference,
                'generated': generated,
                'rouge-1-p': scores['rouge-1']['p'],
                'rouge-1-r': scores['rouge-1']['r'],
                'rouge-1-f': scores['rouge-1']['f'],
                'rouge-2-p': scores['rouge-2']['p'],
                'rouge-2-r': scores['rouge-2']['r'],
                'rouge-2-f': scores['rouge-2']['f'],
                'rouge-l-p': scores['rouge-l']['p'],
                'rouge-l-r': scores['rouge-l']['r'],
                'rouge-l-f': scores['rouge-l']['f']
            }
            results.append(result)
            
        except Exception as e:
            print(f"评估出错 (行 {idx}): {e}")
            print(f"生成文本: {generated}")
            print(f"参考文本: {reference}")
            continue
    
    # 转换为DataFrame
    results_df = pd.DataFrame(results)
    
    # 计算平均分数
    avg_scores = {
        'rouge-1': results_df['rouge-1-f'].mean(),
        'rouge-2': results_df['rouge-2-f'].mean(),
        'rouge-l': results_df['rouge-l-f'].mean()
    }
    
    return avg_scores, results_df

# 主程序
if __name__ == "__main__":
    # 1. 加载数据
    test_data = pd.read_csv('your_data.csv')
    
    # 2. 评估模型
    avg_scores, results_df = evaluate_model(
        model,
        tokenizer,
        test_data,
        num_samples=100  # 可选：设置样本数量
    )
    
    # 3. 保存详细结果
    results_df.to_csv('detailed_rouge_scores.csv', index=False)
    
    # 4. 打印平均分数
    print("\n平均ROUGE分数:")
    for metric, score in avg_scores.items():
        print(f"{metric}: {score:.4f}")
    
    # 5. 打印部分示例结果
    print("\n部分示例结果:")
    print(results_df[['instruction', 'generated', 'rouge-1-f', 'rouge-2-f', 'rouge-l-f']].head())