In [None]:
import math
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

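# Custom TF-IDF implementation.
# NOTE: compared with scikit-learn's *default* TfidfVectorizer, only the first
# document differs while the others agree. The cause is tokenisation, not the
# math: the default token_pattern r"(?u)\b\w\w+\b" keeps only tokens of two or
# more characters, so the word "a" in the first document is dropped by sklearn
# but kept by the whitespace split used here. (Dividing TF by the document
# length makes no difference, because it cancels out under L2 normalisation.)
# A manual calculation in Excel confirms the values produced by this code.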
def compute_tf(word: str, count: Counter):
    """Return the term frequency of ``word`` in a single document.

    Args:
        word: The term to look up.
        count: Term counts of one document.

    Returns:
        The count of ``word`` divided by the sum of all counts in ``count``,
        or ``0`` when ``word`` is not a key of ``count``.
    """
    if word not in count:
        return 0
    total_terms = sum(count.values())
    return count[word] / total_terms

def compute_idf(word: str, count_all: list[Counter]):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``ln((N + 1) / (df + 1)) + 1`` where ``N`` is the number of
    documents and ``df`` is the number of documents that contain ``word``.

    Args:
        word: The term to score.
        count_all: One term-count mapping per document.

    Returns:
        The smoothed IDF as a float.
    """
    total_docs = len(count_all)
    docs_with_word = len([doc_count for doc_count in count_all if word in doc_count])
    # The +1 in numerator and denominator avoids a division by zero for
    # words that occur in no document.
    smoothed_ratio = (total_docs + 1) / (docs_with_word + 1)
    return 1 + math.log(smoothed_ratio)

def compute_tfidf_doc(count: Counter, count_all: list[Counter], vocab: set):
    """Return the L2-normalised TF-IDF vector of one document.

    Every word in ``vocab`` gets an entry, including words that do not occur
    in this document (their score is zero).

    Args:
        count: Term counts of the document being scored.
        count_all: Term counts of every document; used for the IDF part.
        vocab: The words to score.

    Returns:
        A dict mapping each word in ``vocab`` to its TF-IDF score as a plain
        float, scaled so the vector has unit Euclidean length. When all raw
        scores are zero the unscaled (all-zero) scores are returned.
    """
    # Raw TF-IDF for every vocabulary word.
    raw_scores = {
        word: compute_tf(word, count) * compute_idf(word, count_all)
        for word in vocab
    }

    # L2 normalisation. math.sqrt (rather than np.sqrt) keeps the values as
    # built-in floats on both the zero-norm and the normal path.
    norm = math.sqrt(sum(score * score for score in raw_scores.values()))
    if norm == 0:
        # Nothing to scale; also avoids dividing by zero.
        return raw_scores
    return {word: score / norm for word, score in raw_scores.items()}

def ready_data(documents: list[str]):
    """Compute the normalised TF-IDF vector of every document.

    Args:
        documents: Raw texts; each one is tokenised by splitting on
            whitespace.

    Returns:
        A list with one dict per document, in input order, as produced by
        ``compute_tfidf_doc`` over the vocabulary of all documents.
    """
    # Per-document term counts.
    count_all = [Counter(text.split()) for text in documents]

    # Global vocabulary: the union of every document's terms.
    vocab = set().union(*(set(doc_count.keys()) for doc_count in count_all))

    # One normalised TF-IDF mapping per document.
    return [compute_tfidf_doc(doc_count, count_all, vocab) for doc_count in count_all]

# Sample corpus used by the demo.
documents = [
    "this is a sample text",
    "this text is different from the first one",
    "sample text for testing"
]

# Run the custom implementation and print every non-zero score, with the
# words of each document in alphabetical order.
print("自定义 TF-IDF 实现结果:")
custom_tfidf = ready_data(documents)
for i, doc_tfidf in enumerate(custom_tfidf):
    print(f"文档 {i}:")
    positive_scores = [
        (word, score) for word, score in sorted(doc_tfidf.items()) if score > 0
    ]
    for word, score in positive_scores:
        print(f"{word}: {score:.4f}")
    print()

# Cross-check against scikit-learn.
print("Sklearn TF-IDF 实现结果:")
# TfidfVectorizer's default token_pattern is r"(?u)\b\w\w+\b", which only keeps
# tokens of two or more characters and therefore drops the word "a" from the
# first document. The custom implementation splits on whitespace and keeps it,
# so the two results disagree for that document unless single-character
# tokens are allowed here as well.
vectorizer = TfidfVectorizer(norm='l2', smooth_idf=True, token_pattern=r"(?u)\b\w+\b")
sklearn_tfidf = vectorizer.fit_transform(documents).toarray()
feature_names = vectorizer.get_feature_names_out()

# Print every non-zero score of each document, in feature (column) order.
for i, doc_vector in enumerate(sklearn_tfidf):
    print(f"文档 {i}:")
    for j, score in enumerate(doc_vector):
        if score > 0:
            print(f"{feature_names[j]}: {score:.4f}")
    print()

# Compare the TF-IDF score of one specific word, document by document,
# between the custom implementation and scikit-learn.
test_word = "sample"
if test_word in feature_names:
    # Column of the sklearn matrix that belongs to the word.
    sklearn_idx = np.where(feature_names == test_word)[0][0]
    print(f"\n对比 '{test_word}' 的 TF-IDF 值:")
    sklearn_column = sklearn_tfidf[:, sklearn_idx]
    for i, (custom_doc, sklearn_score) in enumerate(zip(custom_tfidf, sklearn_column)):
        custom_score = custom_doc.get(test_word, 0)
        print(f"文档 {i} - 自定义: {custom_score:.4f}, Sklearn: {sklearn_score:.4f}")

自定义 TF-IDF 实现结果:
{'sample': 0.2575364144903562, 'a': 0.3386294361119891, 'different': 0.0, 'this': 0.2575364144903562, 'first': 0.0, 'is': 0.2575364144903562, 'one': 0.0, 'testing': 0.0, 'text': 0.2, 'the': 0.0, 'from': 0.0, 'for': 0.0}
0.5946805103306896
{'sample': np.float64(0.43306684852870914), 'a': np.float64(0.5694308628404254), 'different': np.float64(0.0), 'this': np.float64(0.43306684852870914), 'first': np.float64(0.0), 'is': np.float64(0.43306684852870914), 'one': np.float64(0.0), 'testing': np.float64(0.0), 'text': np.float64(0.3363150406405351), 'the': np.float64(0.0), 'from': np.float64(0.0), 'for': np.float64(0.0)}
{'sample': 0.0, 'a': 0.0, 'different': 0.21164339756999317, 'this': 0.1609602590564726, 'first': 0.21164339756999317, 'is': 0.1609602590564726, 'one': 0.21164339756999317, 'testing': 0.0, 'text': 0.125, 'the': 0.21164339756999317, 'from': 0.21164339756999317, 'for': 0.0}
0.5398203855597753
{'sample': np.float64(0.0), 'a': np.float64(0.0), 'different': np.float