In [1]:
import math

# Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog is fast and the cat is not.",
    "Quick brown foxes leap over lazy dogs in summer."
]

# Step 1: Calculate term frequencies (TF)
def calculate_tf(document, term):
    term_count = document.count(term)
    return term_count / len(document.split())

# Step 2: Calculate inverse document frequencies (IDF)
def calculate_idf(documents, term):
    num_documents = len(documents)
    num_documents_with_term = sum(1 for doc in documents if term in doc)
    return math.log(num_documents / (1 + num_documents_with_term))

# Step 3: Calculate TF-IDF
def calculate_tfidf(documents, term):
    idf = calculate_idf(documents, term)
    tfs = [calculate_tf(doc, term) for doc in documents]
    return [tf * idf for tf in tfs]

# Step 4: Calculate BM25
def calculate_bm25(documents, term, k1=1.2, b=0.75):
    idf = calculate_idf(documents, term)
    tfs = [calculate_tf(doc, term) for doc in documents]
    average_doc_length = sum(len(doc.split()) for doc in documents) / len(documents)
    bm25_scores = []
    for tf, doc in zip(tfs, documents):
        doc_length = len(doc.split())
        numerator = tf * (k1 + 1)
        denominator = tf + k1 * (1 - b + b * (doc_length / average_doc_length))
        bm25_scores.append(idf * (numerator / denominator))
    return bm25_scores

# Example usage
term = "quick"
tfidf_scores = calculate_tfidf(documents, term)
bm25_scores = calculate_bm25(documents, term)

print(f"TF-IDF scores for term '{term}': {tfidf_scores}")
print(f"BM25 scores for term '{term}': {bm25_scores}")

TF-IDF scores for term 'quick': [0.04505167867868493, 0.0, 0.0]
BM25 scores for term 'quick': [0.07559518964728489, 0.0, 0.0]
