In [2]:
import math
from collections import Counter

# Function to compute Term Frequency (TF)
def compute_tf(document):
    """Return the relative frequency of every distinct term in ``document``.

    Args:
        document: A sized collection of hashable terms (e.g. a list of words).

    Returns:
        A dict mapping each distinct term to its number of occurrences
        divided by the total number of terms in ``document``. An empty
        document yields an empty dict.
    """
    doc_length = len(document)
    occurrences = Counter(document)
    # The division only runs for terms that are actually present, so an
    # empty document (doc_length == 0) never divides by zero.
    return {word: n / doc_length for word, n in occurrences.items()}

# Function to compute Inverse Document Frequency (IDF)
def compute_idf(documents):
    """Compute a smoothed inverse document frequency for every corpus term.

    Args:
        documents: A sized collection of documents, each an iterable of
            hashable terms.

    Returns:
        A dict mapping each term seen in the corpus to
        ``log(N / (1 + df))``, where ``N`` is the number of documents and
        ``df`` is the number of documents that contain the term. An empty
        corpus yields an empty dict.
    """
    corpus_size = len(documents)

    # Collapsing each document to a set first means a term contributes at
    # most one count per document, however often it repeats inside it.
    containing_docs = Counter()
    for doc in documents:
        containing_docs.update(set(doc))

    # NOTE: df is always >= 1 for the terms counted above, so the extra 1 in
    # the denominator acts purely as smoothing. As a consequence, a term found
    # in all but one document scores 0 and a term found in every document
    # scores below 0.
    return {
        word: math.log(corpus_size / (1 + df))
        for word, df in containing_docs.items()
    }

# Function to compute TF-IDF for a query and a document
def compute_tfidf(query, document, idf_scores):
    """Score each query term against a single document as TF * IDF.

    Args:
        query: Iterable of terms to score.
        document: The document (collection of terms) the query is scored
            against; its term frequencies come from ``compute_tf``.
        idf_scores: Mapping of term -> IDF value; looked up with ``.get``.

    Returns:
        A dict mapping every query term to its TF-IDF weight for
        ``document``.
    """
    term_frequencies = compute_tf(document)
    # A term absent from the document has TF 0 and a term absent from the
    # IDF table has IDF 0; either one zeroes the product.
    return {
        word: term_frequencies.get(word, 0) * idf_scores.get(word, 0)
        for word in query
    }

# Toy corpus: each document is a pre-tokenised list of terms/words
documents = [
    ['apple', 'banana', 'fruit', 'salad'],
    ['banana', 'orange', 'fruit', 'smoothie'],
    ['dog', 'cat', 'pet', 'animal', 'love'],
    ['cat', 'fish', 'pet', 'love'],
]

# Terms whose TF-IDF weight is reported for every document
query = ['fruit', 'banana']

# IDF depends on the whole corpus, so it is computed once up front
idf_scores = compute_idf(documents)

# Report the query's scores per document, numbering documents from 1
for doc_number, doc_terms in enumerate(documents, start=1):
    scores = compute_tfidf(query, doc_terms, idf_scores)
    print(f"TF-IDF scores for Document {doc_number}: {scores}")


TF-IDF scores for Document 1: {'fruit': 0.07192051811294521, 'banana': 0.07192051811294521}
TF-IDF scores for Document 2: {'fruit': 0.07192051811294521, 'banana': 0.07192051811294521}
TF-IDF scores for Document 3: {'fruit': 0.0, 'banana': 0.0}
TF-IDF scores for Document 4: {'fruit': 0.0, 'banana': 0.0}
