In [None]:
import re
import math
from collections import Counter, defaultdict

# ----------------------------------------------------------
# 1️⃣ Preprocessing
# ----------------------------------------------------------
def preprocess(sentence):
    """
    Lowercase a sentence, mask special content, and split it into tokens.

    Substitutions are applied in a fixed order after lowercasing:
    1. URLs (``http://``, ``https://`` or ``www.`` prefixes) -> 'URL'
    2. Integers and simple decimals                          -> 'NUMBER'
    3. Any character that is neither a word character nor
       whitespace                                            -> 'PUNCT'

    The placeholders are uppercase while the text is lowercased first,
    so they cannot collide with ordinary words from the input.

    Returns the list of whitespace-separated tokens.
    """
    # Order matters: URLs must be masked before their digits and
    # punctuation are picked up by the later patterns.
    substitutions = (
        (r'https?://\S+|www\.\S+', ' URL '),
        (r'\b\d+(\.\d+)?\b', ' NUMBER '),
        (r'[^\w\s]', ' PUNCT '),
    )

    text = sentence.lower()
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)

    # Placeholders are padded with spaces, so a plain split tokenizes.
    return text.split()


# ----------------------------------------------------------
# 2️⃣ Term Frequency (TF)
# ----------------------------------------------------------
def compute_tf_with_normalization(sentence, vocab, smoothing=False):
    """
    Compute the normalized term frequency of every vocabulary term
    for one sentence.

    TF(t, d) = log(1 + count(t, d) / total_terms)

    Args:
        sentence: Raw sentence; it is tokenized with ``preprocess``.
        vocab: Iterable of terms to score.
        smoothing: If True, 1 is added to every term's count before
            normalizing, so terms absent from the sentence still get a
            non-zero TF. The denominator is not adjusted.

    Returns:
        Dict mapping each term in ``vocab`` to its TF value. A sentence
        that yields no tokens (empty or whitespace-only) has no
        frequencies to normalize, so every term maps to 0.0.
    """
    tokens = preprocess(sentence)
    total_terms = len(tokens)

    # Guard against division by zero for empty / whitespace-only input.
    if total_terms == 0:
        return {term: 0.0 for term in vocab}

    token_counts = Counter(tokens)
    extra = 1 if smoothing else 0

    # Counter returns 0 for missing terms without inserting them.
    return {
        term: math.log(1 + (token_counts[term] + extra) / total_terms)
        for term in vocab
    }


# ----------------------------------------------------------
# 3️⃣ Inverse Document Frequency (IDF)
# ----------------------------------------------------------
def compute_idf(sentences, vocab, smoothing=False):
    """
    Compute the inverse document frequency of every vocabulary term.

    IDF(t) = log(N / df(t))
    With smoothing=True: IDF(t) = log((N + 1) / (df(t) + 1))

    Args:
        sentences: Sequence of raw sentences; each one is a document and
            is tokenized with ``preprocess``.
        vocab: Iterable of terms to score.
        smoothing: Selects the smoothed formula above.

    Returns:
        Dict mapping each term in ``vocab`` to its IDF. Without
        smoothing, a term that occurs in no sentence gets 0.0.
    """
    num_docs = len(sentences)
    vocab_terms = set(vocab)

    # Count each term at most once per sentence. Intersecting with the
    # vocabulary set avoids scanning the whole vocabulary per sentence
    # and prevents double counting if ``vocab`` contains duplicates.
    df = Counter()
    for sentence in sentences:
        df.update(vocab_terms.intersection(preprocess(sentence)))

    idf = {}
    for term in vocab:
        doc_freq = df[term]
        if smoothing:
            idf[term] = math.log((num_docs + 1) / (doc_freq + 1))
        elif doc_freq > 0:
            idf[term] = math.log(num_docs / doc_freq)
        else:
            idf[term] = 0.0  # unseen word case

    return idf


# ----------------------------------------------------------
# 4️⃣ TF-IDF computation
# ----------------------------------------------------------
def compute_tf_idf_scores(sentences, smoothing=False):
    """
    Score every sentence against a vocabulary built from all sentences.

    The vocabulary is the sorted set of tokens that ``preprocess``
    produces across ``sentences``. The ``smoothing`` flag is forwarded
    to both the TF and the IDF computation.

    Returns a tuple ``(vocab, scores)`` where ``scores`` holds one dict
    per sentence, in input order, mapping each vocabulary term to
    TF * IDF.
    """
    # Collect the distinct tokens of the whole corpus.
    unique_tokens = set()
    for text in sentences:
        unique_tokens.update(preprocess(text))
    vocab = sorted(unique_tokens)

    # IDF is corpus-wide, so compute it once up front.
    idf = compute_idf(sentences, vocab, smoothing=smoothing)

    def score_sentence(text):
        tf = compute_tf_with_normalization(text, vocab, smoothing=smoothing)
        return {term: tf[term] * idf[term] for term in vocab}

    return vocab, [score_sentence(text) for text in sentences]


# ----------------------------------------------------------
# 5️⃣ Main function
# ----------------------------------------------------------
def main():
    """
    Demo driver: print the tokens of a few sample sentences, then the
    vocabulary and the positive smoothed TF-IDF scores per sentence.
    """
    sentences = [
        "I scored 95 marks in the exam!",
        "Visit https://example.com for more details.",
        "Numbers like 1000 or 3.14 are replaced.",
        "Punctuation, such as commas, should be replaced too!"
    ]

    print("\n--- Preprocessing ---")
    for text in sentences:
        tokens = preprocess(text)
        print(f"Original: {text}")
        print(f"Tokens: {tokens}\n")

    vocab, tfidf_scores = compute_tf_idf_scores(sentences, smoothing=True)

    print("\n--- Vocabulary ---")
    print(vocab)

    print("\n--- TF-IDF Scores ---")
    for idx, scores_by_term in enumerate(tfidf_scores):
        print(f"\nSentence {idx + 1}:")
        for term in scores_by_term:
            score = scores_by_term[term]
            # Only terms with a strictly positive score are reported.
            if not score > 0:
                continue
            print(f"{term:15s}: {score:.4f}")


# ----------------------------------------------------------
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



--- Preprocessing ---
Original: I scored 95 marks in the exam!
Tokens: ['i', 'scored', 'NUMBER', 'marks', 'in', 'the', 'exam', 'PUNCT']

Original: Visit https://example.com for more details.
Tokens: ['visit', 'URL', 'for', 'more', 'details', 'PUNCT']

Original: Numbers like 1000 or 3.14 are replaced.
Tokens: ['numbers', 'like', 'NUMBER', 'or', 'NUMBER', 'are', 'replaced', 'PUNCT']

Original: Punctuation, such as commas, should be replaced too!
Tokens: ['punctuation', 'PUNCT', 'such', 'as', 'commas', 'PUNCT', 'should', 'be', 'replaced', 'too', 'PUNCT']


--- Vocabulary ---
['NUMBER', 'PUNCT', 'URL', 'are', 'as', 'be', 'commas', 'details', 'exam', 'for', 'i', 'in', 'like', 'marks', 'more', 'numbers', 'or', 'punctuation', 'replaced', 'scored', 'should', 'such', 'the', 'too', 'visit']

--- TF-IDF Scores ---

Sentence 1:
NUMBER         : 0.1140
URL            : 0.1079
are            : 0.1079
as             : 0.1079
be             : 0.1079
commas         : 0.1079
details        : 0.1079
exa