In [None]:
import re
import math
from collections import Counter, defaultdict

def preprocess(sentence):
    # Convert to lowercase
    sentence = sentence.lower()

    # Replace URLs
    sentence = re.sub(r'https?://\S+|www\.\S+', ' URL ', sentence)

    # Replace numbers
    sentence = re.sub(r'\b\d+(\.\d+)?\b', ' NUMBER ', sentence)

    # Replace punctuations
    sentence = re.sub(r'[^\w\s]', ' PUNCT ', sentence)

    # Tokenize (split on whitespace)
    tokens = sentence.split()

    return tokens


def compute_tf_with_normalization(sentence, vocab, smoothing=False):
    tokens = preprocess(sentence)
    total_terms = len(tokens)
    token_counts = Counter(tokens)

    tf = {}
    for term in vocab:
        if smoothing:
            count = token_counts.get(term, 0) + 1
        else:
            count = token_counts.get(term, 0)
        # Normalized TF
        tf[term] = math.log(1 + count / total_terms)

    return tf


def compute_idf(sentences, vocab, smoothing=False):
    N = len(sentences)
    df = defaultdict(int)

    for sentence in sentences:
        tokens = set(preprocess(sentence))
        for term in vocab:
            if term in tokens:
                df[term] += 1

    idf = {}
    for term in vocab:
        if smoothing:
            idf[term] = math.log((N + 1) / (df[term] + 1))
        else:
            if df[term] > 0:
                idf[term] = math.log(N / df[term])
            else:
                idf[term] = 0.0  # unseen word case

    return idf


def compute_tf_idf_scores(sentences, smoothing=False):
    # Build vocabulary
    vocab = sorted(set(token for s in sentences for token in preprocess(s)))

    idf = compute_idf(sentences, vocab, smoothing=smoothing)

    tf_idf_all = []
    for sentence in sentences:
        tf = compute_tf_with_normalization(sentence, vocab, smoothing=smoothing)
        tf_idf = {term: tf[term] * idf[term] for term in vocab}
        tf_idf_all.append(tf_idf)

    return vocab, tf_idf_all

def main():
    sentences = [
        "I scored 95 marks in the exam!",
        "Visit https://example.com for more details.",
        "Numbers like 1000 or 3.14 are replaced.",
        "Punctuation, such as commas, should be replaced too!"
    ]

    print("\n--- Preprocessing ---")
    for s in sentences:
        print(f"Original: {s}")
        print(f"Tokens: {preprocess(s)}\n")

    vocab, tfidf_scores = compute_tf_idf_scores(sentences, smoothing=True)

    print("\n--- Vocabulary ---")
    print(vocab)

    print("\n--- TF-IDF Scores ---")
    for i, sent_scores in enumerate(tfidf_scores):
        print(f"\nSentence {i+1}:")
        for term, score in sent_scores.items():
            if score > 0:
                print(f"{term:15s}: {score:.4f}")


# ----------------------------------------------------------
if __name__ == "__main__":
    main()



--- Preprocessing ---
Original: I scored 95 marks in the exam!
Tokens: ['i', 'scored', 'NUMBER', 'marks', 'in', 'the', 'exam', 'PUNCT']

Original: Visit https://example.com for more details.
Tokens: ['visit', 'URL', 'for', 'more', 'details', 'PUNCT']

Original: Numbers like 1000 or 3.14 are replaced.
Tokens: ['numbers', 'like', 'NUMBER', 'or', 'NUMBER', 'are', 'replaced', 'PUNCT']

Original: Punctuation, such as commas, should be replaced too!
Tokens: ['punctuation', 'PUNCT', 'such', 'as', 'commas', 'PUNCT', 'should', 'be', 'replaced', 'too', 'PUNCT']


--- Vocabulary ---
['NUMBER', 'PUNCT', 'URL', 'are', 'as', 'be', 'commas', 'details', 'exam', 'for', 'i', 'in', 'like', 'marks', 'more', 'numbers', 'or', 'punctuation', 'replaced', 'scored', 'should', 'such', 'the', 'too', 'visit']

--- TF-IDF Scores ---

Sentence 1:
NUMBER         : 0.1140
URL            : 0.1079
are            : 0.1079
as             : 0.1079
be             : 0.1079
commas         : 0.1079
details        : 0.1079
exa

In [2]:
import re
from collections import Counter, defaultdict

# Step 1: Dataset
sentences = [
    "The boy hugs the cat.",
    "The boys are hugging the dogs.",
    "The dogs are chasing the cats.",
    "The dog and the cat sit quietly.",
    "The boy is sitting on the dog."
]

# Step 2: Preprocess (lowercase + remove punctuation)
def preprocess(sent):
    sent = re.sub(r'[^\w\s]', '', sent.lower())
    return sent.split()

corpus = [preprocess(s) for s in sentences]

# Step 3: Convert each word into character tokens
def get_initial_vocab(corpus):
    word_freq = Counter()
    for sent in corpus:
        for word in sent:
            chars = " ".join(list(word)) + " </w>"  # Add end of word marker
            word_freq[chars] += 1
    return word_freq

vocab = get_initial_vocab(corpus)

# Step 4: Function to get pair frequencies
def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[(symbols[i], symbols[i+1])] += freq
    return pairs

# Step 5: Merge the most frequent pair
def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = pattern.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

# Step 6: Apply 20 merge iterations
num_merges = 20
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(f"Merge {i+1}: {best}")

# Step 7: Build final vocabulary
normalized_vocab = set()
for token in vocab:
    for piece in token.split():
        # remove </w> marker, add clean token
        if piece.endswith("</w>"):
            normalized_vocab.add(piece.replace("</w>", ""))
        else:
            normalized_vocab.add(piece)

final_vocab = sorted(normalized_vocab)

print("\nâœ… Normalized WordPiece Vocabulary:")
print(final_vocab)


# Step 8: Tokenize new sentence using learned vocab
def wordpiece_tokenize(word, vocab):
    if word in vocab:
        return [word]
    chars = list(word)
    tokens = []
    i = 0
    while i < len(chars):
        subtoken = None
        for j in range(len(chars), i, -1):
            piece = ''.join(chars[i:j])
            if i > 0:
                piece = '##' + piece
            if piece in vocab:
                subtoken = piece
                break
        if subtoken is None:
            tokens.append('[UNK]')
            break
        tokens.append(subtoken)
        i = j
    return tokens

# Step 9: Tokenize the new test sentence
test_sentence = "The cat is chasing the dog quietly."
test_words = preprocess(test_sentence)

print("\nTokenization of test sentence:")
tokens = []
for word in test_words:
    tokens.extend(wordpiece_tokenize(word, final_vocab))

print(tokens)


Merge 1: ('e', '</w>')
Merge 2: ('t', 'h')
Merge 3: ('th', 'e</w>')
Merge 4: ('s', '</w>')
Merge 5: ('g', '</w>')
Merge 6: ('d', 'o')
Merge 7: ('b', 'o')
Merge 8: ('bo', 'y')
Merge 9: ('g', 's</w>')
Merge 10: ('c', 'a')
Merge 11: ('ca', 't')
Merge 12: ('i', 'n')
Merge 13: ('in', 'g</w>')
Merge 14: ('boy', '</w>')
Merge 15: ('h', 'u')
Merge 16: ('cat', '</w>')
Merge 17: ('a', 'r')
Merge 18: ('ar', 'e</w>')
Merge 19: ('do', 'gs</w>')
Merge 20: ('do', 'g</w>')

âœ… Normalized WordPiece Vocabulary:
['', 'a', 'are', 'boy', 'c', 'cat', 'd', 'dog', 'dogs', 'e', 'g', 'gs', 'h', 'hu', 'i', 'ing', 'l', 'n', 'o', 'q', 's', 't', 'the', 'u', 'y']

Tokenization of test sentence:
['the', 'cat', 'i', '[UNK]', 'c', '[UNK]', 'the', 'dog', 'q', '[UNK]']


In [3]:
import re
import itertools
from collections import defaultdict, Counter
import math

# -------------------------------
# Step 1. Training data
# -------------------------------
data = [
    ("Check out https://example.com for more info!", "Inform"),
    ("Order 3 items, get 1 free! Limited offer!!!", "Promo"),
    ("Your package #12345 will arrive tomorrow.", "Inform"),
    ("Win $1000 now, visit http://winbig.com!!!", "Promo"),
    ("Meeting at 3pm, don't forget to bring the files.", "Reminder"),
    ("Exclusive deal for you: buy 2, get 1 free!!!", "Promo"),
    ("Download the report from https://reports.com.", "Inform"),
    ("The meeting is starting in 10 minutes.", "Reminder"),
    ("Reminder: submit your timesheet by 5pm today.", "Reminder"),
]

# -------------------------------
# Step 2. Preprocessing function
# -------------------------------
def preprocess(sentence):
    # Lowercase
    s = sentence.lower()

    # Replace URLs
    s = re.sub(r'http\S+|www\S+', 'URL', s)

    # Replace numbers
    s = re.sub(r'\d+(\.\d+)?', 'NUMBER', s)

    # Replace punctuation with token PUNCT (keep spaces around for token separation)
    s = re.sub(r'[!.,?#:;\'"-]+', ' PUNCT ', s)

    # Remove multiple spaces
    s = re.sub(r'\s+', ' ', s).strip()

    return s.split()

# Preprocess all sentences
processed_data = [(preprocess(s), label) for s, label in data]

print("âœ… Preprocessed Sentences:\n")
for tokens, label in processed_data:
    print(label, ":", tokens)
print()

# -------------------------------
# Step 3. Feature extraction functions
# -------------------------------

def extract_features(tokens):
    features = {}

    # Binary/frequency features
    features["has_url"] = int("url" in tokens)
    features["has_number"] = int("number" in tokens)
    features["punct_count"] = tokens.count("punct")
    features["has_timeword"] = int(any(t in tokens for t in ["pm", "am", "meeting", "minutes", "today"]))

    # Bigrams (for probability model)
    bigrams = list(zip(tokens[:-1], tokens[1:])) if len(tokens) > 1 else []
    features["bigrams"] = bigrams
    return features

# Collect features per class
class_features = defaultdict(list)
for tokens, label in processed_data:
    feats = extract_features(tokens)
    class_features[label].append(feats)

# -------------------------------
# Step 4. Calculate priors
# -------------------------------
labels = [label for _, label in processed_data]
label_counts = Counter(labels)
total_docs = len(labels)
priors = {lbl: label_counts[lbl] / total_docs for lbl in label_counts}

print("âœ… Class Priors:", priors, "\n")

# -------------------------------
# Step 5. Build bigram probabilities (Add-K smoothing)
# -------------------------------
K = 0.3
class_bigram_counts = defaultdict(Counter)
class_unigram_counts = defaultdict(Counter)
vocab_bigrams = set()

for lbl, feats_list in class_features.items():
    for feats in feats_list:
        for (w1, w2) in feats["bigrams"]:
            class_bigram_counts[lbl][(w1, w2)] += 1
            class_unigram_counts[lbl][w1] += 1
            vocab_bigrams.add((w1, w2))

V = len(vocab_bigrams)

# Compute conditional probabilities P(w2|w1, class)
bigram_probs = defaultdict(dict)
for lbl in class_bigram_counts:
    for (w1, w2) in vocab_bigrams:
        count_bigram = class_bigram_counts[lbl][(w1, w2)]
        count_w1 = class_unigram_counts[lbl][w1]
        prob = (count_bigram + K) / (count_w1 + K * V)
        bigram_probs[lbl][(w1, w2)] = prob

# -------------------------------
# Step 6. Naive Bayes Prediction
# -------------------------------
def predict(sentence):
    tokens = preprocess(sentence)
    feats = extract_features(tokens)

    scores = {}
    for lbl in priors:
        # Start with log prior
        log_prob = math.log(priors[lbl])

        # Binary/frequency features (simple log multipliers)
        if feats["has_url"]:
            log_prob += math.log(1.5) if lbl == "Inform" else math.log(0.8)
        if feats["punct_count"] >= 2 and lbl == "Promo":
            log_prob += math.log(2.0)
        if feats["has_timeword"] and lbl == "Reminder":
            log_prob += math.log(2.0)

        # Bigram probabilities
        for (w1, w2) in feats["bigrams"]:
            if (w1, w2) in bigram_probs[lbl]:
                log_prob += math.log(bigram_probs[lbl][(w1, w2)])
            else:
                # unseen bigram
                log_prob += math.log(K / (K * V))

        scores[lbl] = log_prob

    predicted = max(scores, key=scores.get)
    return predicted, scores, feats

# -------------------------------
# Step 7. Test on given sentence
# -------------------------------
test_sentence = "You will get an exclusive offer in the meeting!"
pred_label, scores, feats = predict(test_sentence)

print("âœ… Test Sentence (after preprocessing):", preprocess(test_sentence))
print("\nExtracted Features:", feats)
print("\nClass Scores:")
for lbl, sc in scores.items():
    print(f"{lbl:10s}: {sc:.4f}")
print("\nðŸŽ¯ Predicted Label:", pred_label)


âœ… Preprocessed Sentences:

Inform : ['check', 'out', 'URL', 'for', 'more', 'info', 'PUNCT']
Promo : ['order', 'NUMBER', 'items', 'PUNCT', 'get', 'NUMBER', 'free', 'PUNCT', 'limited', 'offer', 'PUNCT']
Inform : ['your', 'package', 'PUNCT', 'NUMBER', 'will', 'arrive', 'tomorrow', 'PUNCT']
Promo : ['win', '$NUMBER', 'now', 'PUNCT', 'visit', 'URL']
Reminder : ['meeting', 'at', 'NUMBERpm', 'PUNCT', 'don', 'PUNCT', 't', 'forget', 'to', 'bring', 'the', 'files', 'PUNCT']
Promo : ['exclusive', 'deal', 'for', 'you', 'PUNCT', 'buy', 'NUMBER', 'PUNCT', 'get', 'NUMBER', 'free', 'PUNCT']
Inform : ['download', 'the', 'report', 'from', 'URL']
Reminder : ['the', 'meeting', 'is', 'starting', 'in', 'NUMBER', 'minutes', 'PUNCT']
Reminder : ['reminder', 'PUNCT', 'submit', 'your', 'timesheet', 'by', 'NUMBERpm', 'today', 'PUNCT']

âœ… Class Priors: {'Inform': 0.3333333333333333, 'Promo': 0.3333333333333333, 'Reminder': 0.3333333333333333} 

âœ… Test Sentence (after preprocessing): ['you', 'will', 'get', 'a