### Text Categorization

In [1]:
from collections import defaultdict, Counter
import math

In [2]:
# Toy labelled corpus: each entry is a (text, label) pair, where the label is
# either "tech" or "entertainment" (three documents of each).
documents = [
    ("I love programming in Python", "tech"),
    ("Python and Java are popular programming languages", "tech"),
    ("I enjoy watching movies and series", "entertainment"),
    ("Cinema and film industry is booming", "entertainment"),
    ("Machine learning and AI are future tech", "tech"),
    ("Music concerts are fun", "entertainment")
]

# The whole corpus is used for training (no held-out split). This binds a
# second name to the same list object; it does not copy it.
train_docs = documents

In [4]:
def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Punctuation is not stripped, so it stays attached to its neighbouring token.
    Returns a list of tokens (empty for empty or whitespace-only input).
    """
    lowered = text.lower()
    return lowered.split()

### Naive Bayes (Multinomial)

In [5]:
# Collect the training statistics used by the Naive Bayes classifier:
#   vocab             - every distinct token seen in the training documents
#   class_word_counts - label -> Counter of token occurrences within that class
#   class_counts      - label -> number of training documents with that label
vocab = set()
class_word_counts = defaultdict(Counter)
class_counts = defaultdict(int)

for doc_text, doc_label in train_docs:
    tokens = tokenize(doc_text)
    class_counts[doc_label] += 1
    # Only touch the per-class Counter when there is something to count, so a
    # token-less document does not create an entry for its label.
    if tokens:
        class_word_counts[doc_label].update(tokens)
        vocab.update(tokens)

# Number of training documents across all classes
total_docs = sum(class_counts.values())

# Multinomial Naive Bayes classifier with add-one (Laplace) smoothing
def naive_bayes_predict(doc):
    """Return the label with the highest log-posterior score for ``doc``.

    For each label C the score is
        log P(C) + sum over tokens w of log P(w | C)
    with
        P(C)     = class_counts[C] / total_docs
        P(w | C) = (count of w in C + 1) / (tokens in C + |vocab|)

    Logs are summed instead of multiplying probabilities. When scores tie,
    the label that comes first in ``class_counts`` wins. Returns ``None`` if
    ``class_counts`` is empty.
    """
    tokens = tokenize(doc)

    def log_posterior(label):
        # Prior: fraction of training documents carrying this label
        score = math.log(class_counts[label] / total_docs)
        word_counts = class_word_counts[label]
        denominator = sum(word_counts.values()) + len(vocab)
        # Likelihood: one smoothed term per token occurrence in the document
        for token in tokens:
            score += math.log((word_counts.get(token, 0) + 1) / denominator)
        return score

    # max() keeps the first maximal element, matching a strict ">" scan.
    return max(class_counts, key=log_posterior, default=None)

### Decision Tree using ID3 as attribute selector

In [6]:
def entropy(class_counts_dict):
    """Shannon entropy (in bits) of a label distribution given as counts.

    ``class_counts_dict`` maps each label to its count. Zero counts are
    skipped. Returns 0 when the counts sum to zero.
    """
    total = sum(class_counts_dict.values())
    if total == 0:
        return 0

    nonzero_counts = [c for c in class_counts_dict.values() if c != 0]
    bits = 0
    for c in nonzero_counts:
        fraction = c / total
        bits -= fraction * math.log2(fraction)
    return bits

# Information gain of splitting a document set on the presence of a word
def info_gain(docs, word):
    """Return the information gain from splitting ``docs`` on ``word``.

    ``docs`` is a sequence of (text, label) pairs. The documents are divided
    into those whose tokens contain ``word`` and those whose tokens do not,
    and the gain is

        entropy(all labels) - weighted average of the two branch entropies

    where each branch is weighted by its share of the documents. An empty
    ``docs`` yields 0.0 (nothing to split) instead of dividing by zero.
    """
    total = len(docs)
    if total == 0:
        return 0.0

    parent_entropy = entropy(Counter(label for _, label in docs))

    # Partition the labels in one pass so each document is tokenized once.
    present_labels = Counter()
    absent_labels = Counter()
    for text, label in docs:
        branch = present_labels if word in tokenize(text) else absent_labels
        branch[label] += 1

    n_present = sum(present_labels.values())
    n_absent = total - n_present

    weighted_entropy = (
        (n_present / total) * entropy(present_labels)
        + (n_absent / total) * entropy(absent_labels)
    )
    return parent_entropy - weighted_entropy  # IG formula

# Select the split word: the vocabulary word with the highest information gain.
all_words = set()
for text, _ in train_docs:
    all_words.update(tokenize(text))

# Several words can have exactly the same gain (in this corpus "python" and
# "programming" occur in the same two documents). Iteration order over a set of
# strings varies between interpreter runs, so max() on the gain alone could
# pick a different word each time. Adding the word itself as a secondary key
# makes the choice reproducible: among equal gains the lexicographically
# largest word wins.
best_word_id3 = max(all_words, key=lambda w: (info_gain(train_docs, w), w))
print("Selected Word (ID3 split):", best_word_id3)

def decision_tree_id3_predict(doc):
    """Classify ``doc`` with a one-level decision tree on ``best_word_id3``.

    The document goes to the "word present" or "word absent" branch, and the
    prediction is the most common training label in that branch. The leaf
    labels are derived from ``train_docs`` rather than hard-coded, so the
    result stays correct whichever class the selected word indicates. If no
    training document falls into the branch, the overall majority label is
    used. Ties go to the label encountered first in ``train_docs``.
    """
    has_word = best_word_id3 in tokenize(doc)

    # Labels of the training documents that land in the same branch as `doc`
    branch_labels = Counter(
        label
        for text, label in train_docs
        if (best_word_id3 in tokenize(text)) == has_word
    )
    if not branch_labels:
        # Empty branch (e.g. the word occurs in every training document)
        branch_labels = Counter(label for _, label in train_docs)

    return branch_labels.most_common(1)[0][0]

Selected Word (ID3 split): python


### KNN (Cosine Similarity)

In [11]:
# Bag-of-words representation of a text
def tf_vector(text):
    """Return a Counter mapping each token of ``text`` to its raw occurrence count."""
    return Counter(tokenize(text))

# Cosine of the angle between two sparse vectors
def cosine_similarity(vec1, vec2):
    """Cosine similarity of two sparse vectors given as term -> weight mappings.

    A term missing from a mapping counts as weight 0. Returns 0 when either
    vector has zero magnitude (for example, an empty mapping); otherwise
    returns dot(vec1, vec2) / (|vec1| * |vec2|).
    """
    def magnitude(vec):
        # Euclidean length of the vector
        return math.sqrt(sum(weight ** 2 for weight in vec.values()))

    terms = set(vec1).union(vec2)
    dot = sum(vec1.get(term, 0) * vec2.get(term, 0) for term in terms)
    mag1 = magnitude(vec1)
    mag2 = magnitude(vec2)

    if mag1 != 0 and mag2 != 0:
        return dot / (mag1 * mag2)
    return 0

# k-nearest-neighbour classifier using cosine similarity between term-frequency vectors
def knn_cosine_predict(doc, k=3):
    """Return the majority label among the ``k`` training documents most similar to ``doc``.

    Every document in ``train_docs`` is scored against ``doc`` by cosine
    similarity of term-frequency vectors. Documents are ranked by descending
    similarity (the sort is stable, so equal scores keep training order) and
    the labels of the first ``k`` vote. A tied vote goes to the label that
    appears first in the ranking.

    Raises IndexError if there are no neighbours to vote (for example, ``k=0``
    or an empty training set).
    """
    query_vec = tf_vector(doc)
    scored = [
        (cosine_similarity(query_vec, tf_vector(text)), label)
        for text, label in train_docs
    ]
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    votes = Counter(label for _, label in ranked[:k])
    return votes.most_common(1)[0][0]


### Rocchio Algorithm

In [14]:
# Per-class centroids: label -> {token: mean occurrences per document of that class}
centroids = defaultdict(lambda: defaultdict(float))
# label -> number of training documents with that label (despite the name,
# this counts documents, not words)
class_counts_words = defaultdict(int)

# First pass: accumulate raw token counts for each class
for doc_text, doc_label in train_docs:
    tokens = tokenize(doc_text)
    class_counts_words[doc_label] += 1
    for token in tokens:
        centroids[doc_label][token] += 1

# Second pass: centroid = sum(word vectors) / number of documents in the class
for doc_label, centroid in centroids.items():
    docs_in_class = class_counts_words[doc_label]
    for token in centroid:
        centroid[token] = centroid[token] / docs_in_class

def rocchio_predict(doc):
    """Return the label whose centroid scores highest against ``doc``.

    A class's score is the sum of its centroid weights over every token
    occurrence in ``doc``. Tokens absent from the centroid contribute 0, and a
    repeated token is counted once per occurrence. When scores tie, the label
    that comes first in ``centroids`` wins. Returns ``None`` if ``centroids``
    is empty.
    """
    tokens = tokenize(doc)
    scores = {
        label: sum(centroid.get(token, 0) for token in tokens)
        for label, centroid in centroids.items()
    }
    # max() keeps the first maximal key, matching a strict ">" scan.
    return max(scores, key=scores.get, default=None)


### Outputs

In [13]:
# Unlabelled example documents to run through every classifier
test_docs = [
    "I love AI and programming",
    "Watching movies and cinema is fun",
    "Python machine learning"
]

# (heading, predict function) for each classifier defined above. The KNN entry
# uses knn_cosine_predict, the name the function is actually defined under;
# the previous call to `knn_predict` raised NameError on a fresh kernel.
classifiers = [
    ("Naive Bayes", naive_bayes_predict),
    ("Decision Tree (ID3)", decision_tree_id3_predict),
    ("KNN", knn_cosine_predict),
    ("Rocchio", rocchio_predict),
]

for position, (heading, predict) in enumerate(classifiers):
    # Blank line between sections, but not before the first one
    separator = "\n" if position else ""
    print(f"{separator}{heading} Predictions:")
    for doc in test_docs:
        print(f"'{doc}' -> {predict(doc)}")

Naive Bayes Predictions:
'I love AI and programming' -> tech
'Watching movies and cinema is fun' -> entertainment
'Python machine learning' -> tech

Decision Tree (ID3) Predictions:
'I love AI and programming' -> entertainment
'Watching movies and cinema is fun' -> entertainment
'Python machine learning' -> tech

KNN Predictions:
'I love AI and programming' -> tech
'Watching movies and cinema is fun' -> entertainment
'Python machine learning' -> tech

Rocchio Predictions:
'I love AI and programming' -> tech
'Watching movies and cinema is fun' -> entertainment
'Python machine learning' -> tech
