In [27]:
import re, math
from dataclasses import dataclass
from typing import List, Tuple, Set, Iterable
from collections import Counter, defaultdict

In [28]:
# ==================================================
# 0) Sample data (label, text)
# ==================================================
# Toy corpus as (label, text) pairs: five sentences per label, label 0 first.
# The label-0 sentences are about government/politics, the label-1 ones about sports.
docs = [
    (label, text)
    for label, texts in (
        (0, (
            "The government announced a new economic policy today.",
            "The election results sparked debates across the country.",
            "The president met with foreign leaders to discuss trade.",
            "New laws were introduced to reform the healthcare system.",
            "The parliament voted on the proposed budget plan.",
        )),
        (1, (
            "The football team won the championship after a tough season.",
            "Fans celebrated the victory late into the night.",
            "The coach emphasized teamwork and discipline.",
            "A star player scored the winning goal in the final match.",
            "The team trained hard to prepare for the tournament.",
        )),
    )
    for text in texts
]

In [29]:
# Build the skeleton: the following cells define each pipeline stage as a function; the last cell runs them.

In [30]:

# ==================================================
# 1) make_subset
# ==================================================
def make_subset(docs: List[Tuple[int, str]]) -> Tuple[
    List[Tuple[int, str]],
    List[Tuple[int, str]]
]:
    """Split labelled documents into a label-0 list and a label-1 list.

    Args:
        docs: ``(label, text)`` pairs.

    Returns:
        ``(label_0_docs, label_1_docs)``. Each list keeps the input order.
        Pairs whose label is neither 0 nor 1 are left out of both lists.
    """
    zeros: List[Tuple[int, str]] = []
    ones: List[Tuple[int, str]] = []

    for label, text in docs:
        # Choose the destination list; None means "label outside {0, 1}".
        bucket = zeros if label == 0 else ones if label == 1 else None
        if bucket is not None:
            bucket.append((label, text))

    return zeros, ones


In [31]:
# ==================================================
# 2) Preprocessing config
# ==================================================
@dataclass(frozen=True)
class PrepConfig:
    """Settings consumed by ``tokenize`` to turn raw text into tokens.

    ``frozen=True`` blocks re-assigning the fields after construction; the
    ``stopwords`` set object itself can still be mutated in place.
    """
    # Compiled regex; every non-overlapping match in the text becomes a token.
    pattern: re.Pattern
    # Tokens contained in this set are discarded; an empty set skips the filter.
    stopwords: Set[str]
    # Tokens shorter than this are discarded; the filter only runs when > 1.
    min_len: int = 2
    # When True, tokens for which str.isdigit() holds are discarded.
    drop_pure_digits: bool = True
    # When True, the text is lowercased before the pattern is applied.
    lowercase: bool = True

# Function words removed during tokenization, grouped by role for readability.
STOPWORDS = {
    # articles
    "the", "a", "an",
    # conjunctions
    "and", "or",
    # prepositions / particles
    "to", "of", "for", "with", "in", "on", "after", "as",
    # forms of "to be"
    "was", "were", "is", "are",
    # pronouns / demonstratives
    "we", "i", "them", "they", "this", "that", "it",
    # other words treated as noise
    "new", "helps",
}

# Default preprocessing configuration (min_len, drop_pure_digits and lowercase
# keep their dataclass defaults).
# The pattern has no upper-case range, so it relies on the text being
# lowercased before matching.
CFG = PrepConfig(
    stopwords=STOPWORDS,
    pattern=re.compile(r"[a-z0-9%]+"),
)

In [32]:
# `tokenize` (next cell) normalizes and tokenizes text: optional lowercasing, regex-based token extraction, then noise filters (minimum length, stopwords, digit-only tokens).

In [33]:
# ==================================================
# 3) tokenize / text_prep
# ==================================================
def tokenize(text: str, cfg: PrepConfig = CFG) -> List[str]:
    """Turn raw text into a filtered list of tokens.

    Args:
        text: The string to tokenize.
        cfg: Preprocessing settings (pattern, stopwords, length and digit
            filters, lowercasing).

    Returns:
        The regex matches of ``cfg.pattern`` in their original order, minus
        every token rejected by an enabled filter.
    """
    # Case normalization happens before matching so the pattern sees the
    # normalized text.
    source = text.lower() if cfg.lowercase else text

    # Resolve which filters are active once, rather than per token.
    check_length = cfg.min_len > 1
    check_stopwords = bool(cfg.stopwords)
    check_digits = cfg.drop_pure_digits

    def keep(token: str) -> bool:
        # Too short to carry information.
        if check_length and len(token) < cfg.min_len:
            return False
        # Common function word.
        if check_stopwords and token in cfg.stopwords:
            return False
        # Digits only; would otherwise inflate the vocabulary.
        if check_digits and token.isdigit():
            return False
        return True

    return [token for token in cfg.pattern.findall(source) if keep(token)]

def text_prep(
    docs: Iterable[Tuple[int, str]],
    cfg: PrepConfig = CFG
) -> List[Tuple[int, List[str]]]:
    """Tokenize the text of every labelled document.

    Args:
        docs: ``(label, text)`` pairs.
        cfg: Preprocessing settings forwarded to ``tokenize``.

    Returns:
        ``(label, tokens)`` pairs in the same order as ``docs``.
    """
    prepared: List[Tuple[int, List[str]]] = []
    for label, text in docs:
        prepared.append((label, tokenize(text, cfg)))
    return prepared


In [34]:
# ==================================================
# 4) tokens → sparse vectors (TF)
# ==================================================
def to_sparse_vectors(
    prepped_docs: Iterable[Tuple[int, List[str]]]
) -> List[Tuple[int, Counter]]:
    """Convert token lists into sparse term-frequency vectors.

    Args:
        prepped_docs: ``(label, tokens)`` pairs.

    Returns:
        ``(label, Counter)`` pairs in input order, where each Counter maps a
        token to the number of times it occurs in that document.
    """
    vectors: List[Tuple[int, Counter]] = []
    for label, tokens in prepped_docs:
        term_counts = Counter(tokens)
        vectors.append((label, term_counts))
    return vectors

In [41]:
# Rocchio training computes one centroid per class by averaging the document vectors that belong to that class.

In [37]:
# ==================================================
# 5) Rocchio: centroid (class prototype)
# ==================================================
def make_centroid(vectors: List[Counter]) -> Counter:
    """Average a class's document vectors into a single prototype vector.

    Args:
        vectors: Sparse term-frequency vectors of one class.

    Returns:
        A Counter mapping each term seen in ``vectors`` to its summed weight
        divided by the number of vectors. An empty Counter when ``vectors``
        is empty.
    """
    # Nothing to average: hand back an empty vector.
    if not vectors:
        return Counter()

    # Accumulate per-term totals over all documents of the class.
    totals = Counter()
    for doc_vector in vectors:
        totals.update(doc_vector)

    # Divide by the document count (not by the number of documents containing
    # the term), so absent terms count as zero in the mean.
    doc_count = len(vectors)
    return Counter({term: total / doc_count for term, total in totals.items()})


def train_rocchio(
    sparse_docs: List[Tuple[int, Counter]]
) -> dict:
    """Build one centroid per class label.

    Args:
        sparse_docs: ``(label, vector)`` pairs.

    Returns:
        A dict ``{label: centroid}`` with labels in order of first appearance,
        where each centroid is ``make_centroid`` applied to that label's
        vectors.
    """
    # Bucket the vectors: {label: [vector, vector, ...]}.
    grouped: dict = {}
    for label, vec in sparse_docs:
        grouped.setdefault(label, []).append(vec)

    # Collapse every bucket into its class prototype.
    centroids = {}
    for label, members in grouped.items():
        centroids[label] = make_centroid(members)
    return centroids


In [38]:
# ==================================================
# 6) cosine similarity
# ==================================================
def cosine_sim(a: Counter, b: Counter) -> float:
    """Cosine similarity between two sparse vectors.

    Args:
        a: First vector (term -> weight).
        b: Second vector (term -> weight).

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        length (which includes empty vectors).
    """
    def l2_norm(vec: Counter) -> float:
        # Euclidean length of a sparse vector.
        return math.sqrt(sum(weight * weight for weight in vec.values()))

    # Only terms present in `a` can contribute; terms missing from `b` add 0.
    dot = sum(weight * b.get(term, 0.0) for term, weight in a.items())
    norm_a, norm_b = l2_norm(a), l2_norm(b)

    if norm_a != 0 and norm_b != 0:
        return dot / (norm_a * norm_b)
    # A zero-length vector has no direction; define the similarity as 0.
    return 0.0


In [39]:
# ==================================================
# 7) predict
# ==================================================
def predict_rocchio(vec: Counter, centroids: dict) -> int:
    """Return the label whose centroid is most similar to ``vec``.

    Args:
        vec: Sparse vector of the document to classify.
        centroids: ``{label: centroid}`` mapping.

    Returns:
        The label with the highest cosine similarity; on ties, the label that
        comes first in ``centroids``.

    Raises:
        ValueError: If ``centroids`` is empty (raised by ``max``).
    """
    def similarity_to(label) -> float:
        return cosine_sim(vec, centroids[label])

    return max(centroids, key=similarity_to)

In [40]:

# ==================================================
# 8) RUN
# ==================================================
# Split the corpus by label. The two lists are not used by the rest of this
# cell; the call just exercises make_subset.
c0_docs, c1_docs = make_subset(docs)

# Preprocess: (label, text) -> (label, tokens), using the default CFG.
prepped_docs = text_prep(docs)

# Vectorize: (label, tokens) -> (label, term-frequency Counter).
sparse_docs = to_sparse_vectors(prepped_docs)

# Train: one centroid per label.
centroids = train_rocchio(sparse_docs)

# Report how many distinct terms each class centroid contains.
print("=== Centroids ===")
for label, vec in centroids.items():
    print(f"label={label}, unique_terms={len(vec)}")

# Classify the same documents the centroids were built from. This is a sanity
# check of the pipeline, not an estimate of accuracy on unseen text.
print("\n=== Predictions (training data) ===")
for i, (label, vec) in enumerate(sparse_docs):
    pred = predict_rocchio(vec, centroids)
    print(f"doc {i}: true={label}, pred={pred}")

=== Centroids ===
label=0, unique_terms=27
label=1, unique_terms=27

=== Predictions (training data) ===
doc 0: true=0, pred=0
doc 1: true=0, pred=0
doc 2: true=0, pred=0
doc 3: true=0, pred=0
doc 4: true=0, pred=0
doc 5: true=1, pred=1
doc 6: true=1, pred=1
doc 7: true=1, pred=1
doc 8: true=1, pred=1
doc 9: true=1, pred=1
