In [None]:
# Rocchio (nearest-centroid) text classifier: end-to-end walkthrough

In [1]:
import math
import re
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import AbstractSet, Iterable, List, Set, Tuple

In [2]:
# ==================================================
# 0) Sample data (label, text)
# ==================================================
# Ten labelled sentences: five with label 0 followed by five with label 1.
# Each entry is a (label, text) tuple.
docs = [
    # ---- label 0 ----
    *(
        (0, sentence)
        for sentence in (
            "The government announced a new economic policy today.",
            "The election results sparked debates across the country.",
            "The president met with foreign leaders to discuss trade.",
            "New laws were introduced to reform the healthcare system.",
            "The parliament voted on the proposed budget plan.",
        )
    ),
    # ---- label 1 ----
    *(
        (1, sentence)
        for sentence in (
            "The football team won the championship after a tough season.",
            "Fans celebrated the victory late into the night.",
            "The coach emphasized teamwork and discipline.",
            "A star player scored the winning goal in the final match.",
            "The team trained hard to prepare for the tournament.",
        )
    ),
]

In [3]:
# Build the skeleton

In [5]:
# ==================================================
# 1) make_subset
# ==================================================
def make_subset(docs: List[Tuple[int, str]]) -> Tuple[
    List[Tuple[int, str]],
    List[Tuple[int, str]]
]:
    """Split labelled documents into a class-0 list and a class-1 list.

    Args:
        docs: (label, text) pairs.

    Returns:
        ``(zeros, ones)``: the pairs labelled 0 and the pairs labelled 1,
        each in their original order. Pairs carrying any other label are
        dropped (binary classification is assumed).
    """
    zeros: List[Tuple[int, str]] = []
    ones: List[Tuple[int, str]] = []

    for pair in docs:
        label, text = pair

        # Choose the destination bucket; labels other than 0/1 are skipped
        if label == 0:
            bucket = zeros
        elif label == 1:
            bucket = ones
        else:
            continue

        bucket.append((label, text))

    return zeros, ones


In [7]:
# ==================================================
# 2) Preprocessing config
# ==================================================

from dataclasses import dataclass
from typing import Set
import re


@dataclass(frozen=True)
class PrepConfig:
    """Immutable bundle of tokenizer settings.

    ``stopwords`` may be passed as any iterable of strings (typically a
    ``set``). It is copied into a ``frozenset`` at construction time, so:

    * later edits to the caller's collection cannot silently change an
      already-built config, and
    * the instance is hashable, as a frozen dataclass is expected to be
      (with a plain ``set`` field, ``hash(cfg)`` raises ``TypeError``).
    """

    # Compiled regular expression whose matches become the tokens
    # e.g., r"[a-z0-9%]+" matches runs of lowercase letters, digits and '%'
    pattern: re.Pattern

    # Words to discard during preprocessing (stored as a frozenset)
    stopwords: AbstractSet[str]

    # Minimum token length to keep (e.g., 2 drops single-character tokens)
    min_len: int = 2

    # Whether to drop tokens made up of digits only (e.g., "123")
    drop_pure_digits: bool = True

    # Whether to lowercase the text before the pattern is applied
    lowercase: bool = True

    def __post_init__(self) -> None:
        # frozen=True blocks ordinary attribute assignment, so the one-time
        # normalization has to go through object.__setattr__.
        object.__setattr__(self, "stopwords", frozenset(self.stopwords))


# Project-specific stopword list: a small hand-picked set of words that
# preprocessing discards (deliberately not a large generic list)
STOPWORDS = {
    # articles and conjunctions
    "the", "a", "an", "and", "or",
    # prepositions and similar function words
    "to", "of", "for", "with", "in", "on", "after", "as",
    # forms of "to be"
    "was", "were", "is", "are",
    # pronouns and determiners
    "we", "i", "them", "they", "this", "that", "it",
    # miscellaneous
    "new", "helps",
}

# Default preprocessing configuration for the whole pipeline.
# tokenize() and text_prep() fall back to this object when no cfg is given;
# min_len, drop_pure_digits and lowercase keep their PrepConfig defaults.
CFG = PrepConfig(
    stopwords=STOPWORDS,
    pattern=re.compile(r"[a-z0-9%]+"),
)


In [8]:
# ==================================================
# 3) tokenize / text_prep
# ==================================================
def tokenize(text: str, cfg: PrepConfig = CFG) -> List[str]:
    """Turn raw text into a list of normalized tokens.

    Steps, in order:
      1. lowercase the text when ``cfg.lowercase`` is set;
      2. extract candidate tokens with ``cfg.pattern.findall``;
      3. keep a candidate only if it passes every enabled filter:
         minimum length, not a stopword, not digits-only.

    Args:
        text: The raw document text.
        cfg: Preprocessing settings; defaults to the module-level ``CFG``.

    Returns:
        The surviving tokens, in the order they appear in the text.
    """
    source = text.lower() if cfg.lowercase else text

    # Decide once which filters are active
    check_length = cfg.min_len > 1
    check_stopwords = bool(cfg.stopwords)
    check_digits = cfg.drop_pure_digits

    def keep(token) -> bool:
        # Too short: low-information noise
        if check_length and len(token) < cfg.min_len:
            return False
        # Common function word
        if check_stopwords and token in cfg.stopwords:
            return False
        # Numeric-only tokens would just inflate the vocabulary
        if check_digits and token.isdigit():
            return False
        return True

    return [token for token in cfg.pattern.findall(source) if keep(token)]

def text_prep(
    docs: Iterable[Tuple[int, str]],
    cfg: PrepConfig = CFG
) -> List[Tuple[int, List[str]]]:
    """Tokenize every labelled document.

    Args:
        docs: (label, text) pairs.
        cfg: Preprocessing settings forwarded to ``tokenize``.

    Returns:
        (label, tokens) pairs in the same order as ``docs``.
    """
    prepared: List[Tuple[int, List[str]]] = []
    for label, text in docs:
        prepared.append((label, tokenize(text, cfg)))
    return prepared


In [9]:
# ==================================================
# 4) tokens → sparse vectors (TF)
# ==================================================
def to_sparse_vectors(
    prepped_docs: Iterable[Tuple[int, List[str]]]
) -> List[Tuple[int, Counter]]:
    """Convert token lists into sparse term-frequency vectors.

    Args:
        prepped_docs: (label, tokens) pairs.

    Returns:
        (label, Counter) pairs in input order, where each Counter maps a
        token to the number of times it occurs in that document.
    """
    vectors: List[Tuple[int, Counter]] = []
    for label, tokens in prepped_docs:
        term_counts = Counter(tokens)
        vectors.append((label, term_counts))
    return vectors

In [10]:
def debug_before_after(prepped_docs):
    """Print each document's tokens, then its term-frequency Counter.

    Debugging aid for comparing the input and output of
    ``to_sparse_vectors``.

    Args:
        prepped_docs: Iterable of (label, tokens) pairs. It is materialized
            into a list first, so a one-shot iterable (e.g. a generator)
            is displayed correctly in both sections.

    Returns:
        None. Output goes to stdout.
    """
    # The data is walked twice (once to print tokens, once to vectorize);
    # without this snapshot a generator would be exhausted by the first
    # loop and the AFTER section would come out empty.
    prepped_docs = list(prepped_docs)

    print("=== BEFORE (tokens) ===")
    for i, (label, tokens) in enumerate(prepped_docs):
        print(f"[doc {i}] label={label}, tokens={tokens}")

    sparse = to_sparse_vectors(prepped_docs)

    print("\n=== AFTER (Counter) ===")
    for i, (label, vec) in enumerate(sparse):
        # dict(vec) drops the "Counter(...)" wrapper for a compact display
        print(f"[doc {i}] label={label}, vec={dict(vec)}")


In [11]:
# Smoke test: two tiny labelled documents with repeated words, so the
# term counts produced by to_sparse_vectors are easy to verify by eye.
test_docs = [
    (1, "NLP search NLP model"),
    (0, "cat dog dog")
]

# Tokenize with the default config (it lowercases, so "NLP" becomes "nlp"),
# then print the token lists next to their Counter form.
prepped = text_prep(test_docs)
debug_before_after(prepped)

=== BEFORE (tokens) ===
[doc 0] label=1, tokens=['nlp', 'search', 'nlp', 'model']
[doc 1] label=0, tokens=['cat', 'dog', 'dog']

=== AFTER (Counter) ===
[doc 0] label=1, vec={'nlp': 2, 'search': 1, 'model': 1}
[doc 1] label=0, vec={'cat': 1, 'dog': 2}


In [12]:
# ==================================================
# 5) Rocchio: centroid (class prototype)
# ==================================================
def make_centroid(vectors: List[Counter]) -> Counter:
    """Average a class's document vectors into one prototype vector.

    Args:
        vectors: Term-frequency vectors of the documents in one class.

    Returns:
        A Counter mapping every term seen in ``vectors`` to its summed
        frequency divided by the number of documents. An empty Counter
        is returned when ``vectors`` is empty.
    """
    # Nothing to average
    if not vectors:
        return Counter()

    # Accumulate per-term totals across all documents of the class
    totals = Counter()
    for doc_vec in vectors:
        totals.update(doc_vec)

    # Scale each total by the class size to obtain the mean
    n_docs = len(vectors)
    centroid = Counter()
    for term, total in totals.items():
        centroid[term] = total / n_docs

    return centroid


def train_rocchio(
    sparse_docs: List[Tuple[int, Counter]]
) -> dict:
    """Build one centroid per class label.

    Args:
        sparse_docs: (label, vector) pairs.

    Returns:
        ``{label: centroid}``, with labels in order of first appearance
        and each centroid computed by ``make_centroid`` over that label's
        vectors.
    """
    # Bucket the vectors: [(label, vec), ...] -> {label: [vec, ...]}
    grouped = {}
    for label, vec in sparse_docs:
        grouped.setdefault(label, []).append(vec)

    # Collapse each bucket into its class prototype
    centroids = {}
    for label, members in grouped.items():
        centroids[label] = make_centroid(members)
    return centroids


In [14]:
def cosine_sim(a: Counter, b: Counter, verbose: bool = False) -> float:
    """Cosine similarity between two sparse vectors.

    Args:
        a: First vector (term -> weight).
        b: Second vector (term -> weight).
        verbose: When True, print the dot product, both norms, both
            vector sizes and the outcome.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` if either vector has
        zero length.
    """

    def l2_norm(vector: Counter) -> float:
        # Euclidean length of a sparse vector
        return math.sqrt(sum(w * w for w in vector.values()))

    # Terms missing from b contribute nothing to the dot product
    dot = sum(a[term] * b.get(term, 0.0) for term in a)
    norm_a = l2_norm(a)
    norm_b = l2_norm(b)

    if verbose:
        print("=== cosine_sim DEBUG ===")
        print(f"dot product   : {dot}")
        print(f"norm_a (||a||): {norm_a}")
        print(f"norm_b (||b||): {norm_b}")
        print(f"a size        : {len(a)}")
        print(f"b size        : {len(b)}")

    # A zero-length vector has no direction; avoid dividing by zero
    if not norm_a or not norm_b:
        if verbose:
            print("→ Zero vector detected, returning 0.0\n")
        return 0.0

    sim = dot / (norm_a * norm_b)
    if verbose:
        print(f"cosine sim    : {sim}\n")
    return sim


In [17]:
# Debugging: sanity-check cosine_sim on a hand-built vector/centroid pair

from collections import Counter

# Hand-built document vector and class centroid: they share "nlp" and "ir",
# while "model" and "data" each appear on one side only
vec = Counter(nlp=2, ir=1, model=1)
centroid = Counter(nlp=1.5, ir=0.5, data=2)

# verbose=True prints the dot product, both norms and the final score
score = cosine_sim(vec, centroid, verbose=True)

=== cosine_sim DEBUG ===
dot product   : 3.5
norm_a (||a||): 2.449489742783178
norm_b (||b||): 2.5495097567963922
a size        : 3
b size        : 3
cosine sim    : 0.5604485383178051



In [18]:
# ==================================================
# 7) predict
# ==================================================
def predict_rocchio(vec: Counter, centroids: dict) -> int:
    """Return the label whose centroid is most similar to ``vec``.

    Args:
        vec: Sparse vector of the document to classify.
        centroids: ``{label: centroid_vector}``.

    Returns:
        The label with the highest cosine similarity to ``vec``. When
        several labels tie for the highest score, the smallest label is
        returned so the result is reproducible.

    Raises:
        ValueError: If ``centroids`` is empty.
    """
    # Score the input against every class prototype
    similarity = {}
    for label, centroid in centroids.items():
        similarity[label] = cosine_sim(vec, centroid)

    # Fail fast: nothing to compare against
    if len(similarity) == 0:
        raise ValueError("empty centroids")

    top_score = max(similarity.values())

    # Every label that reaches the top score (more than one on a tie)
    tied = [label for label, score in similarity.items() if score == top_score]

    # Deterministic tie-break: lowest label wins
    tied.sort()
    return tied[0]

In [19]:
# ==================================================
# 8) RUN
# ==================================================
# End-to-end run of the pipeline on the sample `docs`.

# subset (optional, test)
# Split by label; the two lists are not used again in this cell.
c0_docs, c1_docs = make_subset(docs)

# preprocessing
# Tokenize every document with the default config -> [(label, tokens), ...]
prepped_docs = text_prep(docs)

# vectorize
# Token lists -> term-frequency Counters -> [(label, Counter), ...]
sparse_docs = to_sparse_vectors(prepped_docs)

# train
# One averaged centroid per label -> {label: Counter}
centroids = train_rocchio(sparse_docs)

# Report the number of distinct terms in each class centroid
print("=== Centroids ===")
for label, vec in centroids.items():
    print(f"label={label}, unique_terms={len(vec)}")

# Classify the same documents the centroids were built from.
# This checks the pipeline runs end to end; it is not a held-out evaluation.
print("\n=== Predictions (training data) ===")
for i, (label, vec) in enumerate(sparse_docs):
    pred = predict_rocchio(vec, centroids)
    print(f"doc {i}: true={label}, pred={pred}")

=== Centroids ===
label=0, unique_terms=27
label=1, unique_terms=27

=== Predictions (training data) ===
doc 0: true=0, pred=0
doc 1: true=0, pred=0
doc 2: true=0, pred=0
doc 3: true=0, pred=0
doc 4: true=0, pred=0
doc 5: true=1, pred=1
doc 6: true=1, pred=1
doc 7: true=1, pred=1
doc 8: true=1, pred=1
doc 9: true=1, pred=1
