In [60]:
import numpy as np

In [61]:
# =========================
# 0) Data preparation
# =========================
# Fixed vocabulary; a term's position in this list is its index in every tf vector.
vocab = ["China", "Japan", "Tokyo", "Beijing"]
term2idx = dict(zip(vocab, range(len(vocab))))

# Training documents for each class (space-separated tokens).
class1_texts = [
    "China China Tokyo",
    "China Beijing",
]
class2_texts = [
    "Japan Japan Tokyo",
    "Japan Beijing Beijing",
]

In [62]:
# Helper functions: text -> raw tf vector, and L2 normalization

In [63]:
def text_to_tf(text: str, term2idx: dict, vocab_size: int) -> np.ndarray:
    """
    Count how often each vocabulary term occurs in a document.

    The document is split on whitespace; every token that is a key of
    ``term2idx`` adds 1 to the corresponding slot of the result. Tokens
    that are not in ``term2idx`` are ignored.

    Parameters
    ----------
    text : str
        Document text, tokens separated by whitespace.
    term2idx : dict
        Term -> vector index mapping (e.g., "China" -> 0).
    vocab_size : int
        Length of the returned vector.

    Returns
    -------
    np.ndarray
        Float vector of shape (vocab_size,) holding raw term counts.
    """
    # Start from an all-zero count vector, e.g. [0.0, 0.0, 0.0, 0.0].
    counts = np.zeros(vocab_size, dtype=float)

    # Lazily map each in-vocabulary token to its slot; unknown tokens are skipped.
    known_slots = (term2idx[word] for word in text.split() if word in term2idx)

    for slot in known_slots:
        counts[slot] += 1.0

    return counts


In [64]:
# Sanity check: print the raw tf vector of one document from each class.
print(text_to_tf("China China Tokyo", term2idx, len(vocab)))
print(text_to_tf("Japan Beijing Beijing", term2idx, len(vocab)))

[2. 0. 1. 0.]
[0. 1. 0. 2.]


In [65]:
def l2_normalize(vec: np.ndarray, eps: float = 1e-12, verbose: bool = True) -> np.ndarray:
    """
    Perform L2 normalization on a vector.

    Parameters
    ----------
    vec : np.ndarray
        Input vector (e.g., raw term-frequency vector)
    eps : float
        Small constant (epsilon); a norm below this is treated as zero
        to avoid division by zero
    verbose : bool
        If True (default), print the input vector, its norm and the
        normalized result as a teaching trace. Pass False to silence it.

    Returns
    -------
    np.ndarray
        ``vec / ||vec||`` when the norm is at least ``eps``; otherwise an
        unchanged copy of ``vec``. A new array is returned in both cases.
    """

    # Step 1. Compute the L2 norm (length) of the vector
    # Mathematical definition:
    # ||d|| = sqrt(d1^2 + d2^2 + ... + dn^2)
    norm = np.linalg.norm(vec)

    # Step 2. Safety check BEFORE any division.
    # If the norm is zero (or extremely close to zero), dividing would emit
    # a RuntimeWarning and produce NaNs, so fall back to a copy of the input.
    is_degenerate = norm < eps

    # Step 3. Normalize the vector
    # Mathematical definition:
    # d_hat = d / ||d||
    # This preserves the direction of the vector
    # while scaling its length to 1.
    normalized = vec.copy() if is_degenerate else vec / norm

    # Optional trace; it reuses the already-guarded result so that printing
    # can never trigger the division the guard is there to prevent.
    if verbose:
        print("vec:", vec)
        print("norm:", norm)
        if is_degenerate:
            print("vec / norm: undefined (norm < eps); returning a copy of vec")
        else:
            print("vec / norm:", normalized)

    return normalized

In [66]:
# Demo: build the raw tf vector of one class-1 document, then L2-normalize it.
# l2_normalize prints the vector, its norm and the normalized result.
v = text_to_tf("China China Tokyo", term2idx, len(vocab))
v_hat = l2_normalize(v)

vec: [2. 0. 1. 0.]
norm: 2.23606797749979
vec / norm: [0.89442719 0.         0.4472136  0.        ]


In [67]:
# Compute the class centroid

In [68]:
def centroid_from_texts(texts, term2idx, vocab_size):
    """
    Build the centroid of one class from its documents.

    Each document is turned into a raw tf vector, every tf vector is
    L2-normalized, and the centroid is the element-wise mean of the
    normalized vectors.

    Parameters
    ----------
    texts : iterable of str
        Documents of the class (space-separated tokens)
    term2idx : dict
        Mapping from term to vector index
    vocab_size : int
        Size of the vocabulary

    Returns
    -------
    tuple
        (raw tf vectors as an array, normalized vectors as an array,
        centroid vector)
    """
    # Stage 1: turn every document into its raw tf vector.
    tf_rows = []
    for doc in texts:
        tf_rows.append(text_to_tf(doc, term2idx, vocab_size))

    # Stage 2: L2-normalize each tf vector.
    unit_rows = []
    for row in tf_rows:
        unit_rows.append(l2_normalize(row))

    # Stage 3: the centroid is the average of the normalized vectors (Rocchio).
    centroid = np.mean(unit_rows, axis=0)

    return np.array(tf_rows), np.array(unit_rows), centroid

In [69]:
# Rocchio classification: assign a document to the class with the nearest centroid

In [70]:
def sq_euclid(a: np.ndarray, b: np.ndarray) -> float:
    """
    Squared Euclidean distance between two vectors.

    Computed as the dot product of (a - b) with itself, i.e. the sum of
    squared component differences; no square root is taken.
    """
    delta = a - b
    squared_length = np.dot(delta, delta)
    return float(squared_length)


def rocchio_predict(x_hat: np.ndarray, centroids: dict):
    """
    Pick the class whose centroid is closest to ``x_hat``.

    Parameters
    ----------
    x_hat : np.ndarray
        Document vector to classify
    centroids : dict
        {"ClassName": mu_vector, ...}

    Returns
    -------
    tuple
        (name of the nearest class, dict of squared distances per class).
        On a tie the class that appears first in ``centroids`` wins.
    """
    # Squared distance from the document to every class centroid.
    dists = {}
    for label, mu in centroids.items():
        dists[label] = sq_euclid(x_hat, mu)

    # Nearest centroid decides the label.
    pred = min(dists, key=lambda label: dists[label])
    return pred, dists


In [71]:
# Test document to classify; it is not one of the training documents above.
test_text = "China Tokyo Beijing"

In [72]:
# ------------------------------------------------------------
# 5) Run
# ------------------------------------------------------------
# Vocabulary size = length of every tf vector.
V = len(vocab)

# Per class: raw tf vectors, L2-normalized vectors, and the centroid.
raw1, hat1, mu1 = centroid_from_texts(class1_texts, term2idx, V)
raw2, hat2, mu2 = centroid_from_texts(class2_texts, term2idx, V)

# Vectorize the test document the same way as the training documents.
x_raw = text_to_tf(test_text, term2idx, V)
x_hat = l2_normalize(x_raw)

# Classify: the class with the smallest squared distance to x_hat wins.
centroids = {"Class1_ChinaSide": mu1, "Class2_JapanSide": mu2}
pred, dists = rocchio_predict(x_hat, centroids)

# Pretty-printing helper
def fmt(vec):
    """Render a sequence of numbers as "[a, b, ...]" with three decimals each."""
    cells = [format(value, ".3f") for value in vec]
    return "[{}]".format(", ".join(cells))

# Report: vocabulary, per-class vectors and centroids, the test document,
# its distance to each centroid, and the predicted class.
print("Vocabulary:", vocab)

# One row per class, so both classes go through the same printing code
# instead of two copy-pasted sections.
class_reports = [
    (1, class1_texts, raw1, hat1, mu1),
    (2, class2_texts, raw2, hat2, mu2),
]

# Loop variables deliberately avoid the name `v`, which holds the demo tf
# vector from an earlier cell and would otherwise be overwritten here.
for class_no, class_texts, raw_docs, hat_docs, class_mu in class_reports:
    print(f"\n=== Class {class_no} docs (raw tf) ===")
    for doc_text, raw_vec in zip(class_texts, raw_docs):
        print(f"{doc_text:25s} -> {raw_vec}")

    print(f"\n=== Class {class_no} docs (L2-normalized) ===")
    for doc_text, hat_vec in zip(class_texts, hat_docs):
        print(f"{doc_text:25s} -> {fmt(hat_vec)}")

    print(f"\nmu{class_no} (Class{class_no} centroid):", fmt(class_mu))

print("\n=== Test doc ===")
print("text:", test_text)
print("x_raw:", x_raw)
print("x_hat:", fmt(x_hat))

print("\n=== Squared distances to centroids ===")
for class_name, sq_dist in dists.items():
    print(f"{class_name}: {sq_dist:.6f}")

print("\nPREDICT =", pred)


vec: [2. 0. 1. 0.]
norm: 2.23606797749979
vec / norm: [0.89442719 0.         0.4472136  0.        ]
vec: [1. 0. 0. 1.]
norm: 1.4142135623730951
vec / norm: [0.70710678 0.         0.         0.70710678]
vec: [0. 2. 1. 0.]
norm: 2.23606797749979
vec / norm: [0.         0.89442719 0.4472136  0.        ]
vec: [0. 1. 0. 2.]
norm: 2.23606797749979
vec / norm: [0.         0.4472136  0.         0.89442719]
vec: [1. 0. 1. 1.]
norm: 1.7320508075688772
vec / norm: [0.57735027 0.         0.57735027 0.57735027]
Vocabulary: ['China', 'Japan', 'Tokyo', 'Beijing']

=== Class 1 docs (raw tf) ===
China China Tokyo         -> [2. 0. 1. 0.]
China Beijing             -> [1. 0. 0. 1.]

=== Class 1 docs (L2-normalized) ===
China China Tokyo         -> [0.894, 0.000, 0.447, 0.000]
China Beijing             -> [0.707, 0.000, 0.000, 0.707]

mu1 (Class1 centroid): [0.801, 0.000, 0.224, 0.354]

=== Class 2 docs (raw tf) ===
Japan Japan Tokyo         -> [0. 2. 1. 0.]
Japan Beijing Beijing     -> [0. 1. 0. 2.]

===