In [47]:
import re
import numpy as np
from collections import Counter

# Set to True to make text_tokenization print the raw input and the resulting tokens.
TOKEN_DEBUG = False
# NOTE(review): not referenced in the code visible here — presumably intended for
# vectorization debugging; confirm it is used elsewhere or remove it.
VEC_DEBUG = False

TF-IDF + kNN (Cosine) Pipeline

raw documents (label, text)
   ↓
tokenization
   - lowercase
   - regex cleanup
   - split into tokens (optionally drop very short tokens)
   ↓
build_vocab
   - collect unique terms from all tokenized documents
   - create term2idx mapping (term → index)
   ↓
compute_df → compute_idf
   - compute_df: count document frequency (df) using unique terms per document
   - compute_idf: compute smoothed IDF from df: idf = log((N+1)/(df+1)) + 1
   ↓
vectorize_dense_tfidf
   - compute TF per document (raw / log / norm)
   - multiply TF * IDF to form a dense TF-IDF vector
   - optional L2 normalization
   ↓
make_Xy_dense_tfidf
   - stack document vectors into X (num_docs × vocab_size)
   - align labels into y
   ↓
(train/test split OR leave-one-out evaluation)
   - X_train, y_train / X_test, y_test
   ↓
kNN (cosine similarity)
   - compute cosine similarity between test vector and all train vectors
   - select top-k neighbors
   - majority vote to predict label
   ↓
evaluation
   - accuracy (standard)
   - top-k hit rate (optional, label appears among k neighbors)


In [48]:
#sample data

In [49]:
# Toy corpus: a list of (label, text) pairs with five "positive" and five
# "negative" sentences.
docs = [
    ("positive", "The service received excellent reviews from users."),
    ("negative", "The product launch was delayed due to technical issues."),
    ("positive", "Employee satisfaction improved after the policy change."),
    ("negative", "Several employees resigned amid internal conflicts."),
    ("positive", "Customers praised the new update for its improved performance."),
    ("negative", "The system outage caused frustration among users."),
    ("positive", "The successful launch boosted investor confidence."),
    ("negative", "Customer complaints about the service have increased recently."),
    ("positive", "The company announced record-breaking profits this quarter."),
    ("negative", "The company reported a significant drop in quarterly revenue.")
]


In [50]:
# 1. Prepare Documents
# - Collect raw text documents and their labels.

In [51]:
# Split + build docs_all and y
# Partition the sentences by label, then rebuild the corpus with every
# positive sentence first followed by every negative sentence.
pos_docs, neg_docs = [], []
for label, sentence in docs:
    # Any label other than "positive" lands in the negative bucket.
    (pos_docs if label == "positive" else neg_docs).append(sentence)

docs_all = [*pos_docs, *neg_docs]

# Labels follow the same ordering as docs_all: 1 for each positive
# sentence, then 0 for each negative sentence.
y = np.array([1] * len(pos_docs) + [0] * len(neg_docs))


In [52]:
# 2. Text Tokenization
# - Convert each document into a list of tokens.
# - Typical steps:
#   - Lowercasing
#   - Regex-based token extraction
#   - Removing very short or noisy tokens

In [53]:
def text_tokenization(s):
    """
    Turn a raw string into a list of lowercase alphanumeric tokens.

    Every character outside a-z / 0-9 (after lowercasing) acts as a
    separator, and tokens shorter than two characters are discarded.
    When TOKEN_DEBUG is truthy, the raw input and the final tokens are printed.
    """
    if TOKEN_DEBUG:
        print("[TOKEN_DEBUG] raw:", repr(s))

    # Lowercase first so the character class below only needs a-z.
    lowered = s.lower()

    # Replace every non-alphanumeric character with a space; split() with no
    # argument already ignores leading/trailing/repeated whitespace.
    pieces = re.sub(r"[^a-z0-9]", " ", lowered).split()

    # Keep only tokens of length two or more.
    tokens = [piece for piece in pieces if len(piece) >= 2]

    if TOKEN_DEBUG:
        print("[TOKEN_DEBUG] tokens:", tokens)

    return tokens

In [54]:
# 3. Vocabulary Construction
# - Build a vocabulary from all tokenized documents (baseline version).
# - Note: Document-frequency (df) filtering is NOT applied here.
#   We will compute df/idf next and can filter very rare terms later if needed.

In [81]:
def build_vocab(docs_all):
    """
    Build the vocabulary from every document in docs_all.

    Returns (vocab, term2idx): vocab is the alphabetically sorted list of
    unique tokens, and term2idx maps each token to its position in vocab.
    """
    # Flatten all token lists into one set so each term appears once.
    unique_terms = {
        token
        for sentence in docs_all
        for token in text_tokenization(sentence)
    }

    # Sorting gives a deterministic term -> column assignment.
    vocab = sorted(unique_terms)
    term2idx = dict(zip(vocab, range(len(vocab))))

    return vocab, term2idx

In [82]:
# Build the sorted vocabulary and the term -> column-index mapping from all documents.
vocab, term2idx = build_vocab(docs_all)

In [103]:
# Vocabulary size, followed by the first ten (term, index) pairs.
print(len(term2idx))
print(list(term2idx.items())[:10])  # sanity check: terms are in sorted order

62
[('about', 0), ('after', 1), ('amid', 2), ('among', 3), ('announced', 4), ('boosted', 5), ('breaking', 6), ('caused', 7), ('change', 8), ('company', 9)]


In [84]:
# 4. DF / IDF / TF-IDF Computation
# - Compute document frequency (df) and inverse document frequency (idf).
# - Optionally apply min_df filtering at this stage (e.g., remove terms with df < 2).

In [105]:
from collections import Counter
import numpy as np

def compute_df(docs_all, term2idx):
    """
    Count, for every vocabulary term, how many documents contain it.

    Returns an integer array where entry i is the number of documents in
    docs_all whose token set includes the term mapped to index i.
    """
    doc_counts = np.zeros(len(term2idx), dtype=int)

    for sentence in docs_all:
        # A set ensures each term contributes at most one count per document,
        # regardless of how often it repeats inside that document.
        for term in set(text_tokenization(sentence)):
            position = term2idx.get(term)
            if position is None:
                # Term is not part of the vocabulary; ignore it.
                continue
            doc_counts[position] += 1

    return doc_counts


In [106]:
# Document frequency per vocabulary term, indexed like term2idx.
df = compute_df(docs_all, term2idx)

In [107]:
def compute_idf(df, N):
    """
    Compute smoothed inverse document frequency (idf).

    idf = log((N + 1) / (df + 1)) + 1

    Parameters:
        df: array-like of document frequencies (NumPy array, list, tuple or
            scalar). It is coerced with np.asarray so plain Python sequences
            are accepted as well as arrays.
        N:  total number of documents.

    Returns:
        NumPy float array (or NumPy scalar for scalar input) with the same
        shape as df.
    """
    # Coerce first: `list + 1` would raise TypeError, an ndarray broadcasts.
    df = np.asarray(df)
    return np.log((N + 1) / (df + 1)) + 1.0

In [108]:
# Smoothed IDF per vocabulary term; N is the number of documents in the corpus.
idf = compute_idf(df, len(docs_all))

In [109]:
def compute_tf(tokens, tf_mode="raw"):
    """
    Compute term frequency values for a single document.

    Parameters:
        tokens:  list of tokens for one document.
        tf_mode: weighting scheme:
                   "raw"  -> occurrence count
                   "log"  -> 1 + ln(count)
                   "norm" -> count / number of tokens in the document

    Returns:
        dict mapping each distinct token to its TF value as a Python float.
        An empty token list yields an empty dict.

    Raises:
        ValueError: if tf_mode is not one of the supported schemes. The mode
        is validated up front so a bad value is reported even when tokens is
        empty.
    """
    if tf_mode not in ("raw", "log", "norm"):
        raise ValueError("tf_mode must be one of: 'raw', 'log', 'norm'")

    counts = Counter(tokens)

    if tf_mode == "raw":
        return {term: float(freq) for term, freq in counts.items()}

    if tf_mode == "log":
        # float() keeps the values plain Python floats rather than NumPy scalars.
        return {term: 1.0 + float(np.log(freq)) for term, freq in counts.items()}

    # "norm": max(..., 1) guards the denominator for an empty document.
    total = max(len(tokens), 1)
    return {term: float(freq) / total for term, freq in counts.items()}

In [111]:
# Demonstrate the three TF schemes on the same toy token list
# ("a" appears twice, "b" and "c" once each).
print(compute_tf(["a", "b", "a", "c"], tf_mode="raw"))
print(compute_tf(["a", "b", "a", "c"], tf_mode="log"))
print(compute_tf(["a", "b", "a", "c"], tf_mode="norm"))

{'a': 2.0, 'b': 1.0, 'c': 1.0}
{'a': 1.6931471805599454, 'b': 1.0, 'c': 1.0}
{'a': 0.5, 'b': 0.25, 'c': 0.25}


In [110]:
# L2-normalize the TF-IDF vector to remove document length effects

In [90]:
def vectorize_dense_tfidf(sentence, term2idx, idf, *, tf_mode="raw", l2_normalize=True):
    """
    Turn one document into a dense TF-IDF vector of length len(term2idx).

    The sentence is tokenized, per-term TF values are computed with the
    given tf_mode, and each in-vocabulary term's slot is set to TF * IDF.
    Terms missing from term2idx are skipped. With l2_normalize=True the
    vector is divided by its Euclidean norm unless that norm is zero.
    """
    assert len(idf) == len(term2idx), "idf must align with vocab"

    term_weights = compute_tf(text_tokenization(sentence), tf_mode=tf_mode)
    dense = np.zeros(len(term2idx), dtype=float)

    for term, weight in term_weights.items():
        column = term2idx.get(term)
        if column is None:
            # Out-of-vocabulary term: contributes nothing to the vector.
            continue
        dense[column] = weight * idf[column]

    if not l2_normalize:
        return dense

    # Scale to unit length; an all-zero vector is returned unchanged.
    length = np.linalg.norm(dense)
    return dense / length if length > 0 else dense

In [112]:
# Example: vectorize a single document
vec = vectorize_dense_tfidf(
    docs_all[0],      # first document in the corpus
    term2idx,
    idf,
    tf_mode="raw",
    l2_normalize=True
)

print(vec.shape)
# The first ten columns belong to the alphabetically-first vocabulary terms;
# they are all zero here because docs_all[0] contains none of those terms.
print(vec[:10])  # first 10 values


(62,)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [113]:
def make_Xy_dense_tfidf(docs_all, y, term2idx, idf, *, tf_mode="raw", l2_normalize=True):
    """
    Vectorize every document and return (X, y) as NumPy arrays.

    X stacks one TF-IDF row per document, giving shape
    (num_documents, vocab_size); y is the label sequence converted with
    np.array so it lines up row-for-row with X.
    """
    rows = []
    for sentence in docs_all:
        # One dense TF-IDF vector per document, in corpus order.
        rows.append(
            vectorize_dense_tfidf(
                sentence,
                term2idx,
                idf,
                tf_mode=tf_mode,
                l2_normalize=l2_normalize,
            )
        )

    features = np.vstack(rows)
    labels = np.array(y)
    return features, labels

In [114]:
# --- Execute TF-IDF matrix construction ---

# Note: y is rebound to the array returned by make_Xy_dense_tfidf
# (np.array(y)), so re-running this cell leaves the labels unchanged.
X, y = make_Xy_dense_tfidf(
    docs_all,      # list of documents
    y,             # label array
    term2idx,      # vocabulary mapping
    idf,           # IDF vector
    tf_mode="raw", # TF scheme: "raw", "log", or "norm"
    l2_normalize=True
)

# Sanity check: X should be (num_documents, vocab_size), y should be (num_documents,)
print("X shape:", X.shape)
print("y shape:", y.shape)
print("First row of X (first 10 values):", X[0][:10])

X shape: (10, 62)
y shape: (10,)
First row of X (first 10 values): [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [115]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    Cosine of the angle between vec1 and vec2, as a Python float.

    When the product of the two L2 norms is zero (i.e. at least one
    vector has no length) the similarity is defined as 0.0.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against division by zero for zero-length vectors.
    return float(np.dot(vec1, vec2) / norm_product) if norm_product != 0 else 0.0

In [116]:
def get_top_k_neighbors_cosine(X_train, y_train, x_test, k=3):
    """
    Find the k training rows most similar to x_test under cosine similarity.

    Returns a list of (similarity, label, index) tuples ordered from most
    to least similar; index is the row position inside X_train.
    """
    # Score every training row against the query vector.
    scored = [
        (cosine_similarity(x_test, X_train[row]), y_train[row], row)
        for row in range(len(X_train))
    ]

    # Stable descending sort on similarity only, so equally similar rows
    # keep their original training order.
    ranked = sorted(scored, key=lambda entry: entry[0], reverse=True)
    return ranked[:k]


In [117]:
from collections import Counter

def majority_vote(neighbors):
    """
    Return the label that occurs most often among the neighbors.

    neighbors: list of (sim, label, index) tuples.
    On a tie, the label that was encountered first in the list wins.
    """
    tally = Counter()
    for _sim, label, _index in neighbors:
        tally[label] += 1

    # most_common(1) yields a single (label, count) pair for the top label.
    winner, _count = tally.most_common(1)[0]
    return winner


In [118]:
def knn_predict_cosine(X_train, y_train, x_test, k=3):
    """
    Classify a single test vector with cosine-similarity kNN.

    Returns (pred_label, neighbors): the majority-vote label and the list
    of the k nearest neighbors it was derived from.
    """
    # Retrieve the k closest training rows, then let them vote on the label.
    nearest = get_top_k_neighbors_cosine(X_train, y_train, x_test, k=k)
    return majority_vote(nearest), nearest

In [119]:
# Perform leave-one-out kNN predictions for all samples

In [120]:
def knn_predict_all_loo(X, y, k=3):
    """
    Predict every sample's label with leave-one-out cosine kNN.

    Each row of X is held out in turn and classified using all remaining
    rows as the training set. Returns an array of predictions with the
    same dtype as y.
    """
    num_samples = X.shape[0]
    predictions = np.zeros(num_samples, dtype=y.dtype)

    for held_out in range(num_samples):
        # Training set = everything except the held-out row and its label.
        rest_X = np.delete(X, held_out, axis=0)
        rest_y = np.delete(y, held_out, axis=0)

        label, _neighbors = knn_predict_cosine(rest_X, rest_y, X[held_out], k=k)
        predictions[held_out] = label

    return predictions

In [121]:
# 6) Evaluation: accuracy / top-k hit rate

In [122]:
def accuracy(y_true, y_pred):
    """
    Fraction of positions where the prediction equals the true label.
    """
    # Convert both inputs to arrays so the comparison is element-wise.
    truth, guess = np.array(y_true), np.array(y_pred)
    matches = truth == guess
    # The mean of the boolean match mask is the proportion of correct predictions.
    return matches.mean()


In [123]:
# Compute LOO top-k hit rate: true label appears among k nearest neighbors

In [124]:
def top_k_hit_rate_knn_loo(X, y, k=3):
    """
    Leave-one-out top-k hit rate.

    A sample counts as a hit when its true label shows up among the labels
    of its k nearest neighbors (found with the sample itself held out).
    This differs from standard top-k classification accuracy, which ranks
    class probabilities.
    """
    total = len(X)
    hits = 0

    for held_out in range(total):
        # Training set = everything except the held-out row and its label.
        rest_X = np.delete(X, held_out, axis=0)
        rest_y = np.delete(y, held_out, axis=0)

        _pred, neighbors = knn_predict_cosine(rest_X, rest_y, X[held_out], k=k)
        labels_seen = [label for _sim, label, _index in neighbors]

        # True/False adds 1/0 to the running hit count.
        hits += int(y[held_out] in labels_seen)

    return hits / total


In [125]:
# --- Build TF-IDF feature matrix ---
# Rebuild X and y with the same settings used in the earlier matrix-construction
# cell, so this cell can run on its own once the helper functions are defined.
X, y = make_Xy_dense_tfidf(
    docs_all,
    y,
    term2idx,
    idf,
    tf_mode="raw",
    l2_normalize=True
)

# --- Leave-One-Out kNN predictions ---
k = 3  # number of neighbors used for both metrics below
y_pred = knn_predict_all_loo(X, y, k=k)

# --- Evaluation ---
# acc: fraction of held-out samples whose majority-vote prediction is correct.
# topk_hit: fraction whose true label appears among the k neighbor labels.
acc = accuracy(y, y_pred)
topk_hit = top_k_hit_rate_knn_loo(X, y, k=k)

print(f"LOO Accuracy (k={k}): {acc:.4f}")
print(f"LOO Top-{k} Hit Rate: {topk_hit:.4f}")


LOO Accuracy (k=3): 0.4000
LOO Top-3 Hit Rate: 0.5000


In [126]:
# Dump the full term -> index mapping and its size for inspection.
print(term2idx)
print(len(term2idx))

{'about': 0, 'after': 1, 'amid': 2, 'among': 3, 'announced': 4, 'boosted': 5, 'breaking': 6, 'caused': 7, 'change': 8, 'company': 9, 'complaints': 10, 'confidence': 11, 'conflicts': 12, 'customer': 13, 'customers': 14, 'delayed': 15, 'drop': 16, 'due': 17, 'employee': 18, 'employees': 19, 'excellent': 20, 'for': 21, 'from': 22, 'frustration': 23, 'have': 24, 'improved': 25, 'in': 26, 'increased': 27, 'internal': 28, 'investor': 29, 'issues': 30, 'its': 31, 'launch': 32, 'new': 33, 'outage': 34, 'performance': 35, 'policy': 36, 'praised': 37, 'product': 38, 'profits': 39, 'quarter': 40, 'quarterly': 41, 'received': 42, 'recently': 43, 'record': 44, 'reported': 45, 'resigned': 46, 'revenue': 47, 'reviews': 48, 'satisfaction': 49, 'service': 50, 'several': 51, 'significant': 52, 'successful': 53, 'system': 54, 'technical': 55, 'the': 56, 'this': 57, 'to': 58, 'update': 59, 'users': 60, 'was': 61}
62
