# KNN Text Classification with TF-IDF (Dense Representation)

## 1. Dataset
#- Toy dataset with positive / negative labels

## 2. Text Preprocessing
#- Tokenization

## 3. Vocabulary Construction
#- Build vocab and term2idx

## 4. Posting List & TF
#- Term-centric posting list
#- Term frequency (TF)

## 5. DF & IDF
#- Document frequency
#- Inverse document frequency

## 6. TF-IDF Construction
#- Sparse TF-IDF (doc_id → {term: value})
#- Dense TF-IDF matrix

## 7. Cosine Similarity
#- Vector-based similarity measure

## 8. KNN Classification
#- Majority vote based on top-k neighbors

## 9. Leave-One-Out (LOO) Evaluation
#- Evaluation on small dataset

In [121]:
import math
import re
import numpy as np
from collections import defaultdict, Counter

TOKEN_DEBUG = False
VEC_DEBUG = False

In [122]:
# Toy corpus: each entry is a (label, sentence) pair, with label "pos" or "neg".
# Every noun (service / product / quality) appears once in each class.
docs = [
    ("pos", "good service"),
    ("pos", "great product"),
    ("pos", "excellent quality"),
    ("neg", "bad service"),
    ("neg", "poor product"),
    ("neg", "terrible quality"),
]

In [123]:
# ------------------------------------------------------------
# Step 1. Prepare Documents (Class-wise separation)
# ------------------------------------------------------------

# Route each sentence into the list for its label; entries whose label is
# neither "pos" nor "neg" are skipped.
pos_docs, neg_docs = [], []
_bucket_for = {"pos": pos_docs, "neg": neg_docs}

for label, sentence in docs:
    target = _bucket_for.get(label)
    if target is not None:
        target.append(sentence)

# Positives come first, then negatives; labels are encoded 1 (pos) / 0 (neg)
# in that same order so that doc_labels[i] belongs to docs_all[i].
docs_all = [*pos_docs, *neg_docs]
doc_labels = np.array([1] * len(pos_docs) + [0] * len(neg_docs))

In [124]:
[1] * 2

[1, 1]

In [125]:
print("Number of documents:", len(docs_all))
print(docs_all)

Number of documents: 6
['good service', 'great product', 'excellent quality', 'bad service', 'poor product', 'terrible quality']


In [126]:
# 2. Text Tokenization (Doc)

In [127]:
def text_tokenization(s):
    """Split a string into tokens made of ASCII letters and digits.

    Every other character is treated as a separator. Case is preserved.

    Args:
        s: the text to tokenize.

    Returns:
        A list of tokens in order of appearance (empty if there are none).
    """
    # Turn every separator into a space; split() without arguments then
    # ignores leading, trailing and repeated whitespace.
    spaced = re.sub(r"[^A-Za-z0-9]", " ", s)
    return spaced.split()

In [128]:
#3. Vocabulary construction

In [129]:
def build_vocab(docs_all):
    """Build the sorted vocabulary and the term -> column-index mapping.

    Args:
        docs_all: iterable of sentences (strings).

    Returns:
        (vocab, term2idx): vocab is the sorted list of unique tokens produced
        by text_tokenization over all sentences; term2idx maps each token to
        its position in vocab.
    """
    unique_tokens = {
        token
        for sentence in docs_all
        for token in text_tokenization(sentence)
    }

    # Sorting makes the feature order deterministic across runs.
    vocab = sorted(unique_tokens)

    # Position in the sorted vocabulary doubles as the feature index.
    term2idx = dict(zip(vocab, range(len(vocab))))

    return vocab, term2idx

In [143]:
vocab, term2idx = build_vocab(docs_all)

In [154]:
print("vocab order:", vocab)

vocab order: ['bad', 'excellent', 'good', 'great', 'poor', 'product', 'quality', 'service', 'terrible']


# 4. Create a posting list: we iterate over the term–frequency pairs of each document to populate the posting list, which reorganizes the corpus from a document-centric view into a term-centric view.


In [144]:
def build_posting_list_tf_debug(docs_all, debug=True):
    """Build a term-centric posting list holding per-document term frequencies.

    Args:
        docs_all: sequence of sentences; a sentence's position is its doc_id.
        debug: when true, print each document's tokens and TF counts, followed
            by a summary of the finished posting list.

    Returns:
        A dict mapping term -> {doc_id: tf}.
    """
    posting = {}  # term -> {doc_id: tf}

    for doc_id, sentence in enumerate(docs_all):
        # Same tokenizer as for the vocabulary, but here it is applied per
        # document so that term frequencies can be counted.
        tokens = text_tokenization(sentence)
        tf_counter = Counter(tokens)

        if debug:
            print(f"\n[DOC {doc_id}] {sentence}")
            print("  Tokens:", tokens)
            print("  TF Counter:", tf_counter)

        # Flip the view: record this document under every term it contains.
        for term, tf in tf_counter.items():
            posting.setdefault(term, {})[doc_id] = tf

    if debug:
        print("\n=== POSTING LIST SUMMARY ===")
        for term in sorted(posting):
            print(f"{term} -> {posting[term]}")

    return posting


In [145]:
posting = build_posting_list_tf_debug(docs_all, debug=True)


[DOC 0] good service
  Tokens: ['good', 'service']
  TF Counter: Counter({'good': 1, 'service': 1})

[DOC 1] great product
  Tokens: ['great', 'product']
  TF Counter: Counter({'great': 1, 'product': 1})

[DOC 2] excellent quality
  Tokens: ['excellent', 'quality']
  TF Counter: Counter({'excellent': 1, 'quality': 1})

[DOC 3] bad service
  Tokens: ['bad', 'service']
  TF Counter: Counter({'bad': 1, 'service': 1})

[DOC 4] poor product
  Tokens: ['poor', 'product']
  TF Counter: Counter({'poor': 1, 'product': 1})

[DOC 5] terrible quality
  Tokens: ['terrible', 'quality']
  TF Counter: Counter({'terrible': 1, 'quality': 1})

=== POSTING LIST SUMMARY ===
bad -> {3: 1}
excellent -> {2: 1}
good -> {0: 1}
great -> {1: 1}
poor -> {4: 1}
product -> {1: 1, 4: 1}
quality -> {2: 1, 5: 1}
service -> {0: 1, 3: 1}
terrible -> {5: 1}


In [146]:
# DF (Document Frequency)
# In how many documents does this term appear?

In [147]:
def compute_df(posting):
    """Compute the document frequency of every term.

    Args:
        posting: mapping term -> {doc_id: tf}.

    Returns:
        A dict mapping term -> number of documents listed for that term.
    """
    df = {}
    for term in posting:
        # One posting entry per document, so the entry count is the DF.
        df[term] = len(posting[term])
    return df

In [148]:
df = compute_df(posting)
print(df)

{'good': 1, 'service': 2, 'great': 1, 'product': 2, 'excellent': 1, 'quality': 2, 'bad': 1, 'poor': 1, 'terrible': 1}


In [149]:
# IDF (Inverse Document Frequency)
# How common or rare is this term across the corpus?

In [158]:
def compute_idf(df, N):
    """Compute the unsmoothed inverse document frequency log(N / df) per term.

    Args:
        df: mapping term -> document frequency.
        N: total number of documents in the corpus.

    Returns:
        A dict mapping term -> natural log of N / df(term). A term that
        appears in every document gets 0.0.

    Raises:
        ZeroDivisionError: if any document frequency is 0.
    """
    return {term: math.log(N / df_t) for term, df_t in df.items()}

N = len(docs_all)
idf = compute_idf(df, N)
print("IDF:", idf)

IDF: {'good': 1.791759469228055, 'service': 1.0986122886681098, 'great': 1.791759469228055, 'product': 1.0986122886681098, 'excellent': 1.791759469228055, 'quality': 1.0986122886681098, 'bad': 1.791759469228055, 'poor': 1.791759469228055, 'terrible': 1.791759469228055}


In [160]:
# Sparse TF-IDF (doc_id -> {term: value}); it is converted to a dense matrix further below

In [161]:
from collections import defaultdict
import numpy as np

def compute_tfidf(posting, idf):
    """Build a sparse, document-centric TF-IDF representation.

    Args:
        posting: mapping term -> {doc_id: tf} (term-centric).
        idf: mapping term -> inverse document frequency.

    Returns:
        A plain dict mapping doc_id -> {term: tf * idf[term]}.

    Raises:
        KeyError: if a term with at least one posting is missing from idf.
    """
    by_doc = {}

    # Walk the term-centric structure and regroup the weights per document.
    for term, doc_dict in posting.items():
        for doc_id, tf in doc_dict.items():
            weights = by_doc.setdefault(doc_id, {})
            weights[term] = tf * idf[term]

    return by_doc

# Build sparse TF-IDF representation
tfidf = compute_tfidf(posting, idf)

In [162]:
#---Debugging code--
# Dump the posting list: one header per term, then every (doc_id, tf) entry
# recorded under that term.
for term in posting:
    doc_dict = posting[term]
    print("\nTERM:", term)
    print("  doc_dict:", doc_dict)
    for doc_id in doc_dict:
        tf = doc_dict[doc_id]
        print(f"    doc_id={doc_id}, tf={tf}")



TERM: good
  doc_dict: {0: 1}
    doc_id=0, tf=1

TERM: service
  doc_dict: {0: 1, 3: 1}
    doc_id=0, tf=1
    doc_id=3, tf=1

TERM: great
  doc_dict: {1: 1}
    doc_id=1, tf=1

TERM: product
  doc_dict: {1: 1, 4: 1}
    doc_id=1, tf=1
    doc_id=4, tf=1

TERM: excellent
  doc_dict: {2: 1}
    doc_id=2, tf=1

TERM: quality
  doc_dict: {2: 1, 5: 1}
    doc_id=2, tf=1
    doc_id=5, tf=1

TERM: bad
  doc_dict: {3: 1}
    doc_id=3, tf=1

TERM: poor
  doc_dict: {4: 1}
    doc_id=4, tf=1

TERM: terrible
  doc_dict: {5: 1}
    doc_id=5, tf=1


In [None]:
# Function that converts the per-document TF-IDF (dictionary form) into vectors of equal length for all documents (a matrix)

In [163]:
def tfidf_to_dense(tfidf, term2idx, N):
    """Expand the sparse TF-IDF dict into a dense document-term matrix.

    Args:
        tfidf: mapping doc_id -> {term: tf-idf value}; doc_id is used as the
            row index.
        term2idx: mapping term -> column index.
        N: number of rows (documents) in the output matrix.

    Returns:
        A float array of shape (N, len(term2idx)); cells with no TF-IDF entry
        stay 0.

    Raises:
        KeyError: if a term in tfidf is missing from term2idx.
    """
    n_terms = len(term2idx)
    X = np.zeros((N, n_terms))

    for doc_id in tfidf:
        # Only the terms present in this document are visited.
        for term, val in tfidf[doc_id].items():
            col = term2idx[term]
            X[doc_id, col] = val

    return X


# Convert sparse TF-IDF representation to dense matrix
X_tfidf = tfidf_to_dense(tfidf, term2idx, N)

# Inspect the shape and contents of the dense TF-IDF matrix
print("X_tfidf shape:", X_tfidf.shape)
print(X_tfidf)


X_tfidf shape: (6, 9)
[[0.         0.         1.79175947 0.         0.         0.
  0.         1.09861229 0.        ]
 [0.         0.         0.         1.79175947 0.         1.09861229
  0.         0.         0.        ]
 [0.         1.79175947 0.         0.         0.         0.
  1.09861229 0.         0.        ]
 [1.79175947 0.         0.         0.         0.         0.
  0.         1.09861229 0.        ]
 [0.         0.         0.         0.         1.79175947 1.09861229
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  1.09861229 0.         1.79175947]]


In [165]:
# Cosine similarity (dense)

In [166]:
def cosine_sim(a, b, eps=1e-12):
    """Return the cosine similarity of two vectors as a Python float.

    Args:
        a, b: 1-D vectors of equal length.
        eps: small constant added to the denominator so that a zero vector
            yields 0.0 instead of a division by zero.

    Note:
        eps is always added, so the result is marginally smaller in magnitude
        than the exact cosine (parallel vectors give slightly less than 1.0).
    """
    dot = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return float(dot / (norm_product + eps))

In [167]:
def cosine_sim_debug(a, b, eps=1e-12):
    """Compute cosine similarity while printing every intermediate quantity.

    Args:
        a, b: 1-D vectors of equal length.
        eps: small constant added to the denominator to avoid dividing by zero.

    Returns:
        The cosine similarity as a Python float (the last value printed).
    """
    dot = np.dot(a, b)
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    denom = (norm_a * norm_b) + eps
    cos = dot / denom

    # Print the trace in the order the quantities enter the formula.
    trace = (
        ("Vector a:", a),
        ("Vector b:", b),
        ("dot(a, b):", dot),
        ("||a||:", norm_a),
        ("||b||:", norm_b),
        ("denominator:", denom),
        ("cosine similarity:", cos),
    )
    for caption, value in trace:
        print(caption, value)

    return float(cos)

In [168]:
a = np.array([1.0, 1.0, 0.0])
b = np.array([2.0, 2.0, 0.0])

cosine_sim_debug(a, b)


Vector a: [1. 1. 0.]
Vector b: [2. 2. 0.]
dot(a, b): 4.0
||a||: 1.4142135623730951
||b||: 2.8284271247461903
denominator: 4.000000000001001
cosine similarity: 0.9999999999997498


0.9999999999997498

In [169]:
cosine_sim_debug(X_tfidf[0], X_tfidf[3])

Vector a: [0.         0.         1.79175947 0.         0.         0.
 0.         1.09861229 0.        ]
Vector b: [1.79175947 0.         0.         0.         0.         0.
 0.         1.09861229 0.        ]
dot(a, b): 1.206948960812582
||a||: 2.1017494989605643
||b||: 2.1017494989605643
denominator: 4.417350956381983
cosine similarity: 0.27322913047441666


0.27322913047441666

In [None]:
# KNN: predict one doc (dense)

In [171]:
def knn_predict_one(X_train, y_train, x_test, k=3):
    """Predict the label of one vector by cosine-similarity k-NN majority vote.

    Args:
        X_train: 2-D array-like, one row per training document.
        y_train: 1-D array-like of non-negative integer labels, aligned with
            the rows of X_train.
        x_test: 1-D array-like feature vector of the document to classify.
        k: number of neighbours that vote; must be >= 1. If k exceeds the
            number of training rows, all rows vote.

    Returns:
        (pred, topk_idx, topk_sims): the predicted label as an int, the
        indices (into X_train) of the selected neighbours ordered by
        decreasing similarity, and their similarities.

    Raises:
        ValueError: if k < 1. Without this check k=0 would always predict 0
            and a negative k would silently slice off the last |k| neighbours.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Accept plain lists as well as arrays.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    # Cosine similarity against every training row; the small constant keeps
    # zero vectors from causing a division by zero.
    dots = X_train @ x_test
    denom = np.linalg.norm(X_train, axis=1) * np.linalg.norm(x_test) + 1e-12
    sims = dots / denom

    # A stable sort makes ties deterministic: among equally similar rows the
    # one with the lower index ranks first.
    topk_idx = np.argsort(-sims, kind="stable")[:k]
    topk_labels = y_train[topk_idx]

    # Majority vote; argmax returns the first maximum, so a tie goes to the
    # smaller label (0 in the binary case).
    counts = np.bincount(topk_labels.astype(int), minlength=2)
    pred = int(np.argmax(counts))

    return pred, topk_idx, sims[topk_idx]


In [172]:
def knn_predict_one_debug(X_train, y_train, x_test, k=3):
    """Cosine-similarity k-NN prediction that prints every intermediate step.

    Args:
        X_train: 2-D array-like, one row per training document.
        y_train: 1-D array-like of non-negative integer labels, aligned with
            the rows of X_train.
        x_test: 1-D array-like feature vector of the document to classify.
        k: number of neighbours that vote; must be >= 1. If k exceeds the
            number of training rows, all rows vote.

    Returns:
        The predicted label as an int.

    Raises:
        ValueError: if k < 1. Without this check k=0 would always predict 0
            and a negative k would silently slice off the last |k| neighbours.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Accept plain lists as well as arrays.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    # Cosine similarity against every training row; the small constant keeps
    # zero vectors from causing a division by zero.
    dots = X_train @ x_test
    denom = np.linalg.norm(X_train, axis=1) * np.linalg.norm(x_test) + 1e-12
    sims = dots / denom

    print("Cosine similarities:", sims)

    # A stable sort makes ties deterministic: among equally similar rows the
    # one with the lower index ranks first.
    topk_idx = np.argsort(-sims, kind="stable")[:k]
    print("Top-k indices:", topk_idx)

    topk_labels = y_train[topk_idx]
    print("Top-k labels:", topk_labels)

    # Majority vote; argmax returns the first maximum, so a tie goes to the
    # smaller label (0 in the binary case).
    counts = np.bincount(topk_labels.astype(int), minlength=2)
    print("Label counts [neg, pos]:", counts)

    pred = int(np.argmax(counts))
    print("Prediction:", pred)

    return pred

In [173]:
knn_predict_one_debug(
    X_train=X_tfidf[1:], 
    y_train=doc_labels[1:], 
    x_test=X_tfidf[0], 
    k=3
)

Cosine similarities: [0.         0.         0.27322913 0.         0.        ]
Top-k indices: [2 0 1]
Top-k labels: [0 1 1]
Label counts [neg, pos]: [1 2]
Prediction: 1


1

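The three highest-ranked training documents were selected and two of them are labeled positive, so the majority vote predicts positive. Note, however, that only one of these neighbors (the negative document "bad service", similarity ≈ 0.273) actually shares a term with the test document; the two positive neighbors have similarity 0 and enter the top 3 only through tie-breaking among the zero-similarity documents.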

In [None]:
# Leave-One-Out evaluation

In [175]:
def loo_knn_eval(X, y, k=3):
    """Evaluate the k-NN classifier with leave-one-out cross-validation.

    Each row of X is held out once and classified by knn_predict_one using
    all remaining rows as training data.

    Args:
        X: 2-D array, one row per document.
        y: 1-D array of labels aligned with the rows of X.
        k: number of neighbours passed on to knn_predict_one.

    Returns:
        (y_pred, acc): the int array of held-out predictions and the fraction
        of them that equal y, as a float.
    """
    n_docs = X.shape[0]
    held_out_preds = []

    for held_out in range(n_docs):
        # Training set = everything except the held-out row.
        X_rest = np.delete(X, held_out, axis=0)
        y_rest = np.delete(y, held_out, axis=0)

        # Only the predicted label is needed; the neighbour indices and
        # similarities are discarded.
        label, _neighbors, _sims = knn_predict_one(
            X_rest, y_rest, X[held_out], k=k
        )
        held_out_preds.append(label)

    y_pred = np.array(held_out_preds, dtype=int)
    acc = float(np.mean(y_pred == y))

    return y_pred, acc

In [176]:
# -----------------------------
# Run
# -----------------------------
# Report leave-one-out predictions and accuracy for several neighbourhood sizes.
for k in (1, 3, 5):
    loo_result = loo_knn_eval(X_tfidf, doc_labels, k=k)
    y_pred, acc = loo_result
    print(f"k={k} | y_true={doc_labels.tolist()} | y_pred={y_pred.tolist()} | acc={acc:.3f}")


k=1 | y_true=[1, 1, 1, 0, 0, 0] | y_pred=[0, 0, 0, 1, 1, 1] | acc=0.000
k=3 | y_true=[1, 1, 1, 0, 0, 0] | y_pred=[1, 1, 1, 1, 1, 1] | acc=0.500
k=5 | y_true=[1, 1, 1, 0, 0, 0] | y_pred=[0, 0, 0, 1, 1, 1] | acc=0.000
